In [1]:
import pandas as pd

# Define the filepath with folder
# filepath = "grouplensdata/"

# Read the CSV files into pandas DataFrames
ratings_df = pd.read_csv(filepath + "ratings.csv")
tags_df = pd.read_csv(filepath + "tags.csv")
movies_df = pd.read_csv(filepath + "movies.csv")
links_df = pd.read_csv(filepath + "links.csv")

# Merge movies_df and links_df on movieId
movies_links_df = pd.merge(movies_df, links_df, on="movieId")

# Merge ratings_df, tags_df, and movies_links_df
combined_df = pd.merge(pd.merge(ratings_df, tags_df, on=["userId", "movieId"]), movies_links_df, on="movieId")
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,imdbId,tmdbId
0,10,260,4.5,1430666645,good vs evil,1430666558,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0
1,10,260,4.5,1430666645,Harrison Ford,1430666505,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0
2,10,260,4.5,1430666645,sci-fi,1430666538,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0
3,78,260,5.0,1443241370,influential,1443241425,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0
4,78,260,5.0,1443241370,space epic,1443241436,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0


In [2]:
# Drop imdbId and tmdbId columns from combined_df
combined_df.drop(columns=["imdbId", "tmdbId"], inplace=True)
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,10,260,4.5,1430666645,good vs evil,1430666558,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1,10,260,4.5,1430666645,Harrison Ford,1430666505,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
2,10,260,4.5,1430666645,sci-fi,1430666538,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
3,78,260,5.0,1443241370,influential,1443241425,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,78,260,5.0,1443241370,space epic,1443241436,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi


In [3]:
# Extract the year of release from the movie titles
# In this code, we use the str.extract method along with a regular expression pattern \((\d{4})\) 
# to extract the year of release from the movie titles. 
combined_df['Year'] = combined_df['title'].str.extract(r'\((\d{4})\)')
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,Year
0,10,260,4.5,1430666645,good vs evil,1430666558,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
1,10,260,4.5,1430666645,Harrison Ford,1430666505,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
2,10,260,4.5,1430666645,sci-fi,1430666538,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
3,78,260,5.0,1443241370,influential,1443241425,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
4,78,260,5.0,1443241370,space epic,1443241436,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977


In [5]:
# Convert the 'Year' column to numeric
combined_df['Year'] = pd.to_numeric(combined_df['Year'])

# Filter the DataFrame to include only movies released in 2000 or later
filtered_df = combined_df[combined_df['Year'] >= 2000]
filtered_df.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,Year
7663,14,58559,5.0,1311530004,Atmospheric,1311530439,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0
7664,14,58559,5.0,1311530004,Batman,1311530391,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0
7665,14,58559,5.0,1311530004,comic book,1311530398,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0
7666,14,58559,5.0,1311530004,dark,1311530428,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0
7667,14,58559,5.0,1311530004,Heath Ledger,1311530404,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0


In [8]:
unique_values = filtered_df['Year'].unique()

# Print the unique values
print(unique_values)

[2008. 2007. 2000. 2001. 2002. 2003. 2004. 2005. 2006. 2009. 2010. 2012.
 2011. 2013. 2014. 2015. 2017. 2016. 2018. 2019. 2021. 2020. 2022. 2023.]


In [9]:
import os

# Get the current working directory
current_dir = os.getcwd()

# Save the filtered_df DataFrame as CSV in the current directory
filtered_df.to_csv(os.path.join(current_dir, "GroupLensData_clean.csv"), index=False)