# MovieLens Data Prep

In [80]:
import os
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [81]:
# Fetch the NLTK resources this notebook relies on. Each call returns True on
# success, so the cell displays the result of the last download.
# 'stopwords' backs stopwords.words('english') used in the tag clean-up cell.
nltk.download('stopwords')
# 'punkt_tab' is presumably the tokenizer data word_tokenize needs — TODO confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/philipredford-
[nltk_data]     jones/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/philipredford-
[nltk_data]     jones/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load the data

In [82]:
# The data folder lives next to the notebook's working directory:
# <parent of cwd>/data
parent_dir = os.path.split(os.getcwd())[0]
data_path = os.path.join(parent_dir, 'data')

In [83]:
# Read the movies, ratings and tags tables from the data directory,
# in that order, into one DataFrame each.
movies_df, ratings_df, tags_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('movies.csv', 'ratings.csv', 'tags.csv')
)

In [84]:
# Preview the first five movies (movieId, title, pipe-delimited genres).
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [85]:
# Preview the first five ratings (userId, movieId, rating, timestamp).
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [86]:
# Preview the first five tags (userId, movieId, tag, timestamp).
tags_df.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Merge the data

In [102]:
# Attach title and genres to every rating; the left join keeps all rating rows.
df = ratings_df.merge(movies_df, how='left', on='movieId')

In [103]:
# Attach the tags a user gave to a movie they rated. A (user, movie) pair with
# several tags yields one output row per tag; pairs without tags get NaN.
df = df.merge(tags_df, how='left', on=['movieId', 'userId'])

In [104]:
# Both source tables carried a `timestamp` column, so the merge suffixed them
# with _x / _y; neither is used further.
df = df.drop(columns=['timestamp_y', 'timestamp_x'])

## Explore the data

### Explore the tag data

Generate a list of tags that have been used by more than two users: group by `tag` and count the distinct `userId` values for each tag.

Generally, a tag that is used by only a few users is not informative, because it is too rare to describe movies consistently.

In [10]:
# Number of distinct users who applied each tag
tag_user_counts = df.groupby('tag')['userId'].agg('nunique')

# Keep only the tags that more than two different users have applied
filtered_tags = tag_user_counts.loc[tag_user_counts.gt(2)]

print(filtered_tags)

tag
Action                3
Adam Sandler          3
Al Pacino             3
Atmospheric           4
Ben Stiller           3
                     ..
visually appealing    6
visually stunning     3
will ferrell          3
witty                 5
zombies               3
Name: userId, Length: 112, dtype: int64


In [16]:
# Normalise the free-text tags: NaN -> '', lowercase, strip punctuation,
# then drop English stopwords. `tags` is a standalone Series; `df` itself is
# not modified by this cell.
tags = df['tag'].fillna('').str.lower()

# Build the punctuation-stripping table once (it was previously rebuilt for
# every row) and apply it with the vectorized string accessor.
punctuation_table = str.maketrans('', '', string.punctuation)
tags = tags.str.translate(punctuation_table)

# Tokenize and remove stopwords
stop_words = set(stopwords.words('english'))
tags = tags.apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))


For simplicity, we will only use the genres data for now; the cleaned tags above are not carried forward.

### Explore the genres data

In [105]:
# Turn each pipe-delimited genre string into a list of genre names
df['genres_split'] = df['genres'].str.split(pat='|')

# One row per (row, genre) pair
genres_exploded = df.explode(column='genres_split')

# Distinct genre labels, in order of first appearance
unique_genres = pd.unique(genres_exploded['genres_split'])

print(unique_genres)

['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Action'
 'Crime' 'Thriller' 'Mystery' 'Horror' 'Drama' 'War' 'Western' 'Sci-Fi'
 'Musical' 'Film-Noir' 'IMAX' 'Documentary' '(no genres listed)']


In [106]:
# One 0/1 indicator column per genre, parsed from the pipe-delimited string.
genre_strings = df['genres']
genre_dummies = genre_strings.str.get_dummies(sep='|')

In [107]:
# Append the genre indicator columns to the right of the existing columns
# (rows are aligned on the shared index).
df = pd.concat((df, genre_dummies), axis='columns')

In [108]:
# Drop the raw/intermediate genre columns, the placeholder genre and the tag.
# The earlier left merge with the tags table emitted one row per tag for each
# (user, movie) rating, so once `tag` is gone those rows are exact duplicates.
# Remove them so each rating appears once and the same rating cannot end up in
# both the training and the test split.
df = (
    df.drop(columns=['genres', 'genres_split', '(no genres listed)', 'tag'])
      .drop_duplicates()
      .reset_index(drop=True)
)

In [109]:
# Preview the first five rows of the merged frame with the genre indicators.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,title,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,Toy Story (1995),0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,Grumpier Old Men (1995),0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,Heat (1995),1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,"Usual Suspects, The (1995)",0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


Move the `rating` column to the end of the dataframe and rename it to `label`.

In [110]:
# Pop `rating` out of the frame and re-insert it as `label`; a newly assigned
# column is appended last, so the other columns keep their order.
df['label'] = df.pop('rating')

## Convert the userId and movieId fields into numerical fields

In [111]:
# Map the raw user ids onto consecutive integers 0..n_users-1
le_user = preprocessing.LabelEncoder()
df['userId'] = le_user.fit_transform(df['userId'].to_numpy())

# Same for the movie ids; the fitted encoders are kept so the original ids
# can be recovered with inverse_transform.
le_movie = preprocessing.LabelEncoder()
df['movieId'] = le_movie.fit_transform(df['movieId'].to_numpy())

## Fix the column names

In [112]:
# Normalise the column names to snake_case:
#   1. spaces and hyphens become underscores (e.g. 'Sci-Fi' -> 'Sci_Fi'),
#   2. the camelCase id columns get explicit snake_case names,
#   3. everything is lower-cased.
separators_to_underscore = str.maketrans({' ': '_', '-': '_'})
id_renames = {'userId': 'user_id', 'movieId': 'movie_id'}

underscored = [col.translate(separators_to_underscore) for col in df.columns]
df.columns = [id_renames.get(col, col).lower() for col in underscored]

In [114]:
# Persist the full dataset while it still contains the movie titles.
all_movies_path = os.path.join(data_path, 'all_movies_data.csv')
df.to_csv(all_movies_path, index=False)

# From here on the frame no longer carries the title column.
df = df.drop(columns=['title'])

## Split the data into training and testing sets

In [74]:
# `train_test_split` is imported with the other dependencies in the import cell
# at the top of the notebook.
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

## Save the data

In [75]:
# Write each split to its own CSV in the data directory, without the index.
for split_frame, csv_name in ((df_train, 'df_train.csv'), (df_test, 'df_test.csv')):
    split_frame.to_csv(os.path.join(data_path, csv_name), index=False)