# Prepocessing and Feature Engineering Notebook

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
from pathlib import Path

DATA_DIR = Path('../data/ml-latest-small')

ratings = pd.read_csv(DATA_DIR / 'ratings.csv')
movies   = pd.read_csv(DATA_DIR / 'movies.csv')
links    = pd.read_csv(DATA_DIR / 'links.csv')
tags     = pd.read_csv(DATA_DIR / 'tags.csv')

print("Loaded shapes:")
print(f"ratings: {ratings.shape}")
print(f"movies:  {movies.shape}")
print(f"links:   {links.shape}")
print(f"tags:    {tags.shape}")


Loaded shapes:
ratings: (100836, 4)
movies:  (9742, 3)
links:   (9742, 3)
tags:    (3683, 4)


## Basic Cleaning & Consistency Checks

In [6]:
# Ratings: ensure no weird values
print("Rating range:", ratings['rating'].min(), "to", ratings['rating'].max())
ratings = ratings[ratings['rating'].between(0.5, 5.0)]  # should already be true

#. Movies: handle "(no genres listed)"
movies['genres'] = movies['genres'].replace('(no genres listed)', 'Unknown')
print("Movies with no genres:", (movies['genres'] == 'Unknown').sum())

#  Tags: lowercase + strip
tags['tag'] = tags['tag'].str.lower().str.strip()

#  Drop exact duplicate tags (same user + movie + tag),  incase of double tagging.
tags = tags.drop_duplicates(subset=['userId', 'movieId', 'tag'])
print("Tags after dedup:", tags.shape)

#  Links: keep only rows where tmdbId exists (for future TMDB pull)
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)
print("Movies with TMDB ID:", len(links))

Rating range: 0.5 to 5.0
Movies with no genres: 34
Tags after dedup: (3683, 4)
Movies with TMDB ID: 9734


# Clean & merge all datasets

In [14]:
# Merge ratings + movies
df = ratings.merge(movies, on='movieId', how='left')

  # add average rating per movie (useful later)
movie_avg_rating = df.groupby('movieId')['rating'].mean().rename('avg_rating')
movie_rating_count = df.groupby('movieId')['rating'].count().rename('rating_count')

movies_enriched = movies.merge(movie_avg_rating, on='movieId', how='left')
movies_enriched = movies_enriched.merge(movie_rating_count, on='movieId', how='left')
movies_enriched = movies_enriched.merge(links[['movieId', 'tmdbId']], on='movieId', how='left')

print("Enriched movies shape:", movies_enriched.shape)


Enriched movies shape: (9742, 6)


In [17]:
movies_enriched.to_csv('../data/ml-latest-small/processed/movies_enriched.csv', index=False)

# Content-Based Feature Engineering

In [34]:
# Group tags by movie â†’ join them into a single string
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.unique())).reset_index()
movie_tags = movie_tags.rename(columns={'tag': 'all_tags'})
# Merge tags into movies
movies_content = movies_enriched.merge(movie_tags, on='movieId', how='left')
movies_content['all_tags'] = movies_content['all_tags'].fillna('')  

# Create combined text feature
movies_content['content_text'] = (
    movies_content['genres'].str.replace('|', '', regex=False) + ' ' +
    movies_content['title'].str.replace(r'\(\d{4}\)','  ', regex=True).str.strip() + ' ' +  # remove year from title
    movies_content['all_tags']
).str.lower().str.strip()

# Preview
print(movies_content[['title', 'genres', 'all_tags', 'content_text']].head(5))

                                title  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                       all_tags  \
0                                     pixar fun   
1  fantasy magic board game robin williams game   
2                                     moldy old   
3                                                 
4                              pregnancy remake   

                                        content_text  
0  adventureanimationchildrencomedyfantasy toy st...  
1  adventurechildrenfantasy jumanji

# Vectorize for Content-Based Similarity 
 Using TF-IDF (great for sparse text)

In [3]:
# TF-IDF on the combined text
tfidf = TfidfVectorizer(
    max_features=5000,          
    stop_words='english',
    ngram_range=(1,2)           # unigrams + bigrams for better phrases
)

tfidf_matrix = tfidf.fit_transform(movies_content['content_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)  # (n_movies, n_features)


import scipy.sparse
scipy.sparse.save_npz('../data/ml-latest-small/processed/tfidf_matrix.npz', tfidf_matrix)
movies_content.to_csv('../data/ml-latest-small/processed/movies_content.csv', index=False)

NameError: name 'TfidfVectorizer' is not defined

NameError: name 'movies_content' is not defined

# Collaborative Filtering Prep (for Surprise)

In [39]:

collab_data = ratings[['userId', 'movieId', 'rating']]

# filter out very rare movies/users 
min_ratings_per_movie = 4
min_ratings_per_user  = 4

valid_movies = collab_data['movieId'].value_counts()[lambda x: x >= min_ratings_per_movie].index
valid_users  = collab_data['userId'].value_counts()[lambda x: x >= min_ratings_per_user].index

collab_data_filtered = collab_data[
    collab_data['movieId'].isin(valid_movies) &
    collab_data['userId'].isin(valid_users)
]

print("Original ratings:", len(ratings))
print("After min ratings filter:", len(collab_data_filtered))

# Save
collab_data_filtered.to_csv('../data/ml-latest-small/processed/collab_ratings.csv', index=False)

Original ratings: 100836
After min ratings filter: 92394
