# Preprocessing

## Imports

In [1]:
import pandas as pd
from pathlib import Path


## Load Merged Dataset

In [2]:
# --- Rebuild the merged dataset from the raw MovieLens files ---
# The merged frame is reconstructed here from the three raw tables
# (ideally 01_data_setup would save full_data as a CSV and this cell would just load it).
raw_data_dir = Path("raw_data")
processed_data_dir = Path("processed_data")

# Ratings: tab-separated; column names are supplied because the file has no header row.
ratings = pd.read_csv(
    raw_data_dir / "u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"]
)

# Movies: pipe-separated; only the first two fields (id and title) are read.
movies = pd.read_csv(
    raw_data_dir / "u.item",
    sep="|",
    encoding="latin-1",
    header=None,
    usecols=[0, 1],
    names=["item_id", "title"]
)

# Users: pipe-separated demographic attributes, one row per user.
users = pd.read_csv(
    raw_data_dir / "u.user",
    sep="|",
    encoding="latin-1",
    header=None,
    names=["user_id", "age", "gender", "occupation", "zip_code"]
)

# Merge again.
# validate="many_to_one" makes pandas raise MergeError if a lookup table contains
# duplicate keys, instead of silently duplicating rating rows in the result.
ratings_movies = ratings.merge(movies, on="item_id", validate="many_to_one")
full_data = ratings_movies.merge(users, on="user_id", validate="many_to_one")

full_data.head()


Unnamed: 0,user_id,item_id,rating,timestamp,title,age,gender,occupation,zip_code
0,196,242,3,881250949,Kolya (1996),49,M,writer,55105
1,186,302,3,891717742,L.A. Confidential (1997),39,F,executive,0
2,22,377,1,878887116,Heavyweights (1994),25,M,writer,40206
3,244,51,2,880606923,Legends of the Fall (1994),28,M,technician,80525
4,166,346,1,886397596,Jackie Brown (1997),47,M,educator,55113


## Drop Unnecessary Columns

In [3]:
# Leave out the columns this notebook treats as unnecessary; full_data itself is not modified.
unused_columns = ["timestamp", "zip_code"]
data = full_data.drop(unused_columns, axis="columns")

data.head()


Unnamed: 0,user_id,item_id,rating,title,age,gender,occupation
0,196,242,3,Kolya (1996),49,M,writer
1,186,302,3,L.A. Confidential (1997),39,F,executive
2,22,377,1,Heavyweights (1994),25,M,writer
3,244,51,2,Legends of the Fall (1994),28,M,technician
4,166,346,1,Jackie Brown (1997),47,M,educator


## Handle Missing Values

In [4]:
# Per-column count of missing entries, printed before any rows are removed.
missing_per_column = data.isna().sum()
print(missing_per_column)

# MovieLens 100k is expected to be complete; should any row contain a NaN, discard it.
data = data.dropna(axis="index", how="any")


user_id       0
item_id       0
rating        0
title         0
age           0
gender        0
occupation    0
dtype: int64


## Encode Categorical Features

In [5]:
# --- Encode gender (M/F → 0/1) ---
# Values that are already encoded pass through unchanged, so re-running this cell
# does not turn the whole column into NaN the way a plain dict .map() would.
gender_codes = {"M": 0, "F": 1}
gender_encoded = data["gender"].map(lambda value: gender_codes.get(value, value))

# Fail loudly on anything that is neither M/F nor an existing 0/1 code, rather than
# letting it slip into the processed file. The check runs before `data` is reassigned.
unexpected_genders = set(gender_encoded.unique()) - set(gender_codes.values())
if unexpected_genders:
    raise ValueError(
        f"Unexpected gender values: {sorted(unexpected_genders, key=str)}"
    )

# --- Encode occupation ---
# Each distinct occupation label is replaced by its integer category code.
data = data.assign(
    gender=gender_encoded.astype("int64"),
    occupation=data["occupation"].astype("category").cat.codes,
)

data.head()


Unnamed: 0,user_id,item_id,rating,title,age,gender,occupation
0,196,242,3,Kolya (1996),49,0,20
1,186,302,3,L.A. Confidential (1997),39,1,6
2,22,377,1,Heavyweights (1994),25,0,20
3,244,51,2,Legends of the Fall (1994),28,0,19
4,166,346,1,Jackie Brown (1997),47,0,3


## Save Processed Dataset

In [6]:
processed_file = processed_data_dir / "movielens_processed.csv"

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
processed_file.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
data.to_csv(processed_file, index=False)

print("✔️ Processed dataset saved to:", processed_file)


✔️ Processed dataset saved to: processed_data\movielens_processed.csv


## Quick Check

In [7]:
# --- Verify processed dataset ---
# Read the file back from disk and show its first rows to confirm the write round-trips.
reloaded = pd.read_csv(processed_file)
reloaded.head()


Unnamed: 0,user_id,item_id,rating,title,age,gender,occupation
0,196,242,3,Kolya (1996),49,0,20
1,186,302,3,L.A. Confidential (1997),39,1,6
2,22,377,1,Heavyweights (1994),25,0,20
3,244,51,2,Legends of the Fall (1994),28,0,19
4,166,346,1,Jackie Brown (1997),47,0,3
