# Data Setup

## Imports

In [12]:
import os
import shutil
from pathlib import Path
import kagglehub
import pandas as pd

## Setup Directories & Dataset Download

In [11]:
# --- Directories ---
# Both paths are relative to the notebook's working directory.
raw_data_dir = Path("raw_data")
processed_data_dir = Path("processed_data")

# Create directories if they do not exist. exist_ok=True keeps the cell
# re-runnable; parents=True keeps it working if a path is later made nested.
raw_data_dir.mkdir(parents=True, exist_ok=True)
processed_data_dir.mkdir(parents=True, exist_ok=True)

# File path for raw dataset (main ratings file in MovieLens).
# Its presence is used as the "already downloaded" marker below.
raw_dataset = raw_data_dir / "u.data"

# Check and download dataset if not already available
if raw_dataset.exists():
    print(" Dataset is already downloaded.")
else:
    dataset_path = Path(kagglehub.dataset_download("prajitdatta/movielens-100k-dataset"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Search inside subfolders and copy all relevant files, flattened into
    # raw_data/. NOTE: files sharing a name in different subfolders overwrite
    # each other, because only item.name is kept for the target.
    for item in dataset_path.rglob("*"):
        if item.is_file() and item.suffix in [".csv", ".data", ".item", ".user", ".genre"]:
            target = raw_data_dir / item.name
            shutil.copy2(item, target)

    # Fail here, with a clear message, instead of reporting success and
    # letting a later read_csv cell fail on a missing file.
    if not raw_dataset.exists():
        raise FileNotFoundError(
            f"Expected ratings file {raw_dataset} was not found after copying from {dataset_path}."
        )

    print(" Dataset successfully downloaded to:", raw_data_dir)


 Dataset is already downloaded.


## Verify Files

In [10]:
# --- Inspect raw_data contents ---
# iterdir() yields entries in arbitrary order; sorting keeps the displayed
# listing stable across platforms and re-runs. The result is still a list.
sorted(raw_data_dir.iterdir())


[WindowsPath('raw_data/u.data'),
 WindowsPath('raw_data/u.genre'),
 WindowsPath('raw_data/u.item'),
 WindowsPath('raw_data/u.user')]

## Load Ratings Data

In [13]:
# The ratings file is tab-separated and ships without a header row,
# so the column names are supplied here.
ratings_path = raw_data_dir.joinpath("u.data")

ratings = pd.read_csv(
    ratings_path,
    delimiter="\t",
    header=None,  # explicit; same as the default when `names` is given
    names=["user_id", "item_id", "rating", "timestamp"],
)

# Preview the first rows.
ratings.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Load Movie Metadata

In [14]:
# The item file is '|'-delimited with no header row and is read as latin-1.
# Only the first two columns are kept and named item_id / title.
movies_path = raw_data_dir.joinpath("u.item")

movies = pd.read_csv(
    movies_path,
    delimiter="|",
    header=None,
    usecols=[0, 1],
    names=["item_id", "title"],
    encoding="latin-1",
)

# Preview the first rows.
movies.head()


Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


## Load Users Data

In [15]:
# The user file is '|'-delimited with no header row and is read as latin-1.
users_path = raw_data_dir / "u.user"

users = pd.read_csv(
    users_path,
    sep="|",
    encoding="latin-1",
    header=None,
    names=["user_id", "age", "gender", "occupation", "zip_code"],
    # ZIP codes are identifiers, not quantities: pin them to strings so that
    # leading zeros never depend on pandas' type inference for this column.
    dtype={"zip_code": str},
)

# Preview the first rows.
users.head()


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## Merge Datasets

In [16]:
# Attach the movie title, then the user attributes, to every rating row.
# validate="many_to_one" makes pandas raise MergeError if item_id / user_id
# is duplicated in the lookup table, which would otherwise silently multiply
# rating rows.
ratings_movies = ratings.merge(movies, on="item_id", how="inner", validate="many_to_one")
full_data = ratings_movies.merge(users, on="user_id", how="inner", validate="many_to_one")

# An inner join drops ratings whose item_id / user_id has no match; make that
# visible instead of silently continuing with fewer rows.
assert len(full_data) == len(ratings), (
    f"Merge changed the row count: {len(ratings)} ratings -> {len(full_data)} merged rows"
)

full_data.head()


Unnamed: 0,user_id,item_id,rating,timestamp,title,age,gender,occupation,zip_code
0,196,242,3,881250949,Kolya (1996),49,M,writer,55105
1,186,302,3,891717742,L.A. Confidential (1997),39,F,executive,0
2,22,377,1,878887116,Heavyweights (1994),25,M,writer,40206
3,244,51,2,880606923,Legends of the Fall (1994),28,M,technician,80525
4,166,346,1,886397596,Jackie Brown (1997),47,M,educator,55113


## Basic Exploration

In [17]:
# Headline counts, all taken from the ratings table: total rows, distinct
# user ids, and distinct item ids.
summary_counts = (
    ("Number of ratings:", len(ratings)),
    ("Number of users:", ratings["user_id"].nunique()),
    ("Number of movies:", ratings["item_id"].nunique()),
)
for label, count in summary_counts:
    print(label, count)

# Column dtypes, non-null counts and memory usage of the merged frame.
full_data.info()


Number of ratings: 100000
Number of users: 943
Number of movies: 1682
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     100000 non-null  int64 
 1   item_id     100000 non-null  int64 
 2   rating      100000 non-null  int64 
 3   timestamp   100000 non-null  int64 
 4   title       100000 non-null  object
 5   age         100000 non-null  int64 
 6   gender      100000 non-null  object
 7   occupation  100000 non-null  object
 8   zip_code    100000 non-null  object
dtypes: int64(5), object(4)
memory usage: 6.9+ MB


## Save merged dataset

In [18]:
# Output folder: same path as in the directory-setup cell, created again
# here if it is missing.
processed_data_dir = Path("processed_data")
processed_data_dir.mkdir(exist_ok=True)

# Write the merged frame as CSV; index=False leaves the row index out of the file.
merged_path = processed_data_dir.joinpath("merged_ratings.csv")
full_data.to_csv(path_or_buf=merged_path, index=False)

print("✔️ Merged dataset saved to:", merged_path)


✔️ Merged dataset saved to: processed_data\merged_ratings.csv
