In [1]:
# src/data_processing.py
import pandas as pd
import numpy as np
import os


In [5]:
# Directory holding the ML-25M CSV files. The path is relative, so it is
# resolved against the current working directory at read time.
DATA_PATH = os.path.join(os.pardir, "data", "ml-25m")


def load_datasets(data_path=None, chunksize=1_000_000):
    """Load the ML-25M ratings, movies and tags tables as DataFrames.

    ``movies.csv`` and ``tags.csv`` are each read in a single call.
    ``ratings.csv`` is parsed ``chunksize`` rows at a time and the chunks are
    concatenated into one frame with a fresh 0..n-1 index. Chunking bounds the
    size of each individual parse step, but the complete ratings table is
    still held in memory once the function returns.

    Parameters
    ----------
    data_path : str or path-like, optional
        Directory containing ``movies.csv``, ``tags.csv`` and ``ratings.csv``.
        When omitted, the module-level ``DATA_PATH`` is used.
    chunksize : int, optional
        Number of rows read from ``ratings.csv`` per chunk. Defaults to one
        million.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ratings, movies, tags)``, in that order.
    """
    # Resolve the default lazily so the constant is looked up at call time
    # rather than frozen into the signature when the cell is defined.
    if data_path is None:
        data_path = DATA_PATH

    print("Loading movies.csv ...")
    movies = pd.read_csv(os.path.join(data_path, "movies.csv"))

    print("Loading tags.csv ...")
    tags = pd.read_csv(os.path.join(data_path, "tags.csv"))

    print("Loading ratings.csv in chunks ... this may take a few minutes")
    # With chunksize set, read_csv returns an iterator of DataFrames;
    # pd.concat consumes that iterator directly, so no manual list is needed.
    rating_chunks = pd.read_csv(os.path.join(data_path, "ratings.csv"),
                                chunksize=chunksize)
    ratings = pd.concat(rating_chunks, ignore_index=True)
    print("Ratings loaded successfully!")

    return ratings, movies, tags


In [3]:
def preprocess_data(ratings, movies):
    """Join ratings with movie metadata and return a cleaned frame.

    The two frames are merged on ``movieId``. Rows containing any missing
    value and exact duplicate rows are then removed, and the index is
    renumbered from zero. A completion message and the first rows of the
    result are printed before the frame is returned.
    """
    merged = ratings.merge(movies, on="movieId")

    # Every step returns a new frame, so the caller's inputs stay untouched.
    cleaned = (
        merged
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

    print("✅ Data Preprocessing Completed!")
    print(cleaned.head())

    return cleaned

In [6]:


# Read the three raw tables; load_datasets returns them as (ratings, movies, tags).
ratings, movies, tags = load_datasets()
# Build the merged ratings + movie-metadata frame used by the cells below.
# `tags` is loaded but is not passed to preprocess_data.
processed_df = preprocess_data(ratings, movies)

Loading movies.csv ...
Loading tags.csv ...
Loading ratings.csv in chunks ... this may take a few minutes
Ratings loaded successfully!
✅ Data Preprocessing Completed!
   userId  movieId  rating   timestamp  \
0       1      296     5.0  1147880044   
1       1      306     3.5  1147868817   
2       1      307     5.0  1147868828   
3       1      665     5.0  1147878820   
4       1      899     3.5  1147868510   

                                              title  \
0                               Pulp Fiction (1994)   
1  Three Colors: Red (Trois couleurs: Rouge) (1994)   
2  Three Colors: Blue (Trois couleurs: Bleu) (1993)   
3                                Underground (1995)   
4                        Singin' in the Rain (1952)   

                        genres  
0  Comedy|Crime|Drama|Thriller  
1                        Drama  
2                        Drama  
3             Comedy|Drama|War  
4       Comedy|Musical|Romance  


In [7]:
# Preview, schema and summary statistics of the merged frame.
# Only a cell's final expression is rendered automatically, so the preview is
# passed to display() (available without import in IPython/Jupyter kernels);
# as a bare statement, head() would be evaluated and silently discarded.
display(processed_df.head())
processed_df.info()  # writes its report straight to stdout
processed_df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(3), object(2)
memory usage: 1.1+ GB


Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0
