# Movie Ratings Dataset
This Python script processes the Netflix prize dataset's combined_data_1.txt file and converts it into a structured pandas DataFrame. The file is organised in sections: each section starts with a line containing a movie ID followed by a colon, and the subsequent lines hold the customer ratings for that movie, each giving the customer ID, the rating, and the rating date.

The script defines a function read_netflix_ratings, which reads the dataset line by line. It distinguishes between lines that represent movie IDs and those that represent user ratings.

For each rating, the movie ID, customer ID, rating, and rating date are collected into a list. The list is then converted into a DataFrame with one column for each of these four fields.

In [1]:
import pandas as pd

# Function to process the combined_data_1 file
def read_netflix_ratings(file_path):
    """Parse a Netflix-prize ``combined_data`` file into a DataFrame.

    The file is a sequence of sections. Each section opens with a header
    line of the form ``<movie id>:`` and is followed by one
    ``<customer id>,<rating>,<date>`` line per rating of that movie.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A DataFrame with the columns ``MovieID``, ``CustomerID``, ``Rating``
        and ``Date``. All values are kept as the strings read from the file;
        ``MovieID`` is ``None`` for rating lines that precede the first
        header.

    Raises:
        ValueError: If a non-blank, non-header line does not consist of
            exactly three comma-separated fields.
    """
    data = []  # One [movie_id, customer_id, rating, date] list per rating
    current_movie_id = None  # ID from the most recent header line
    with open(file_path, 'r') as file:
        for raw_line in file:
            # Strip first so the header test does not depend on the line
            # terminator (the last line of a file may have none) or on
            # trailing whitespace.
            line = raw_line.strip()
            if not line:
                continue  # Blank lines carry no data
            if line.endswith(':'):  # Section header: "<movie id>:"
                current_movie_id = line[:-1]
            else:  # Rating line: "<customer id>,<rating>,<date>"
                customer_id, rating, date = line.split(',')
                data.append([current_movie_id, customer_id, rating, date])
    return pd.DataFrame(data, columns=['MovieID', 'CustomerID', 'Rating', 'Date'])

# Path to the combined_data_1 file (an absolute path on the author's machine;
# adjust it when running elsewhere)
ratings_file_path = 'C:/Users/nafla/OneDrive/Documents/system development/Netflix/combined_data_1.txt'
# Parse the file into a DataFrame with MovieID, CustomerID, Rating and Date columns
ratings_df = read_netflix_ratings(ratings_file_path)


In [2]:
# Preview the first rows of the parsed ratings
ratings_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


# Movie Title Dataset

This Python script processes the movie_titles.csv file from the Netflix prize dataset, creating a structured pandas DataFrame that includes movie IDs and their respective years of release. The dataset is assumed to have lines formatted with at least two pieces of information per movie: the movie ID and the year of release, possibly followed by the movie title, which this script ignores for simplicity.
As it iterates over the file, the function splits each line on commas into at most three parts (movie ID, year of release, and title), so that a title containing commas is kept as a single field rather than being split further.

In [3]:
def read_movie_titles(file_path):
    """Read movie IDs and release years from a ``movie_titles`` CSV file.

    Each line is split on commas into at most three fields so that a title
    containing commas stays in one piece; only the first two fields are kept.
    Lines with fewer than two fields are reported on stdout and skipped.

    Args:
        file_path: Path of the latin-1 encoded CSV file to read.

    Returns:
        A DataFrame with the string columns ``MovieID`` and
        ``YearOfRelease``.
    """
    rows = []
    with open(file_path, 'r', encoding='latin1') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            # maxsplit=2 -> [movie id, year, title]; the title is not used
            fields = stripped.split(',', 2)
            if len(fields) < 2:
                print(f"Skipping malformed line: {stripped}")
                continue
            rows.append(fields[:2])
    # Only the movie ID and release year make it into the frame
    return pd.DataFrame(rows, columns=['MovieID', 'YearOfRelease'])

# Location of the movie titles file
movie_titles_path = 'C:/Users/nafla/OneDrive/Documents/system development/Netflix/movie_titles.csv'
movies_df = read_movie_titles(movie_titles_path)

# Both columns arrive as strings; cast them in a single step.
# - MovieID becomes a plain integer.
# - YearOfRelease goes through to_numeric with errors='coerce', so values
#   that are not numbers turn into missing values, and is then stored as the
#   nullable 'Int64' dtype (capital "I"), which can hold those missing values
#   in an integer column.
movies_df = movies_df.assign(
    MovieID=movies_df['MovieID'].astype(int),
    YearOfRelease=pd.to_numeric(movies_df['YearOfRelease'], errors='coerce').astype('Int64'),
)

# Preview the first rows of the result
movies_df.head()


Unnamed: 0,MovieID,YearOfRelease
0,1,2003
1,2,2004
2,3,1997
3,4,1994
4,5,2004


# Combine Datasets

In [4]:
# MovieID is the join key, so make sure it is an integer in both frames
for frame in (ratings_df, movies_df):
    frame['MovieID'] = frame['MovieID'].astype(int)

# Lookup table: MovieID -> YearOfRelease
release_year_by_movie = movies_df.set_index('MovieID')['YearOfRelease']
year_of_release_dict = release_year_by_movie.to_dict()

# Attach the release year to every rating by looking up its MovieID
ratings_df['YearOfRelease'] = ratings_df['MovieID'].map(year_of_release_dict)

# Preview the first rows to check the new column
ratings_df.head()


Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease
0,1,1488844,3,2005-09-06,2003
1,1,822109,5,2005-05-13,2003
2,1,885013,4,2005-10-19,2003
3,1,30878,4,2005-12-26,2003
4,1,823519,3,2004-05-03,2003


In [None]:
# Summarise column dtypes and memory usage
ratings_df.info()

# Share of missing values per column, expressed as a percentage
missing_percentage = 100 * ratings_df.isna().mean()

# Show the per-column percentages
print(missing_percentage)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24053764 entries, 0 to 24053763
Data columns (total 5 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   MovieID        int32 
 1   CustomerID     object
 2   Rating         object
 3   Date           object
 4   YearOfRelease  object
dtypes: int32(1), object(4)
memory usage: 825.8+ MB
MovieID          0.000000
CustomerID       0.000000
Rating           0.000000
Date             0.000000
YearOfRelease    0.000786
dtype: float64


In [None]:
# Check for duplicate rows: duplicated() flags every row that repeats an
# earlier row across all columns, and sum() counts the flagged rows
duplicate_rows = ratings_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [None]:
# Calculate the number of distinct values in each column
# (nunique() leaves missing values out of the count by default)
unique_values_count = ratings_df.nunique()

# Display the number of unique values for each column
print(unique_values_count)

MovieID            4499
CustomerID       470758
Rating                5
Date               2182
YearOfRelease        89
dtype: int64


In [6]:

# CustomerID and Rating were read from the file as strings; cast both to integers
for column in ('CustomerID', 'Rating'):
    ratings_df[column] = ratings_df[column].astype(int)

# Parse the YYYY-MM-DD date strings into datetime values
ratings_df['Date'] = pd.to_datetime(ratings_df['Date'], format='%Y-%m-%d')

# Store YearOfRelease as pandas' nullable integer dtype so that missing
# years can stay in an integer column
ratings_df['YearOfRelease'] = ratings_df['YearOfRelease'].astype('Int64')


In [13]:
# Drop rows where YearOfRelease is missing; ratings_df is rebound to the
# filtered result, so only ratings with a known release year remain.
# NOTE(review): new columns are assigned to ratings_df in the following cell;
# on some pandas versions assigning to a dropna() result may emit
# SettingWithCopyWarning — confirm, and consider adding .copy() if it does.
ratings_df = ratings_df.dropna(subset=['YearOfRelease'])

In [8]:

# Year in which each rating was given, taken from the rating date
ratings_df['RatingYear'] = ratings_df['Date'].dt.year

# Movie age at rating time = rating year minus release year
ratings_df['MovieAge'] = ratings_df['RatingYear'].sub(ratings_df['YearOfRelease'])

In [14]:
# Check the result of the conversions: dtypes and memory usage
ratings_df.info()
# Share of missing values per column, expressed as a percentage
missing_percentage = 100 * ratings_df.isna().mean()
# Show the per-column percentages
print(missing_percentage)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24053575 entries, 0 to 24053763
Data columns (total 7 columns):
 #   Column         Dtype         
---  ------         -----         
 0   MovieID        int32         
 1   CustomerID     int32         
 2   Rating         int32         
 3   Date           datetime64[ns]
 4   YearOfRelease  Int64         
 5   RatingYear     int64         
 6   MovieAge       Int64         
dtypes: Int64(2), datetime64[ns](1), int32(3), int64(1)
memory usage: 1.2 GB
MovieID          0.0
CustomerID       0.0
Rating           0.0
Date             0.0
YearOfRelease    0.0
RatingYear       0.0
MovieAge         0.0
dtype: float64


In [15]:
# Preview the first rows, now including the RatingYear and MovieAge columns
ratings_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [16]:
# Save the final data frame as CSV (an absolute path on the author's machine);
# index=False leaves the row index out of the file
ratings_df.to_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv', index=False)