# Movie Rating Data Prep

Clean a movie rating dataset by handling missing ratings and
ensuring movie titles are consistent.

Import Library

In [1]:
import pandas as pd
import numpy as np

# Load sample Datas

In [None]:

# Let's create two sample DataFrames to simulate a real-world scenario.
# ratings_df: Contains user ratings for movies, with some missing values.
ratings_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'movie_title': ['Asuran', 'Thangamagan', 'Asuran', 'Sullan', 'Thangamagan', np.nan, 'sullan', 'Thangamagan'],
    'rating': [5, 4, 5, np.nan, 4, 3, 5, 5]
})

# movie_info_df: Contains consistent movie information.
movie_info_df = pd.DataFrame({
    'clean_movie_title': ['Asuran', 'Thangamagan', 'sullan', 'Kutty'],
    'genre': ['Comdey', 'Comedy', 'Action', 'Comedy']
})

print("### Initial Ratings DataFrame")
print(ratings_df)
print("\n### Movie Info DataFrame")
print(movie_info_df)


### Initial Ratings DataFrame
   user_id  movie_title  rating
0        1       Asuran     5.0
1        1  Thangamagan     4.0
2        2       Asuran     5.0
3        2       Sullan     NaN
4        3  Thangamagan     4.0
5        3          NaN     3.0
6        4       sullan     5.0
7        4  Thangamagan     5.0

### Movie Info DataFrame
  clean_movie_title   genre
0            Asuran  Comdey
1       Thangamagan  Comedy
2            sullan  Action
3             Kutty  Comedy


# step 1: Handle Missing Ratings

In [3]:
# Calculate the median rating
median_rating = ratings_df['rating'].median()

# Fill missing 'rating' values with the median
ratings_df['rating'] = ratings_df['rating'].fillna(median_rating)

print("### Ratings DataFrame after filling missing ratings with median:")
print(ratings_df)


### Ratings DataFrame after filling missing ratings with median:
   user_id  movie_title  rating
0        1       Asuran     5.0
1        1  Thangamagan     4.0
2        2       Asuran     5.0
3        2       Sullan     5.0
4        3  Thangamagan     4.0
5        3          NaN     3.0
6        4       sullan     5.0
7        4  Thangamagan     5.0


# Step 2: Standardize Movie Titles

In [4]:
# Convert titles to lowercase and strip whitespace
ratings_df['movie_title'] = ratings_df['movie_title'].str.lower().str.strip()

# Replace missing movie titles with 'Unknown' to avoid errors during the merge
ratings_df['movie_title'] = ratings_df['movie_title'].fillna('unknown')

print("### Ratings DataFrame after standardizing movie titles:")
print(ratings_df)


### Ratings DataFrame after standardizing movie titles:
   user_id  movie_title  rating
0        1       asuran     5.0
1        1  thangamagan     4.0
2        2       asuran     5.0
3        2       sullan     5.0
4        3  thangamagan     4.0
5        3      unknown     3.0
6        4       sullan     5.0
7        4  thangamagan     5.0


# Step 3: Merge with Movie Info Dataset

In [5]:
# Merge the two dataframes on the movie title columns
merged_df = pd.merge(ratings_df, movie_info_df, left_on='movie_title', right_on='clean_movie_title', how='left')

# Drop the redundant 'clean_movie_title' column
merged_df.drop('clean_movie_title', axis=1, inplace=True)

print("### Final Merged and Cleaned DataFrame:")
print(merged_df)


### Final Merged and Cleaned DataFrame:
   user_id  movie_title  rating   genre
0        1       asuran     5.0     NaN
1        1  thangamagan     4.0     NaN
2        2       asuran     5.0     NaN
3        2       sullan     5.0  Action
4        3  thangamagan     4.0     NaN
5        3      unknown     3.0     NaN
6        4       sullan     5.0  Action
7        4  thangamagan     5.0     NaN
