### Goal 

Predict a movie that someone will like depending on what they have liked before

### Direction
Score prediction from 1->5

### Steps

1. Merge dataset to derive a userId - movie details - ratings  mapping
& Unwrap all nested values + OHE stuff
2. Select userIds with more than a certain number of ratings
3. Analyze the relationships between the movies and the ratings
4. Build model based on relationships
5. Create a flexible model that utilizes multiple users

### Features to add given contextual knowledge
1. Theme
2. Box office
3. Actors

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import ast
import datetime

warnings.filterwarnings("ignore")

In [2]:
ratings = pd.read_csv('moviesRatings/ratings.csv')
ratingsSmall = pd.read_csv('moviesRatings/ratings_small.csv')
moviesMetadata = pd.read_csv('moviesRatings/movies_metadata.csv')

In [3]:
moviesMetadata['id'] = pd.to_numeric(moviesMetadata['id'], errors='coerce').astype('Int64')
result = ratings.merge(moviesMetadata, left_on='movieId', right_on='id', how='left')
result.replace('pandas._libs.missing.NAType', pd.NA, inplace=True)

result = result.drop(columns=['movieId', 'belongs_to_collection', 'homepage', 'poster_path', 'timestamp', 'video', 'status', 'original_title'], index=1)
columns_to_check = ['id']
result = result.dropna(subset=columns_to_check, how='any') #drop rows where movies are unidentified

result = result[result['userId'].isin(result["userId"].value_counts()[result["userId"].value_counts() > 500].index)] #filter for users with more than 500 views
result["adult"] = result["adult"].astype(bool)
result['release_date'] = pd.to_datetime(result['release_date'])
oneDs = result[result['userId'] == 229] #get one dude

#convert to the right data type
result['genres'] = result['genres'].apply(ast.literal_eval)
result['production_companies'] = result['production_companies'].apply(ast.literal_eval)
result['production_countries'] = result['production_countries'].apply(ast.literal_eval)
result['spoken_languages'] = result['spoken_languages'].apply(ast.literal_eval)


#convert the dictionaries into arrays of ids
result['production_companiesId'] = result['production_companies'].apply(lambda x : [item["id"] for item in x] if isinstance(x, list) else [])
result['genreId'] = result['genres'].apply(lambda x : [item["id"] for item in x] if isinstance(x, list) else [])
result['production_countriesId'] = result['production_countries'].apply(lambda x : [item["iso_3166_1"] for item in x] if isinstance(x, list) else [])
result['spoken_languagesId'] = result['spoken_languages'].apply(lambda x : [item["iso_639_1"] for item in x] if isinstance(x, list) else [])

oneDs = result[result['userId'] == 229]


In [26]:
#OHE the generated arrays of ALL the results DO NOT RUN THIS IT TAKES TOO LONG
OHEDF = pd.DataFrame()
exploded_series = result["production_companiesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='production_company').groupby(level=0).sum()])
exploded_series = result["genreId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='genre').groupby(level=0).sum()])
exploded_series = result["production_countriesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='production_country').groupby(level=0).sum()])
exploded_series = result["spoken_languagesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='spoken_language').groupby(level=0).sum()])

result = pd.concat([result, OHEDF])
result = pd.concat([result,pd.get_dummies(result['original_language'], prefix="ogLang")])

selected_col = [col for col in result.columns if any(col.startswith(prefix) for prefix in ['production_company_', 'spoken_languages_', 'production_country_', 'genre_', 'ogLang'])]
result[selected_col] = result[selected_col].replace(np.NaN, 0)

result['production_companiesId'] = result['production_companies'].apply(lambda x : [item["id"] for item in x] if isinstance(x, list) else [])
result['genreId'] = result['genres'].apply(lambda x : [item["id"] for item in x] if isinstance(x, list) else [])
result['production_countriesId'] = result['production_countries'].apply(lambda x : [item["iso_3166_1"] for item in x] if isinstance(x, list) else [])
result['spoken_languagesId'] = result['spoken_languages'].apply(lambda x : [item["iso_639_1"] for item in x] if isinstance(x, list) else [])

print("number of unique users:", len(result['userId'].unique()))

: 

In [4]:
oneDs = result[result['userId'] == 229]

OHEDF = pd.DataFrame()

exploded_series = oneDs["production_companiesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='production_company').groupby(level=0).sum()])

exploded_series = oneDs["genreId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='genre').groupby(level=0).sum()])

exploded_series = oneDs["production_countriesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='production_country').groupby(level=0).sum()])

exploded_series = oneDs["spoken_languagesId"].apply(pd.Series).stack().reset_index(level=1, drop=True)
OHEDF = pd.concat([OHEDF,pd.get_dummies(exploded_series, prefix='spoken_language').groupby(level=0).sum()])

oneDs = pd.concat([oneDs,pd.get_dummies(oneDs['original_language'], prefix="ogLang")])

oneDs = pd.concat([oneDs, OHEDF])

selected_col = [col for col in oneDs.columns if any(col.startswith(prefix) for prefix in ['production_company_', 'spoken_language_', 'production_country_', 'genre_', 'ogLang'])]
oneDs[selected_col] = oneDs[selected_col].replace(np.NaN, 0)

oneDs.head()

#Select columns
corrMat = oneDs[['rating', 'adult', 'budget', 'popularity', 'revenue', 'runtime', 'release_date', 'vote_average', 'vote_count'] + selected_col].corr()

In [5]:
selected_col = [col for col in oneDs.columns if any(col.startswith(prefix) for prefix in [ 'genre_'])]
oneDs[selected_col].describe()

Unnamed: 0,genre_12.0,genre_14.0,genre_16.0,genre_18.0,genre_27.0,genre_28.0,genre_35.0,genre_36.0,genre_37.0,genre_53.0,genre_80.0,genre_99.0,genre_878.0,genre_9648.0,genre_10402.0,genre_10749.0,genre_10751.0,genre_10752.0,genre_10769.0,genre_10770.0
count,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0,3330.0
mean,0.023423,0.013814,0.003604,0.094595,0.016216,0.032432,0.048048,0.007808,0.004204,0.047748,0.033033,0.003904,0.01982,0.015916,0.005405,0.031832,0.009009,0.005105,0.001802,0.001502
std,0.151267,0.116735,0.059931,0.292698,0.126325,0.177172,0.2139,0.088029,0.064713,0.213264,0.17875,0.062369,0.139402,0.125169,0.073333,0.175579,0.094501,0.071278,0.042416,0.038726
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
len(oneDs.columns)

1128

In [15]:
len(result['userId'].unique())

1190

In [12]:
corrMat[corrMat['rating'] >= 0]

Unnamed: 0,rating,adult,budget,popularity,revenue,runtime,release_date,vote_average,vote_count,ogLang_bs,...,spoken_language_sr,spoken_language_sv,spoken_language_sw,spoken_language_th,spoken_language_tr,spoken_language_ur,spoken_language_vi,spoken_language_xx,spoken_language_yi,spoken_language_zh
rating,1.0,,0.029518,0.02074,-0.020628,-0.041594,0.032345,-0.029005,-0.003247,,...,,,,,,,,,,
budget,0.029518,,1.0,0.32166,0.684912,0.272236,0.328652,-0.061735,0.531214,,...,,,,,,,,,,
popularity,0.02074,,0.32166,1.0,0.389983,0.235309,0.13392,0.253052,0.681293,,...,,,,,,,,,,
release_date,0.032345,,0.328652,0.13392,0.22432,0.098055,1.0,-0.209687,0.201452,-0.007763,...,-0.007763,-0.01903,-0.010981,-0.010981,-0.020558,-0.010981,-0.007763,-0.01903,-0.01345,-0.024583


## Charts
1 hot encoding on all ids and get correlation matrix

In [None]:
index = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 1), ('B', 2)], names=['Letter', 'Number'])
data = {'Value': [10, 20, 30, 40]}
df = pd.DataFrame(data, index=index)