In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw catalogue (one JSON object per line) and list the fields it carries.
df = pd.read_json(path_or_buf='data/content.jsonl', lines=True)
df.columns

Index(['ItemId', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre',
       'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards',
       'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'totalSeasons',
       'Season', 'Episode', 'seriesID'],
      dtype='object')

In [3]:
# How many items carry the literal placeholder 'N/A' instead of a vote count?
# Default to 0 so the cell still shows a number when no such label exists
# (e.g. after the column has been converted to floats).
df['imdbVotes'].value_counts().get('N/A', 0)

278

In [4]:
# Turn the 'N/A' placeholder into a real missing value, then make imdbVotes numeric.
df = df.replace('N/A', np.nan)
votes = df['imdbVotes']
if not pd.api.types.is_numeric_dtype(votes):
    # Still raw text such as "1,234": drop the thousands separators first.
    # The guard keeps this cell re-runnable; once the column is float32 the
    # `.str` accessor would raise AttributeError.
    votes = votes.str.replace(',', '', regex=False)
df['imdbVotes'] = votes.astype('float32')

In [5]:
# 15th percentile of the observed vote counts (missing values are skipped by pandas).
df.loc[:, 'imdbVotes'].quantile(q=0.15)

355.0

In [8]:
def load_content(path='data/content.jsonl', fill_quantile=0.15):
    """Load the content catalogue and return its quality signals on a 0-10 scale.

    Args:
        path: Location of the line-delimited JSON catalogue.
        fill_quantile: Quantile (between 0 and 1) of each column's observed
            values that is used to fill that column's missing entries.

    Returns:
        DataFrame with the columns ItemId, Metascore, imdbRating, imdbVotes,
        rtRating and Awards. The four score columns are min-max scaled to
        [0, 10]; Awards is 1 when the item has award text and 0 when the
        value is 'N/A' or missing.
    """
    score_cols = ['Metascore', 'imdbRating', 'imdbVotes', 'rtRating']
    scale_min, scale_max = 0.0, 10.0

    df_content = pd.read_json(path, lines=True)

    # The Rotten Tomatoes score is nested inside each item's 'Ratings' list.
    df_content['rtRating'] = [_rt_score(entry) for entry in df_content['Ratings']]

    # Keep only the identifier and the quality signals.
    data_content = df_content[['ItemId'] + score_cols + ['Awards']].copy()

    # Awards becomes a flag; both the 'N/A' placeholder and real nulls mean "none".
    awards = data_content['Awards']
    data_content['Awards'] = (awards.notna() & (awards != 'N/A')).astype('int64')

    # Replace the 'N/A' placeholder with NaN so the columns can be made numeric.
    data_content = data_content.replace('N/A', np.nan)

    # Vote counts use thousands separators ("1,234"). read_json may already
    # have parsed the column as numbers when no entry needed cleaning, in
    # which case there is nothing to strip and `.str` would raise.
    votes = data_content['imdbVotes']
    if not pd.api.types.is_numeric_dtype(votes):
        votes = votes.str.replace(',', '', regex=False)
    data_content['imdbVotes'] = votes

    for col in ('Metascore', 'imdbRating', 'imdbVotes'):
        data_content[col] = data_content[col].astype('float32')

    # Fill missing values with a low quantile of each column (not the mean),
    # so items without a score rank near the bottom rather than mid-table.
    fill_values = data_content.quantile(fill_quantile, numeric_only=True)
    data_content = data_content.fillna(fill_values)

    # Min-max scale every score column to [scale_min, scale_max].
    for col in score_cols:
        low = data_content[col].min()
        span = data_content[col].max() - low
        scaled = data_content[col] - low
        if span != 0:
            scaled = scaled * (scale_max - scale_min) / span
        # A constant column has span 0: leave it at scale_min instead of
        # dividing by zero and producing NaN.
        data_content[col] = scale_min + scaled

    return data_content


def _rt_score(ratings):
    """Return the Rotten Tomatoes score from one 'Ratings' cell, or NaN if absent."""
    if not isinstance(ratings, (list, tuple)):
        # A missing cell arrives as None/NaN rather than a list.
        return np.nan
    for item in ratings:
        if item.get('Source') == 'Rotten Tomatoes':
            value = item.get('Value')
            # Values are assumed to be percentages such as "85%"; strip only a
            # trailing '%' so a bare "85" does not lose its last digit.
            return float(int(value.rstrip('%'))) if value else np.nan
    return np.nan


In [14]:
# Build the cleaned, rescaled content features and preview the first rows.
df_content = load_content()
df_content.head(n=5)

Unnamed: 0,ItemId,Metascore,imdbRating,imdbVotes,rtRating,Awards
0,c9f0f895fb,3.737374,5.0,0.007888,2.9,1
1,d3d9446802,3.737374,6.555555,0.026473,2.9,0
2,c20ad4d76f,3.737374,7.222222,0.045541,2.9,0
3,8e296a067a,3.737374,3.555555,0.000136,2.9,0
4,54229abfcf,3.737374,6.333333,0.013033,2.9,0
