In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_content():
    df_content = pd.read_json('data/content.jsonl', lines=True)

    # getting the rotten tomatoes ratings
    rt_ratings = []
    for ratings_list in df_content['Ratings']:
        rt_rating = next((item['Value'] for item in ratings_list if item['Source'] == 'Rotten Tomatoes'), None)
        if rt_rating:
            rt_rating = int(rt_rating[:-1])
        rt_ratings.append(rt_rating)
    df_content['rtRating'] = rt_ratings

    # getting useful columns
    data_content = df_content[['ItemId', 'Metascore', 'imdbRating', 'imdbVotes', 'rtRating']]

    # replacing string 'N/A' to np.nan and , number separator
    data_content = data_content.replace('N/A', np.nan)
    data_content['imdbVotes'] = data_content['imdbVotes'].str.replace(',', '')

    # converting to numeric data
    data_content['Metascore'] = data_content['Metascore'].astype('Float32')
    data_content['imdbRating'] = data_content['imdbRating'].astype('float32')
    data_content['imdbVotes'] = data_content['imdbVotes'].astype('Float32')
    
    return data_content

In [None]:
# Build the ratings table: load_content() reads 'data/content.jsonl' and returns
# ItemId plus the Metascore / imdbRating / imdbVotes / rtRating columns.
df_content = load_content()

In [None]:
# One figure per numeric column of df_content: a 30-bin histogram with a KDE overlay.
for column in df_content.columns:
    # NumPy dtype kinds: b=bool, i=int, u=unsigned int, f=float, c=complex.
    if df_content[column].dtype.kind not in 'biufc':
        continue  # non-numeric column, nothing to plot
    plt.figure(figsize=(10, 6))
    # histplot draws on the current axes and returns them, so the title and
    # axis labels can be set on that Axes object in one call.
    sns.histplot(df_content[column], kde=True, bins=30).set(
        title=f'Distribution of {column}',
        xlabel=column,
        ylabel='Frequency',
    )
    plt.show()

In [None]:
# Per-column means, computed over the numeric columns only.
means = df_content.mean(numeric_only=True)
# Mean imputation: fillna with a Series fills each column's missing values with
# the entry stored under the same label. The result is a new frame, so
# df_content itself keeps its missing values.
df_content2 = df_content.fillna(means)

In [None]:
# Same distribution plots as before, this time for the mean-imputed frame.
# Only columns whose dtype kind is bool/int/uint/float/complex are visited.
for column in (
    name for name in df_content2.columns
    if df_content2[name].dtype.kind in 'biufc'
):
    plt.figure(figsize=(10, 6))
    # 30-bin histogram with a kernel density estimate drawn on top.
    sns.histplot(data=df_content2[column], kde=True, bins=30)
    plt.ylabel('Frequency')
    plt.xlabel(column)
    plt.title(f'Distribution of {column}')
    plt.show()

In [None]:
# Nested mapping {ItemId: {column: value}} of the imputed rating features.
# Left as the cell's last expression so the notebook displays it.
(
    df_content2
    .set_index('ItemId')
    .loc[:, ['Metascore', 'imdbRating', 'imdbVotes', 'rtRating']]
    .to_dict(orient='index')
)