In [247]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import decomposition
from sklearn.model_selection import train_test_split


## Data Acquisition
##### Flat Acquisition using csv files

In [248]:
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')
tags = pd.read_csv('./data/tags.csv')
genome_scores = pd.read_csv('./data/genome-scores.csv')
genome_tags = pd.read_csv('./data/genome-tags.csv')

In [249]:
# print the first 5 rows of the movies, ratings, and tags dataframes
print('Movies:')
display(movies.head())
print('Ratings:')
display(ratings.head())
print('Tags:')
display(tags.head())
print('Genome Scores:')
display(genome_scores.head())
print('Genome Tags:')
display(genome_tags.head())

Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Tags:


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


Genome Scores:


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


Genome Tags:


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


### Data Wrangling

In [250]:
grouped_ratings = ratings.groupby('movieId').agg({'rating': ['mean', 'count'], 'timestamp': ['mean']})
grouped_ratings.columns = ['mean_rating', 'rating_count', 'mean_timestamp']
grouped_ratings = grouped_ratings.astype({'mean_timestamp': 'int'})

# Merge dei dataset movies e ratings
# rimuove i film che non hanno rating
movies_ratings = pd.merge(movies, grouped_ratings, on='movieId')

new_tags = tags.groupby('movieId')['tag'].apply(set).apply(list)

df = pd.merge(movies_ratings, new_tags, on='movieId')

display(df.head())


Unnamed: 0,movieId,title,genres,mean_rating,rating_count,mean_timestamp,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,57309,1153152210,"[comedy, nostalgic, BD-Video, martial arts, im..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,24228,1122310117,"[see also:Zathura, fantasy, itaege, time trave..."
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,11804,980602256,"[sequel fever, grun running, Jack Lemmon, CLV,..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,2523,942460471,"[single mother, interracial relationship, CLV,..."
4,5,Father of the Bride Part II (1995),Comedy,3.058434,11714,1004723013,"[pregnancy, sequel fever, gynecologist, Fantas..."


In [251]:
# Group by tagId and compute mean relevance
mean_relevance = genome_scores.groupby('tagId').mean()['relevance']

# Filter out tags with low mean relevance
threshold = 0.2
good_tags = mean_relevance.where(mean_relevance > threshold).dropna().index
genome_scores_2 = genome_scores[genome_scores['tagId'].isin(good_tags)]

# Merge movies with genome scores
genome_scores_2['tag'] = genome_scores_2['tagId'].map(genome_tags.set_index('tagId')['tag'].to_dict())
genome_table = genome_scores_2.pivot_table(index='movieId', columns='tag', values='relevance')

df = pd.merge(df, genome_table, on='movieId', how='left')
display(df.head(105))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genome_scores_2['tag'] = genome_scores_2['tagId'].map(genome_tags.set_index('tagId')['tag'].to_dict())


Unnamed: 0,movieId,title,genres,mean_rating,rating_count,mean_timestamp,tag,absurd,action,adaptation,...,visceral,visual,visually appealing,visually stunning,weapons,weird,whimsical,witty,women,writers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,57309,1153152210,"[comedy, nostalgic, BD-Video, martial arts, im...",0.10400,0.66250,0.31675,...,0.15150,0.56375,0.31500,0.67325,0.26375,0.42700,0.58700,0.69400,0.08925,0.14125
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,24228,1122310117,"[see also:Zathura, fantasy, itaege, time trave...",0.15925,0.64025,0.51450,...,0.07325,0.38150,0.22500,0.21700,0.17800,0.38650,0.29250,0.18725,0.13525,0.12225
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,11804,980602256,"[sequel fever, grun running, Jack Lemmon, CLV,...",0.11375,0.16025,0.25200,...,0.10175,0.10725,0.19600,0.09850,0.14125,0.24350,0.13025,0.22325,0.35075,0.12200
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,2523,942460471,"[single mother, interracial relationship, CLV,...",0.13375,0.14700,0.50700,...,0.08800,0.10750,0.21150,0.11625,0.28950,0.21975,0.14775,0.10725,0.97525,0.18200
4,5,Father of the Bride Part II (1995),Comedy,3.058434,11714,1004723013,"[pregnancy, sequel fever, gynecologist, Fantas...",0.15475,0.15575,0.28925,...,0.08275,0.11925,0.18500,0.11875,0.12025,0.29350,0.16425,0.10475,0.40225,0.19225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,106,Nobody Loves Me (Keiner liebt mich) (1994),Comedy|Drama,3.436000,125,1008437680,"[prophecy, voodoo, rich man, single, woman dir...",0.15375,0.04700,0.22050,...,0.06225,0.16825,0.16175,0.06375,0.07975,0.38200,0.27100,0.18650,0.42375,0.26175
101,107,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical,3.216874,6015,1050347045,"[childhood, David Goelz, talking animals, musi...",0.10125,0.14925,0.42125,...,0.10350,0.14900,0.25700,0.13050,0.19675,0.42400,0.30325,0.18150,0.12725,0.21100
102,109,Headless Body in Topless Bar (1995),Comedy|Drama|Thriller,2.062500,16,979827864,[ex-con],,,,...,,,,,,,,,,
103,110,Braveheart (1995),Action|Drama|War,4.002273,59184,1127808200,"[highlands, BD-Video, Mel Gibson needs to make...",0.16600,0.91850,0.45250,...,0.85425,0.69975,0.51450,0.69250,0.49225,0.25425,0.13300,0.26775,0.12825,0.10250


### Data Preparation (Data cleaning, Duplicates filtering, Data encoding)

##### Data Cleaning
The following results show that there are no NaN values in the dataset.

In [252]:
# print number of rows
print('Number of rows: ', df.shape[0])

# show rows with missing values
display(df[df.isnull().any(axis=1)].head())

# drop rows with missing values
df = df.fillna(0)

# show rows 102, 123, 126, 127, 134
display(df.iloc[[102, 123, 126, 127, 134]])

# print number of rows after dropping missing values
print('Number of rows after dropping missing values: ', df.shape[0])


Number of rows:  41875


Unnamed: 0,movieId,title,genres,mean_rating,rating_count,mean_timestamp,tag,absurd,action,adaptation,...,visceral,visual,visually appealing,visually stunning,weapons,weird,whimsical,witty,women,writers
102,109,Headless Body in Topless Bar (1995),Comedy|Drama|Thriller,2.0625,16,979827864,[ex-con],,,,...,,,,,,,,,,
123,130,Angela (1995),Drama,3.22619,42,1161273282,"[independent film, woman director, isolation]",,,,...,,,,,,,,,,
126,133,Nueba Yol (1995),Comedy|Drama,2.6,20,998729374,"[emigration, drama, new york, spanish, dominic...",,,,...,,,,,,,,,,
127,134,Sonic Outlaws (1995),Documentary,3.451613,31,1036815825,"[illegal art, Craig Baldwin]",,,,...,,,,,,,,,,
134,143,Gospa (1995),Drama,1.928571,14,1055044483,"[apparition, virgin mary, miracle, mountain, c...",,,,...,,,,,,,,,,


Unnamed: 0,movieId,title,genres,mean_rating,rating_count,mean_timestamp,tag,absurd,action,adaptation,...,visceral,visual,visually appealing,visually stunning,weapons,weird,whimsical,witty,women,writers
102,109,Headless Body in Topless Bar (1995),Comedy|Drama|Thriller,2.0625,16,979827864,[ex-con],0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,130,Angela (1995),Drama,3.22619,42,1161273282,"[independent film, woman director, isolation]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,133,Nueba Yol (1995),Comedy|Drama,2.6,20,998729374,"[emigration, drama, new york, spanish, dominic...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,134,Sonic Outlaws (1995),Documentary,3.451613,31,1036815825,"[illegal art, Craig Baldwin]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,143,Gospa (1995),Drama,1.928571,14,1055044483,"[apparition, virgin mary, miracle, mountain, c...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Number of rows after dropping missing values:  41875


##### Data Encoding
Multicategorical One-Hot encoding for film genres

In [253]:
# one hot encoding for genres
# print every category in movies dataset
categories = set()
for s in df['genres'].str.split('|').values:
    categories = categories.union(set(s))

# remove (no genres listed) from categories
categories.remove('(no genres listed)')

# add columns for each category
for category in categories:
    df[category] = df['genres'].str.contains(category).astype(int)

# delete genres column
df = df.drop(columns=['genres'])

display(df.head())

Unnamed: 0,movieId,title,mean_rating,rating_count,mean_timestamp,tag,absurd,action,adaptation,adapted from:book,...,Documentary,Animation,Adventure,Romance,Film-Noir,Drama,Comedy,Action,IMAX,Musical
0,1,Toy Story (1995),3.893708,57309,1153152210,"[comedy, nostalgic, BD-Video, martial arts, im...",0.104,0.6625,0.31675,0.286,...,0,1,1,0,0,0,1,0,0,0
1,2,Jumanji (1995),3.251527,24228,1122310117,"[see also:Zathura, fantasy, itaege, time trave...",0.15925,0.64025,0.5145,0.4845,...,0,0,1,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),3.142028,11804,980602256,"[sequel fever, grun running, Jack Lemmon, CLV,...",0.11375,0.16025,0.252,0.19375,...,0,0,0,1,0,0,1,0,0,0
3,4,Waiting to Exhale (1995),2.853547,2523,942460471,"[single mother, interracial relationship, CLV,...",0.13375,0.147,0.507,0.46175,...,0,0,0,1,0,1,1,0,0,0
4,5,Father of the Bride Part II (1995),3.058434,11714,1004723013,"[pregnancy, sequel fever, gynecologist, Fantas...",0.15475,0.15575,0.28925,0.198,...,0,0,0,0,0,0,1,0,0,0


##### Duplicates Filtering
The following results show that there are no duplicates in the dataset (as expected after merge operation).

In [254]:
df = df.copy()

# drop tag column
df = df.drop(columns=['tag'])

# print number of rows
print('Number of rows: ', df.shape[0])

# drop duplicated rows
df = df.drop_duplicates()

# print number of rows after dropping missing values
print('Number of rows after dropping duplicated rows: ', df.shape[0])


Number of rows:  41875
Number of rows after dropping duplicated rows:  41875


## Data Visualization

In [255]:
""" # rating distribution from ratings.csv
sns.countplot(x='rating', data=ratings)
plt.show()

sns.boxplot(x='rating', data=ratings)
plt.show() """

" # rating distribution from ratings.csv\nsns.countplot(x='rating', data=ratings)\nplt.show()\n\nsns.boxplot(x='rating', data=ratings)\nplt.show() "

Rather than using discrete bins, a KDE plot smooths the observations with a Gaussian kernel, producing a continuous density estimate. This is used for continuous attributes like rating mean.

It is done to show differences after data aggregation in calculating mean for each film.

In [256]:
""" # rating distribution from df
sns.kdeplot(df['mean_rating'])
plt.xlabel('rating mean for film')
plt.title('Rating distribution in dataframe')
plt.show() """

" # rating distribution from df\nsns.kdeplot(df['mean_rating'])\nplt.xlabel('rating mean for film')\nplt.title('Rating distribution in dataframe')\nplt.show() "

Observing the distribution of all attributes for every file.

This is useful to show data trends, to reveal outliers and leverage points, to provide hints about modeling techniques to apply.


In [257]:
""" # print distribution for ratings
fig, axs = plt.subplots(3)
sns.kdeplot(ratings['timestamp'], ax=axs[0], color='r', label='timestamp')
sns.kdeplot(ratings['rating'], ax=axs[1], color='b', label='rating')
sns.kdeplot(ratings['movieId'], ax=axs[2], color='g', label='movieId')

axs[0].set_xlabel('timestamp')
axs[1].set_xlabel('rating')
axs[2].set_xlabel('movieId')
plt.show() """

" # print distribution for ratings\nfig, axs = plt.subplots(3)\nsns.kdeplot(ratings['timestamp'], ax=axs[0], color='r', label='timestamp')\nsns.kdeplot(ratings['rating'], ax=axs[1], color='b', label='rating')\nsns.kdeplot(ratings['movieId'], ax=axs[2], color='g', label='movieId')\n\naxs[0].set_xlabel('timestamp')\naxs[1].set_xlabel('rating')\naxs[2].set_xlabel('movieId')\nplt.show() "

In [258]:
""" movies_to_show = 5
fig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))

most_rated_movies = ratings.groupby('movieId').count().sort_values('rating', ascending=False).head(movies_to_show).index

to_line_plot = []
for i in range(movies_to_show):
    tmp = ratings.where(ratings['movieId'] == most_rated_movies[i]).dropna()
    tmp['date'] = pd.to_datetime(tmp['timestamp'], unit='s')
    
    # Computing the mean for each month
    tmp = tmp.resample("M", on='date').mean()[['movieId', 'rating']].dropna()
    to_line_plot.append(tmp)
    movie_title = movies.where(movies['movieId'] == most_rated_movies[i]).dropna()['title'].values[0]
    axs[i].set_title("Movie: '" + movie_title + "'")
    

for i in range(movies_to_show):
    sns.lineplot(x='date', y='rating', ax=axs[i], data=to_line_plot[i])

fig.suptitle('Rating evolution for the 5 most rated movies')
plt.show() """

' movies_to_show = 5\nfig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))\n\nmost_rated_movies = ratings.groupby(\'movieId\').count().sort_values(\'rating\', ascending=False).head(movies_to_show).index\n\nto_line_plot = []\nfor i in range(movies_to_show):\n    tmp = ratings.where(ratings[\'movieId\'] == most_rated_movies[i]).dropna()\n    tmp[\'date\'] = pd.to_datetime(tmp[\'timestamp\'], unit=\'s\')\n    \n    # Computing the mean for each month\n    tmp = tmp.resample("M", on=\'date\').mean()[[\'movieId\', \'rating\']].dropna()\n    to_line_plot.append(tmp)\n    movie_title = movies.where(movies[\'movieId\'] == most_rated_movies[i]).dropna()[\'title\'].values[0]\n    axs[i].set_title("Movie: \'" + movie_title + "\'")\n    \n\nfor i in range(movies_to_show):\n    sns.lineplot(x=\'date\', y=\'rating\', ax=axs[i], data=to_line_plot[i])\n\nfig.suptitle(\'Rating evolution for the 5 most rated movies\')\nplt.show() '

In [259]:
""" # Same as above cell but the 3 movies with the highest standard deviation
movies_to_show = 3
fig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))

# Take movies with at least 1000 ratings
high_std_movies = ratings.where(
    ratings['movieId']
    .isin(ratings.groupby('movieId')
          .count()
          .where(ratings.groupby('movieId').count()['rating'] > 1000)
          .dropna().index)
    ).dropna()

# most_rated_movies = ratings.groupby('movieId').count().sort_values('rating', ascending=False).head(movies_to_show).index
high_std_movies = high_std_movies.groupby('movieId').std().sort_values('rating', ascending=False).head(movies_to_show).index

to_line_plot = []
for i in range(movies_to_show):
    tmp = ratings.where(ratings['movieId'] == high_std_movies[i]).dropna()
    tmp['date'] = pd.to_datetime(tmp['timestamp'], unit='s')
    
    # Computing the mean for each month
    tmp = tmp.resample("M", on='date').mean()[['movieId', 'rating']].dropna()
    to_line_plot.append(tmp)
    movie_title = movies.where(movies['movieId'] == high_std_movies[i]).dropna()['title'].values[0]
    axs[i].set_title("Movie: '" + movie_title + "'")

for i in range(movies_to_show):
    sns.lineplot(x='date', y='rating', ax=axs[i], data=to_line_plot[i])

# Print the titles of the movies
plt.show() """

' # Same as above cell but the 3 movies with the highest standard deviation\nmovies_to_show = 3\nfig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))\n\n# Take movies with at least 1000 ratings\nhigh_std_movies = ratings.where(\n    ratings[\'movieId\']\n    .isin(ratings.groupby(\'movieId\')\n          .count()\n          .where(ratings.groupby(\'movieId\').count()[\'rating\'] > 1000)\n          .dropna().index)\n    ).dropna()\n\n# most_rated_movies = ratings.groupby(\'movieId\').count().sort_values(\'rating\', ascending=False).head(movies_to_show).index\nhigh_std_movies = high_std_movies.groupby(\'movieId\').std().sort_values(\'rating\', ascending=False).head(movies_to_show).index\n\nto_line_plot = []\nfor i in range(movies_to_show):\n    tmp = ratings.where(ratings[\'movieId\'] == high_std_movies[i]).dropna()\n    tmp[\'date\'] = pd.to_datetime(tmp[\'timestamp\'], unit=\'s\')\n    \n    # Computing the mean for each month\n    tmp = tmp.resample("M", 

Plot density for every attribute of dataset

In [260]:
""" # density plot for rating_count
fig, axs = plt.subplots(1,2, figsize=(10, 5))
sns.kdeplot(df['rating_count'], ax=axs[0], label='rating_count')
sns.kdeplot(df['rating_count'].where(df['rating_count'] < 50), ax=axs[1], label='rating_count < 50')

print('Number of movies with rating_count < 50: ', df.where(df['rating_count'] < 50).count()[0])
print('Number of all movies: ', df.shape[0]) """

" # density plot for rating_count\nfig, axs = plt.subplots(1,2, figsize=(10, 5))\nsns.kdeplot(df['rating_count'], ax=axs[0], label='rating_count')\nsns.kdeplot(df['rating_count'].where(df['rating_count'] < 50), ax=axs[1], label='rating_count < 50')\n\nprint('Number of movies with rating_count < 50: ', df.where(df['rating_count'] < 50).count()[0])\nprint('Number of all movies: ', df.shape[0]) "

In [261]:
""" # density plot for rating year

df_timestamp = df.copy()

# convert timestamp to year
df_timestamp['year_timestamp'] = pd.to_datetime(df_timestamp['mean_timestamp'], unit='s').dt.year

# delete mean_timestamp column to avoid redundancy - TODO check if it is better to keep it
df_timestamp = df_timestamp.drop(columns=['mean_timestamp'])

sns.kdeplot(df['year_timestamp'])
plt.show() """

" # density plot for rating year\n\ndf_timestamp = df.copy()\n\n# convert timestamp to year\ndf_timestamp['year_timestamp'] = pd.to_datetime(df_timestamp['mean_timestamp'], unit='s').dt.year\n\n# delete mean_timestamp column to avoid redundancy - TODO check if it is better to keep it\ndf_timestamp = df_timestamp.drop(columns=['mean_timestamp'])\n\nsns.kdeplot(df['year_timestamp'])\nplt.show() "

In [262]:
""" 
# TODO - change visualization to show the relevance for each movie-tag
# plot for categories
cat = list(categories)

# count the number of rows containing 1 for each category
cat_dict = {}
for category in cat:
    cat_dict[category] = df[category].sum()

# order the dictionary by value in descending order
cat_dict = {k: v for k, v in sorted(cat_dict.items(), key=lambda item: item[1], reverse=True)}

# plot an histogram of the number of rows for each category
fig, axs = plt.subplots(1, 1, figsize=(20, 10))
ax = sns.barplot(x=list(cat_dict.keys()), y=list(cat_dict.values()))
ax.bar_label(container=ax.containers[0], labels=list(cat_dict.keys()))
plt.xticks([])
plt.show()

print('List of attributes: ', list(df.columns))
 """

" \n# TODO - change visualization to show the relevance for each movie-tag\n# plot for categories\ncat = list(categories)\n\n# count the number of rows containing 1 for each category\ncat_dict = {}\nfor category in cat:\n    cat_dict[category] = df[category].sum()\n\n# order the dictionary by value in descending order\ncat_dict = {k: v for k, v in sorted(cat_dict.items(), key=lambda item: item[1], reverse=True)}\n\n# plot an histogram of the number of rows for each category\nfig, axs = plt.subplots(1, 1, figsize=(20, 10))\nax = sns.barplot(x=list(cat_dict.keys()), y=list(cat_dict.values()))\nax.bar_label(container=ax.containers[0], labels=list(cat_dict.keys()))\nplt.xticks([])\nplt.show()\n\nprint('List of attributes: ', list(df.columns))\n "

In [263]:
""" # count the number of rows containing 1 for Film-Noir and IMAX
print('Number of Film-Noir movies: ', df['Film-Noir'].sum())
print('Number of IMAX movies: ', df['IMAX'].sum())
 """

" # count the number of rows containing 1 for Film-Noir and IMAX\nprint('Number of Film-Noir movies: ', df['Film-Noir'].sum())\nprint('Number of IMAX movies: ', df['IMAX'].sum())\n "

In [264]:
""" # compute the ratio of mismatches for each category

# count the number of rows
total_rows = df.shape[0]

mismatches = {}
# remove (no genres listed) from list categories
cat = list(categories)

for category1 in cat:
    for category2 in cat:
        if category1 != category2:
            # count the number of rows where category1 and category2 are both 1
            count = df.where((df[category1] == 1) & (df[category2] == 1)).count()[0]
            if count > 0:
                # if there are rows where category1 and category2 are both 1, add the ratio to the dictionary
                mismatches[category1 + ' - ' + category2] = (total_rows - count) / total_rows

# order the dictionary by value in descending order and print it
mismatches = {k: v for k, v in sorted(mismatches.items(), key=lambda item: item[1], reverse=False)}
display('Mismatches: ', mismatches) """


" # compute the ratio of mismatches for each category\n\n# count the number of rows\ntotal_rows = df.shape[0]\n\nmismatches = {}\n# remove (no genres listed) from list categories\ncat = list(categories)\n\nfor category1 in cat:\n    for category2 in cat:\n        if category1 != category2:\n            # count the number of rows where category1 and category2 are both 1\n            count = df.where((df[category1] == 1) & (df[category2] == 1)).count()[0]\n            if count > 0:\n                # if there are rows where category1 and category2 are both 1, add the ratio to the dictionary\n                mismatches[category1 + ' - ' + category2] = (total_rows - count) / total_rows\n\n# order the dictionary by value in descending order and print it\nmismatches = {k: v for k, v in sorted(mismatches.items(), key=lambda item: item[1], reverse=False)}\ndisplay('Mismatches: ', mismatches) "

In [265]:
""" from scipy.spatial.distance import pdist, squareform
# compute the distance matrix for categories
display(df.head())
categories_df = df[df.drop(columns=['movieId', 'title', 'mean_rating', 'rating_count', 'tag', 'year_timestamp']).columns]
display(categories_df.head())

dist_matrix = pdist(categories_df, 'cosine')

# Create a pandas DataFrame to represent the distance matrix
df_dist = pd.DataFrame(squareform(dist_matrix), columns=np.arange(categories_df.shape[0]), index=np.arange(categories_df.shape[0]))

# Print the DataFrame with observation labels
obs_labels = ['Obs{}'.format(i) for i in range(categories_df.shape[0])]
df_dist.index = obs_labels
df_dist.columns = obs_labels
display(df_dist.head())

# show only rows with at least two columns with value 0
mask = (df_dist == 0).sum(axis=1) >= 2
result = df_dist.loc[mask].dropna(axis=1)
display(result.head())

# print(result[0:1].where(result[0:1] == 0).dropna(axis=1))

print('Number of rows with at least two columns with value 0: ', result.shape[0], ' out of ', df_dist.shape[0])
 """

" from scipy.spatial.distance import pdist, squareform\n# compute the distance matrix for categories\ndisplay(df.head())\ncategories_df = df[df.drop(columns=['movieId', 'title', 'mean_rating', 'rating_count', 'tag', 'year_timestamp']).columns]\ndisplay(categories_df.head())\n\ndist_matrix = pdist(categories_df, 'cosine')\n\n# Create a pandas DataFrame to represent the distance matrix\ndf_dist = pd.DataFrame(squareform(dist_matrix), columns=np.arange(categories_df.shape[0]), index=np.arange(categories_df.shape[0]))\n\n# Print the DataFrame with observation labels\nobs_labels = ['Obs{}'.format(i) for i in range(categories_df.shape[0])]\ndf_dist.index = obs_labels\ndf_dist.columns = obs_labels\ndisplay(df_dist.head())\n\n# show only rows with at least two columns with value 0\nmask = (df_dist == 0).sum(axis=1) >= 2\nresult = df_dist.loc[mask].dropna(axis=1)\ndisplay(result.head())\n\n# print(result[0:1].where(result[0:1] == 0).dropna(axis=1))\n\nprint('Number of rows with at least two col

In [266]:
""" from scipy.spatial.distance import squareform

# Create a pandas DataFrame to represent the distance matrix
df_dist = pd.DataFrame(squareform(dist_matrix), columns=np.arange(categories_df.shape[0]), index=np.arange(categories_df.shape[0]))

# Print the DataFrame with observation labels
obs_labels = ['Obs{}'.format(i) for i in range(categories_df.shape[0])]
df_dist.index = obs_labels
df_dist.columns = obs_labels
display(df_dist.head())

# show only rows with at least two columns with value 0
mask = (df_dist == 0).sum(axis=1) >= 2
result = df_dist.loc[mask].dropna(axis=1)
display(result.head())

# print(result[0:1].where(result[0:1] == 0).dropna(axis=1))

print('Number of rows with at least two columns with value 0: ', result.shape[0], ' out of ', df_dist.shape[0]) """

" from scipy.spatial.distance import squareform\n\n# Create a pandas DataFrame to represent the distance matrix\ndf_dist = pd.DataFrame(squareform(dist_matrix), columns=np.arange(categories_df.shape[0]), index=np.arange(categories_df.shape[0]))\n\n# Print the DataFrame with observation labels\nobs_labels = ['Obs{}'.format(i) for i in range(categories_df.shape[0])]\ndf_dist.index = obs_labels\ndf_dist.columns = obs_labels\ndisplay(df_dist.head())\n\n# show only rows with at least two columns with value 0\nmask = (df_dist == 0).sum(axis=1) >= 2\nresult = df_dist.loc[mask].dropna(axis=1)\ndisplay(result.head())\n\n# print(result[0:1].where(result[0:1] == 0).dropna(axis=1))\n\nprint('Number of rows with at least two columns with value 0: ', result.shape[0], ' out of ', df_dist.shape[0]) "

In [267]:
""" # show hist plot for mean_rating-genres
fig, axs = plt.subplots(10, 2, figsize=(15, 20))
# create a df with only mean_rating and genres where genres are 1
cat = list(categories)
for category in cat:
    df_genre = df[df[category] == 1]
    if cat.index(category) < 10:
        sns.kdeplot(df_genre['mean_rating'], ax=axs[cat.index(category), 0], label=category)
    else:
        sns.kdeplot(df_genre['mean_rating'], ax=axs[cat.index(category)-10, 1], label=category)

plt.show() """


" # show hist plot for mean_rating-genres\nfig, axs = plt.subplots(10, 2, figsize=(15, 20))\n# create a df with only mean_rating and genres where genres are 1\ncat = list(categories)\nfor category in cat:\n    df_genre = df[df[category] == 1]\n    if cat.index(category) < 10:\n        sns.kdeplot(df_genre['mean_rating'], ax=axs[cat.index(category), 0], label=category)\n    else:\n        sns.kdeplot(df_genre['mean_rating'], ax=axs[cat.index(category)-10, 1], label=category)\n\nplt.show() "

## Data Preprocessing

In [268]:
# from title extract year
df['year'] = df['title'].str.extract('(\(\d{4}\))', expand=True)
# remove parentheses
df['year'] = df['year'].str.extract('(\d{4})', expand=True)
# convert year to int
df['year'] = df['year']

# remove movies with no year
df = df.dropna(subset=['year'])

# delete title and movieId columns
df = df.drop(columns=['movieId', 'title'])

display(df.head())

Unnamed: 0,mean_rating,rating_count,mean_timestamp,absurd,action,adaptation,adapted from:book,adventure,affectionate,allegory,...,Animation,Adventure,Romance,Film-Noir,Drama,Comedy,Action,IMAX,Musical,year
0,3.893708,57309,1153152210,0.104,0.6625,0.31675,0.286,0.89375,0.67625,0.246,...,1,1,0,0,0,1,0,0,0,1995
1,3.251527,24228,1122310117,0.15925,0.64025,0.5145,0.4845,0.976,0.12675,0.1775,...,0,1,0,0,0,0,0,0,0,1995
2,3.142028,11804,980602256,0.11375,0.16025,0.252,0.19375,0.3215,0.0955,0.10775,...,0,0,1,0,0,1,0,0,0,1995
3,2.853547,2523,942460471,0.13375,0.147,0.507,0.46175,0.14875,0.1315,0.0775,...,0,0,1,0,1,1,0,0,0,1995
4,3.058434,11714,1004723013,0.15475,0.15575,0.28925,0.198,0.1635,0.11875,0.06975,...,0,0,0,0,0,1,0,0,0,1995


In [269]:
""" # density plot for film year
sns.kdeplot(df['year'])
plt.show() """

" # density plot for film year\nsns.kdeplot(df['year'])\nplt.show() "

### Correlation results
This correlation table shows the relationship between different movie genres and several movie characteristics such as mean rating, rating count, and year of release. The values in the table represent the Pearson correlation coefficient, which ranges from -1 to 1 and measures the linear association between two variables. Positive values indicate a positive association, meaning that as one variable increases, the other variable also increases. Negative values indicate a negative association, meaning that as one variable increases, the other variable decreases.

Some observations from the table:

There is a positive correlation between the mean rating and rating count of a movie (0.13). This indicates that movies with higher ratings tend to have more ratings.
Film-Noir has a moderate positive correlation with Crime (0.15) and Mystery (0.06). This suggests that movies classified as Film-Noir often have elements of crime and mystery.
Horror movies have a negative correlation with mean rating (-0.22), indicating that they tend to have lower ratings.
Drama movies have a moderate positive correlation with mean rating (0.15) and a moderate positive correlation with Thriller movies (0.24).
There is a negative correlation between year of release and rating count (-0.27), which suggests that older movies tend to have fewer ratings.
In conclusion, this table provides useful information about the relationship between different movie genres and movie characteristics, and can be used to make informed decisions about movie selection and production.


### Covariance results
Covariance is a measure of the linear relationship between two variables. It measures how changes in one variable are associated with changes in the other variable. Covariance is expressed as a numerical value and can range from negative to positive values. A positive covariance means that the two variables are positively related, while a negative covariance means that they are inversely related. A covariance of zero means that there is no linear relationship between the variables (i.e. variables are indipendent).

In the table provided, the covariance between two variables can be found in the entries of the matrix. For example, the covariance between "mean_rating" and "Film-Noir" is 0.001774, indicating a very small positive relationship between the two variables. The covariance between "rating_count" and "Crime" is 43.331685, indicating a stronger positive relationship between these two variables.

It's important to keep in mind that covariance only measures linear relationships, so it may not be able to fully capture more complex relationships between variables.

As it's possible to see in the table, variables tend to be indipendent from each others, this is not valid only for two variables: rating_count and mean_timestamp. These two attributes are highly dipendent with all other attributes.

### Dataset Describe
The describe() function applied on the Pandas DataFrame provides statistical information about the features in the DataFrame. According to the results, the mean rating for the movies is 3.110263, with a standard deviation of 0.653225. The minimum and maximum ratings are 0.5 and 5, respectively. The mean count of ratings for each movie is 596.81, with a standard deviation of 2929.96.

The genre columns (Film-Noir, Crime, Mystery, Animation, etc.) are binary, with a mean of either 0 or 0.01 to 0.28. The 25th, 50th, and 75th percentiles for these features are either 0 or close to 0, indicating that the majority of the movies belong to a particular genre. The year_timestamp feature has a mean of 2014.22, with a standard deviation of 4.7. The year feature has a mean of 1991.57 and a standard deviation of 25.09.

In [270]:
""" # compute analysis between attributes of the dataset
corr = df.corr()
print(corr)
cov = df.cov()
print(cov)
desc = df.describe()
print(desc)

# follow examples from slides on Data Visualization (pages 6-8)
fig, axs = plt.subplots(2, 1, figsize=(30, 30))
# show results of correlation in a graphic way
# Plot the heatmap of the correlation matrix
sns.heatmap(corr, annot=True, cmap="YlGnBu", ax=axs[0])
# show results of covariance in a graphic way
# Plot the heatmap of the covariance matrix
sns.heatmap(cov, annot=True, cmap="YlGnBu", ax=axs[1], vmax=1, vmin=-1)

plt.show()

# Plot the table
plt.figure(figsize=(30, 5))
sns.heatmap(desc, annot=True, cmap='Reds', vmax=1, vmin=0)
plt.show()
 """

' # compute analysis between attributes of the dataset\ncorr = df.corr()\nprint(corr)\ncov = df.cov()\nprint(cov)\ndesc = df.describe()\nprint(desc)\n\n# follow examples from slides on Data Visualization (pages 6-8)\nfig, axs = plt.subplots(2, 1, figsize=(30, 30))\n# show results of correlation in a graphic way\n# Plot the heatmap of the correlation matrix\nsns.heatmap(corr, annot=True, cmap="YlGnBu", ax=axs[0])\n# show results of covariance in a graphic way\n# Plot the heatmap of the covariance matrix\nsns.heatmap(cov, annot=True, cmap="YlGnBu", ax=axs[1], vmax=1, vmin=-1)\n\nplt.show()\n\n# Plot the table\nplt.figure(figsize=(30, 5))\nsns.heatmap(desc, annot=True, cmap=\'Reds\', vmax=1, vmin=0)\nplt.show()\n '

In [271]:
""" # show scatter plot for mean_rating-attributes

fig, axs = plt.subplots(2, 2, figsize=(15, 10))
sns.scatterplot(x='rating_count', y='mean_rating', data=df, ax=axs[0, 0])
sns.scatterplot(x='year', y='mean_rating', data=df, ax=axs[0, 1])
sns.scatterplot(x='year_timestamp', y='mean_rating', data=df, ax=axs[1, 0])
plt.show()
 """

" # show scatter plot for mean_rating-attributes\n\nfig, axs = plt.subplots(2, 2, figsize=(15, 10))\nsns.scatterplot(x='rating_count', y='mean_rating', data=df, ax=axs[0, 0])\nsns.scatterplot(x='year', y='mean_rating', data=df, ax=axs[0, 1])\nsns.scatterplot(x='year_timestamp', y='mean_rating', data=df, ax=axs[1, 0])\nplt.show()\n "

In [272]:
# remove rating_count column
df = df.drop(columns=['rating_count'])

display(df.head())

Unnamed: 0,mean_rating,mean_timestamp,absurd,action,adaptation,adapted from:book,adventure,affectionate,allegory,art,...,Animation,Adventure,Romance,Film-Noir,Drama,Comedy,Action,IMAX,Musical,year
0,3.893708,1153152210,0.104,0.6625,0.31675,0.286,0.89375,0.67625,0.246,0.20125,...,1,1,0,0,0,1,0,0,0,1995
1,3.251527,1122310117,0.15925,0.64025,0.5145,0.4845,0.976,0.12675,0.1775,0.1495,...,0,1,0,0,0,0,0,0,0,1995
2,3.142028,980602256,0.11375,0.16025,0.252,0.19375,0.3215,0.0955,0.10775,0.09525,...,0,0,1,0,0,1,0,0,0,1995
3,2.853547,942460471,0.13375,0.147,0.507,0.46175,0.14875,0.1315,0.0775,0.17275,...,0,0,1,0,1,1,0,0,0,1995
4,3.058434,1004723013,0.15475,0.15575,0.28925,0.198,0.1635,0.11875,0.06975,0.09725,...,0,0,0,0,0,1,0,0,0,1995


In [273]:
""" # TODO - balancing dataset only for training set
# remove samples_to_drop movies with mean_rating between 2.5 and 4
samples_to_drop = 25000
df_preprocessed = df.copy()
df_preprocessed = df_preprocessed.drop(df_preprocessed[(df_preprocessed['mean_rating'] >= 2.5) & (df_preprocessed['mean_rating'] <= 4)].sample(samples_to_drop).index)

# rating distribution from df
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
sns.kdeplot(df['mean_rating'], ax=axs[0])
sns.kdeplot(df_preprocessed['mean_rating'], ax=axs[1])
plt.xlabel('rating mean for film')
plt.title('Rating distribution in dataframe')
plt.show()

# number of samples in df and df_preprocessed
print('Number of samples in df: ', df.shape[0])
print('Number of samples in df_preprocessed: ', df_preprocessed.shape[0]) """


" # TODO - balancing dataset only for training set\n# remove samples_to_drop movies with mean_rating between 2.5 and 4\nsamples_to_drop = 25000\ndf_preprocessed = df.copy()\ndf_preprocessed = df_preprocessed.drop(df_preprocessed[(df_preprocessed['mean_rating'] >= 2.5) & (df_preprocessed['mean_rating'] <= 4)].sample(samples_to_drop).index)\n\n# rating distribution from df\nfig, axs = plt.subplots(1, 2, figsize=(10, 5))\nsns.kdeplot(df['mean_rating'], ax=axs[0])\nsns.kdeplot(df_preprocessed['mean_rating'], ax=axs[1])\nplt.xlabel('rating mean for film')\nplt.title('Rating distribution in dataframe')\nplt.show()\n\n# number of samples in df and df_preprocessed\nprint('Number of samples in df: ', df.shape[0])\nprint('Number of samples in df_preprocessed: ', df_preprocessed.shape[0]) "

In [274]:
""" # Compute the number of unique values for each column
unique_values = {}
for column in df.columns:
    if column == 'mean_rating' or column == 'mean_timestamp' or column == 'year':
        unique_values[column] = np.unique(df[column], return_counts=True)

# Print the number of unique values for each column
for column in unique_values:
    print(column, ':', unique_values[column])

# Plot the number of unique values for each column
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
sns.lineplot(x=unique_values['mean_rating'][0], y=unique_values['mean_rating'][1], ax=axs[0,0])
axs[0,0].set_xlabel('mean rating')
axs[0,0].set_ylabel('number of unique values')
sns.lineplot(x=unique_values['mean_timestamp'][0], y=unique_values['mean_timestamp'][1], ax=axs[0,1])
axs[0,1].set_xlabel('mean timestamp')
axs[0,1].set_ylabel('number of unique values')
sns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[1,0])
axs[1,0].set_xlabel('year')
axs[1,0].set_ylabel('number of unique values')
plt.show()
 """

" # Compute the number of unique values for each column\nunique_values = {}\nfor column in df.columns:\n    if column == 'mean_rating' or column == 'mean_timestamp' or column == 'year':\n        unique_values[column] = np.unique(df[column], return_counts=True)\n\n# Print the number of unique values for each column\nfor column in unique_values:\n    print(column, ':', unique_values[column])\n\n# Plot the number of unique values for each column\nfig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))\nsns.lineplot(x=unique_values['mean_rating'][0], y=unique_values['mean_rating'][1], ax=axs[0,0])\naxs[0,0].set_xlabel('mean rating')\naxs[0,0].set_ylabel('number of unique values')\nsns.lineplot(x=unique_values['mean_timestamp'][0], y=unique_values['mean_timestamp'][1], ax=axs[0,1])\naxs[0,1].set_xlabel('mean timestamp')\naxs[0,1].set_ylabel('number of unique values')\nsns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[1,0])\naxs[1,0].set_xlabel('year')\naxs[1,0].

In [275]:
""" # show year and year_timestamp in the same plot with values from 2000 to 2020
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.lineplot(x=unique_values['year_timestamp'][0], y=unique_values['year_timestamp'][1], ax=axs[0], label='timestamp year')
sns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[0], label='film year')
axs[0].set_xlabel('year')
axs[0].set_ylabel('number of unique values')
sns.lineplot(x=unique_values['year_timestamp'][0], y=unique_values['year_timestamp'][1], ax=axs[1], label='timestamp year')
sns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[1], label='film year')
axs[1].set_xlabel('year')
axs[1].set_ylabel('number of unique values')
axs[1].set_xlim(2000, 2020)
axs[1].set_xticks(np.arange(2000, 2022, 4))
plt.show() """


" # show year and year_timestamp in the same plot with values from 2000 to 2020\nfig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))\nsns.lineplot(x=unique_values['year_timestamp'][0], y=unique_values['year_timestamp'][1], ax=axs[0], label='timestamp year')\nsns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[0], label='film year')\naxs[0].set_xlabel('year')\naxs[0].set_ylabel('number of unique values')\nsns.lineplot(x=unique_values['year_timestamp'][0], y=unique_values['year_timestamp'][1], ax=axs[1], label='timestamp year')\nsns.lineplot(x=unique_values['year'][0], y=unique_values['year'][1], ax=axs[1], label='film year')\naxs[1].set_xlabel('year')\naxs[1].set_ylabel('number of unique values')\naxs[1].set_xlim(2000, 2020)\naxs[1].set_xticks(np.arange(2000, 2022, 4))\nplt.show() "

In [276]:
# Drop columns that are in cat but not in lower_case_tags
lower_case_tags = [str.lower(t) for t in genome_tags['tag'].values]
cat = list(categories)
for c in cat:
    if c.lower() in lower_case_tags:
        df = df.drop(columns=[c])
        print('Dropped column', c)

display(df.head())


Dropped column Sci-Fi
Dropped column Crime
Dropped column Thriller
Dropped column Horror
Dropped column Fantasy
Dropped column Children
Dropped column War
Dropped column Western
Dropped column Mystery
Dropped column Documentary
Dropped column Animation
Dropped column Adventure
Dropped column Romance
Dropped column Drama
Dropped column Comedy
Dropped column Action
Dropped column Musical


Unnamed: 0,mean_rating,mean_timestamp,absurd,action,adaptation,adapted from:book,adventure,affectionate,allegory,art,...,visually stunning,weapons,weird,whimsical,witty,women,writers,Film-Noir,IMAX,year
0,3.893708,1153152210,0.104,0.6625,0.31675,0.286,0.89375,0.67625,0.246,0.20125,...,0.67325,0.26375,0.427,0.587,0.694,0.08925,0.14125,0,0,1995
1,3.251527,1122310117,0.15925,0.64025,0.5145,0.4845,0.976,0.12675,0.1775,0.1495,...,0.217,0.178,0.3865,0.2925,0.18725,0.13525,0.12225,0,0,1995
2,3.142028,980602256,0.11375,0.16025,0.252,0.19375,0.3215,0.0955,0.10775,0.09525,...,0.0985,0.14125,0.2435,0.13025,0.22325,0.35075,0.122,0,0,1995
3,2.853547,942460471,0.13375,0.147,0.507,0.46175,0.14875,0.1315,0.0775,0.17275,...,0.11625,0.2895,0.21975,0.14775,0.10725,0.97525,0.182,0,0,1995
4,3.058434,1004723013,0.15475,0.15575,0.28925,0.198,0.1635,0.11875,0.06975,0.09725,...,0.11875,0.12025,0.2935,0.16425,0.10475,0.40225,0.19225,0,0,1995


### Normalization

In [277]:
# TODO - fit the model to raw, scaled and standardized data and compare the performance for best results

mean_rating_column = df['mean_rating']

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df.drop(columns=['mean_rating']))

# show normalized data
df = pd.DataFrame(normalized_data, columns=df.columns[:-1])
df['mean_rating'] = mean_rating_column
df = df.dropna()
display(df.shape)


(41589, 203)

### PCA (Principal Component Analysis)

In [278]:
""" df_PCA = df.copy()

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['mean_rating']), mean_rating_column, test_size=0.1, random_state=42)
print('X shape:',df.drop(columns=['mean_rating']).shape)
print('y shape:',mean_rating_column.shape)
print('X_train shape:',X_train.shape)
print('y_train shape:',y_train.shape)
print('X_test shape:',X_test.shape)
print('y_test shape:',y_test.shape)

pca = decomposition.PCA()
pca.fit(X_train)
print('pca.mean_:', pca.mean_)
print('pca.explained_variance_:', pca.explained_variance_)
print('pca.explained_variance_ratio_:', pca.explained_variance_ratio_)
print('pca.components_:', pca.components_)

X_train_t = pca.transform(X_train)
plt.scatter(X_train_t[:, 0], X_train_t[:, 1], c=y_train)
plt.show()
X_test_t = pca.transform(X_test)
plt.scatter(X_test_t[:, 0], X_test_t[:, 1], c=y_test)
plt.show() """


" df_PCA = df.copy()\n\nX_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['mean_rating']), mean_rating_column, test_size=0.1, random_state=42)\nprint('X shape:',df.drop(columns=['mean_rating']).shape)\nprint('y shape:',mean_rating_column.shape)\nprint('X_train shape:',X_train.shape)\nprint('y_train shape:',y_train.shape)\nprint('X_test shape:',X_test.shape)\nprint('y_test shape:',y_test.shape)\n\npca = decomposition.PCA()\npca.fit(X_train)\nprint('pca.mean_:', pca.mean_)\nprint('pca.explained_variance_:', pca.explained_variance_)\nprint('pca.explained_variance_ratio_:', pca.explained_variance_ratio_)\nprint('pca.components_:', pca.components_)\n\nX_train_t = pca.transform(X_train)\nplt.scatter(X_train_t[:, 0], X_train_t[:, 1], c=y_train)\nplt.show()\nX_test_t = pca.transform(X_test)\nplt.scatter(X_test_t[:, 0], X_test_t[:, 1], c=y_test)\nplt.show() "

## Modeling

In [279]:
# Create Linear Regression model in sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split data
df2 = df.copy()
X = df2.drop(columns=['mean_rating'])
y = df['mean_rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data
y_pred = model.predict(X_test)
print('Coefficients: ', model.coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

display(y_test[:5])
display(y_pred[:5])


Coefficients:  [ 1.72665509e-01  9.00269703e-02 -3.20796324e-02  1.33438735e-01
 -6.42622497e-02  2.14736984e-03  3.03525948e-02 -7.75331504e-02
  3.16371451e-02  3.53797081e-02 -3.27407340e-01 -2.07066620e-02
 -5.98115975e-02  8.63797190e-02  1.68308458e-01  3.80451295e-02
  1.59389780e-01 -4.36577040e-02  9.01075215e-02 -7.86008385e-02
 -2.23935548e-02  1.26834520e-02  2.95585643e-01  6.52664767e-03
  4.43012390e-02  1.11396047e-02 -1.35269197e-01 -1.65363202e-01
 -2.27109142e-02  4.34621254e-03 -3.25722250e-02 -1.42527355e-01
 -3.72545562e-02  2.25593524e-02 -3.84366887e-02  4.14867364e-02
  9.92137880e-02  1.82225371e-02 -8.57630186e-02  6.06194143e-02
  2.21815265e-02  3.98767812e-02  1.69483311e-02  9.22952140e-02
  5.09983176e-02  1.27741159e-01 -2.08264965e-02 -1.64341502e-01
  3.19394174e-02 -3.05211763e-01  5.54902054e-02 -1.25858247e-02
  1.60807035e-01 -1.67663625e-01  7.45449817e-02  9.58968639e-02
 -2.37513880e-02  1.84107190e-02  1.86197748e-02 -2.84208590e-02
 -7.830606

  display(y_test[:5])


23110    3.700000
18561    3.343558
14277    2.375000
2322     3.774834
216      2.691514
Name: mean_rating, dtype: float64

array([3.05490225, 2.77550678, 3.0553784 , 3.63025326, 2.78354732])