# Imports & Getting the data

In [97]:
from random import gauss as gs, uniform as uni, seed
import numpy as np
import pandas as pd
from datetime import datetime

import statistics as stats
from sklearn.naive_bayes import LabelBinarizer
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

In [98]:
movies = pd.read_csv('Data/movies.csv')
ratings = pd.read_csv('Data/ratings.csv')
tags = pd.read_csv('Data/tags.csv')

## Examining & Cleaning the Data!

The movie and rating CSVs are fairly self-explainatory (providing movies, reviews, and users), so we'll start there.

### Movies

In [99]:
#Problems: Year is tied into the title, no reviews, genres are a pipe separated list, ids are present, but not the indexes 
movies.set_index('movieId', inplace=True)
movies.sample(5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
118862,Closer to the Moon (2013),Comedy|Drama
5588,"Hills Have Eyes, The (1977)",Horror
8730,To End All Wars (2001),Action|Drama|War
436,Color of Night (1994),Drama|Thriller
3726,Assault on Precinct 13 (1976),Action|Thriller


In [100]:
#Get the year from title
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True).astype(str)
movies['title'] = movies.title.str[:-7]

In [101]:
#Add the mean rating for each movie 
ratings_movie_mean = ratings.groupby('movieId').mean()
mean_rating = ratings.groupby('movieId').mean()['rating']
movies['avg_rating'] = mean_rating
movies['num_reviews'] = ratings.groupby('movieId').count()['rating']

In [102]:
#make the genres a csv
def unpack_genres(string):
    s = string.split('|')
    return s

movies['genres'] = movies['genres'].map(unpack_genres)

In [103]:
#looks good! 
movies

Unnamed: 0_level_0,title,genres,year,avg_rating,num_reviews
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,3.920930,215.0
2,Jumanji,"[Adventure, Children, Fantasy]",1995,3.431818,110.0
3,Grumpier Old Men,"[Comedy, Romance]",1995,3.259615,52.0
4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,2.357143,7.0
5,Father of the Bride Part II,[Comedy],1995,3.071429,49.0
...,...,...,...,...,...
193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,4.000000,1.0
193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,3.500000,1.0
193585,Flint,[Drama],2017,3.500000,1.0
193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,3.500000,1.0


### Ratings 

In [104]:
#problem: timestamp not human readable 
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
4921,31,1353,4.0,850467078
2161,18,78499,4.0,1456744760
91765,596,533,3.0,1535721426
45490,299,3578,5.0,974620300
28907,199,33493,2.5,1119190953


In [105]:
#times now datetime 
ratings['timestamp'] = ratings['timestamp'].map(datetime.fromtimestamp)

In [106]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836
mean,326.127564,19435.295718,3.501557,2008-03-19 12:38:29.931839488
min,1.0,1.0,0.5,1996-03-29 13:36:55
25%,177.0,1199.0,3.0,2002-04-18 05:57:46
50%,325.0,2991.0,3.5,2007-08-02 16:31:02
75%,477.0,8122.0,4.0,2015-07-04 03:15:44.500000
max,610.0,193609.0,5.0,2018-09-24 10:27:30
std,182.618491,35530.987199,1.042529,


In [107]:
plot_df = ratings.groupby(by="rating").count().reindex()
plot_df = plot_df.rename(columns={"userId": "count"})
plot_df

Unnamed: 0_level_0,count,movieId,timestamp
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.5,1370,1370,1370
1.0,2811,2811,2811
1.5,1791,1791,1791
2.0,7551,7551,7551
2.5,5550,5550,5550
3.0,20047,20047,20047
3.5,13136,13136,13136
4.0,26818,26818,26818
4.5,8551,8551,8551
5.0,13211,13211,13211


In [185]:

fig = px.histogram(ratings, x='rating', 
                            text_auto=True,
                            labels={'x':'rating', 'y':'count'},
                            color='rating',
                            color_discrete_sequence=["SandyBrown", "Gold", 'Salmon', 'Coral', "OrangeRed", 'Gold', "SandyBrown", 'Salmon', "OrangeRed", 'Coral'],
                            )

fig.update_layout(
    title_text='Number of Ratings', # title of plot
    xaxis_title_text='Rating', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1, # gap between bars of the same location coordinates
)

fig.show()