# Loading, Enhancing and Visualizing the Netflix dataset

In [2]:
import pandas as pd
import numpy as np

from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

import gc # this will be useful to free up some memory after the pre-processing is done

# Enable plotly's offline notebook mode so that the py.iplot calls below can render figures inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of embedding it
# in the notebook -- confirm against the plotly documentation.
py.init_notebook_mode(connected=True)

### Loading the review data

Let's start by loading the dataset provided by Kaggle. In the description of the dataset (available [here](https://www.kaggle.com/netflix-inc/netflix-prize-data/version/1)), we learn that the review data is split between four files named `combined_data_{1, 2, 3, 4}.txt`, each of them formatted in the following way:

```
MovieId1:                  -- Header   \ 
UserId11,Rating11,Date11   \            | Movie Block
UserId12,Rating12,Date12    | Reviews   |
...                        /           /
MovieId2:
UserId21,Rating21,Date21
UserId22,Rating22,Date22
...
```

As you can see, this is not a simple CSV. We will need to transform these files into a dataset suitable for analysis.

In [2]:
# Let's start by loading the four parts of the dataset into a single dataframe.
# Every line is parsed with the same three columns; the movie header lines ("<movie_id>:")
# therefore end up in the 'user_id' column with a missing rating and date.
headers = ['user_id','rating','date']
part_paths = ['../dataset/combined_data_{}.txt'.format(part) for part in range(1, 5)]

# Concatenate all parts in one go instead of appending them one by one: DataFrame.append
# is deprecated (removed in pandas 2.0) and each append re-copies the growing dataframe.
reviews = pd.concat(
    (pd.read_csv(path, names=headers) for path in part_paths),
    ignore_index=True
)

print('Found {} rows in the Netflix dataset.'.format(len(reviews)))

Found 100498277 rows in the Netflix dataset.


Alright, we now have a large dataset (over 100M rows!) containing all of our reviews, together with each of the header rows containing the movie identifier of each movie block.

Our goal will be to remove the header rows from our dataset and instead add a `movie_id` column to our dataset to match each review to the movie being rated.

In [3]:
# We can start by finding the header rows present in the dataset. An example of such a row can
# be found at the very beginning of the dataset.
reviews.iloc[0]

user_id     1:
rating     NaN
date       NaN
Name: 0, dtype: object

In [4]:
# A header row only carries a movie identifier, so its rating is missing: selecting the
# rows without a rating gives us exactly one row per movie.
is_header_row = reviews['rating'].isna()
header_rows = reviews[is_header_row]
print('The Netflix dataset contains {} movies'.format(len(header_rows)))

The Netflix dataset contains 17770 movies


In [5]:
# Alright, we can now use the header rows to match reviews to their movies.

# Extract the movie id of every header row at once, removing the trailing ':' and
# parsing it as a number. The result keeps the header rows' positions as its index.
header_movie_ids = header_rows['user_id'].str[:-1].astype(int)

# Assigning the series aligns it on the index: header rows receive their movie id and
# every review row starts out as NaN (the column is therefore float64).
reviews['movie_id'] = header_movie_ids

# Each review belongs to the closest header above it, so a forward fill propagates the
# movie id through every block -- including the last one, which has no header after it.
reviews['movie_id'] = reviews['movie_id'].ffill()

# Finally, we can filter out the header rows from our dataset
reviews = reviews[~reviews['rating'].isna()]

# Sanity check: every remaining review must have been matched to a movie
assert reviews['movie_id'].notna().all(), 'Some reviews were not matched to a movie'

### Loading the movie data

In addition to the reviews, the dataset provides a file with information about each of the movies. It is a CSV file where each row represents a movie with the following columns: `movie_id`, `release_year`, `name`.

We will now load the dataset and enrich it with features generated from the ratings data.

In [128]:
# Load the movie catalogue into its own dataframe; the column names are supplied
# explicitly through `names`.
movie_columns = ['movie_id', 'release_year', 'name']
movies = pd.read_csv('../dataset/movie_titles.csv', names=movie_columns, encoding='ISO-8859-1')

# Index the movies by their id while keeping `movie_id` available as a regular column
movies = movies.set_index('movie_id', drop=False)

In [141]:
# Aggregate the ratings of each movie, then attach the interesting statistics to `movies`.
# The assignments align on the index, i.e. on the movie id.
reviews_by_movie = (
    reviews
    .groupby('movie_id')['rating']
    .agg(['count', 'mean', 'min', 'max', 'var'])
)

movie_features = [
    # How many reviews were given to the movie: information about its popularity
    ('reviews_count', 'count'),
    # What is the mean rating for the movie: information about its quality
    ('mean_rating', 'mean'),
    # What is the variance in the ratings of the movie: additional information about its quality
    ('var_rating', 'var'),
]
for feature, statistic in movie_features:
    movies[feature] = reviews_by_movie[statistic]

### Saving our processed data

We can now save our processed data before moving on to the next section. This will allow us later to simply load the data if needed without having to go through the pre-processing again.

In [143]:
# Before moving to exploratory analysis, we first save our processed data for further analysis.
# The destination is passed positionally: the `fname` keyword was deprecated in pandas 1.0
# (renamed to `path`) and later removed, whereas the positional form works on every version.
reviews.to_parquet('../dataset/reviews.parquet.gzip', compression='gzip')
movies.to_parquet('../dataset/movies.parquet.gzip', compression='gzip')

# and free up some memory: this only reclaims unreferenced temporaries left over from the
# pre-processing, since `reviews` and `movies` are still in use below
gc.collect()

0

### Exploring the data

Now that the data is loaded, we are ready to explore our data and hopefully gain some insights.

In [3]:
# If you have already prepared the data in an earlier session, uncomment the next two lines and
# run this cell to load the reviews and movies dataframes

# reviews = pd.read_parquet('../dataset/reviews.parquet.gzip')
# movies = pd.read_parquet('../dataset/movies.parquet.gzip')

In [77]:
# Draw one histogram per movie attribute on a 2x2 grid to get a feel for their distributions.
fig = tools.make_subplots(rows=2, cols=2, print_grid=False)

for i,key in enumerate(['release_year', 'reviews_count', 'mean_rating', 'var_rating']):
    hist = go.Histogram(x=movies[key],marker=dict(color='rgba(152, 0, 0, .8)'))
    # Fill the grid row by row: plot i goes to row i//2 + 1, column i%2 + 1 (both 1-based)
    fig.append_trace(hist, i//2 + 1, i%2 + 1)

fig['layout'].update(dict(
    title='Histograms for different movie attributes',
    showlegend=False,
    height=600, width=600,
    xaxis1=dict(title='Year of release'),
    xaxis2=dict(title='Number of ratings'),
    xaxis3=dict(title='Mean rating'),
    xaxis4=dict(title='Variance of ratings'),
    # Only the left-hand plot of each row gets a y-axis title
    yaxis=dict(title='Number of movies'),
    yaxis3=dict(title='Number of movies')
))

py.iplot(fig)

In [51]:
# Scatter plot of the mean rating of every movie against its year of release;
# hovering a point shows the name of the movie.
scatter = go.Scatter(
    x=movies['release_year'],
    y=movies['mean_rating'],
    text=movies['name'],
    mode='markers',
    marker={'size': 2, 'color': 'rgba(152, 0, 0, .8)'}
)

layout = go.Layout(
    title='Does the rating seem to correlate with the year of release?',
    xaxis={'title': 'Year of release'},
    yaxis={'title': 'Rating'},
    hovermode='closest'
)

fig = go.Figure(data=[scatter], layout=layout)
py.iplot(fig)

In [52]:
# Scatter plot of the mean rating of every movie against the number of reviews it received;
# hovering a point shows the name of the movie.
scatter = go.Scatter(
    x=movies['reviews_count'],
    y=movies['mean_rating'],
    text=movies['name'],
    mode='markers',
    marker={'size': 2, 'color': 'rgba(152, 0, 0, .8)'}
)

layout = go.Layout(
    title='Does the rating seem to correlate with the number of reviews?',
    xaxis={'title': 'Number of reviews'},
    yaxis={'title': 'Rating'},
    hovermode='closest'
)

fig = go.Figure(data=[scatter], layout=layout)
py.iplot(fig)

In [90]:
# First count the reviews given by each user, then count how many users share each
# review count: the result is indexed by "number of reviews given".
counts_per_user = reviews.groupby('user_id').count()
reviews_by_user = counts_per_user.groupby('rating')['rating'].agg(['count'])

hist = go.Bar(
    x=reviews_by_user.index,
    y=reviews_by_user['count'],
    marker={'color': 'rgb(152, 0, 0)'}
)

layout = go.Layout(
    title='Number of users per number of reviews given',
    xaxis={'title': 'Number of reviews given'},
    yaxis={'title': 'Number of users'}
)

fig = go.Figure(data=[hist], layout=layout)
py.iplot(fig)

In [77]:
# Count how many reviews were given for each possible rating value.
grouped_by_rating = reviews.groupby('rating')
reviews_by_rating = grouped_by_rating['user_id'].agg(['count'])

hist = go.Bar(
    x=reviews_by_rating.index,
    y=reviews_by_rating['count'],
    marker={'color': 'rgba(152, 0, 0, 0.8)'}
)

layout = go.Layout(
    title='Number of reviews per rating',
    xaxis={'title': 'Rating'},
    yaxis={'title': 'Number of reviews'}
)

fig = go.Figure(data=[hist], layout=layout)
py.iplot(fig)