<a href="https://colab.research.google.com/github/magikarp01/SIFNetflix/blob/master/SIF_Netflix_Dataset_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A notebook with code for predicting the quality of a movie or series. The notebook uses multiple regression techniques and neural networks in order to handle different kinds of predictive input.



In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import math
import scipy

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

In [3]:
# read dataset into dataframe
df = pd.read_excel("Netflix Dataset Latest 2021.xlsx")

In [4]:
# stratify data by movie or series, since they are judged very differently
movie_dataframe = df[df["Series or Movie"] == "Movie"].reset_index()
series_dataframe = df[df["Series or Movie"] == "Series"].reset_index()

First, a multiple regression between the genres a movie/series is part of and the IMDb score

In [5]:
# functional because want to be able to work with either movie_dataframe or series_dataframe
# returns x_data, y_data
# x_data is 2d np array of whether each movie/series has a particular genre
# y_data is 1d np array of imdb reviews
def preprocess_genre_regression(dataframe):
  genre_column = dataframe["Genre"]
  imdb_column = dataframe["IMDb Score"]
  # a list of the (string) genres
  genre_list = []

  # a list of imdb reviews for each entry
  imdb_reviews = []

  # a list of the genres for each entry
  genre_data = []
  
  # process data to handle empty/bad rows
  for i in range(len(genre_column)):
    genre_cell = genre_column[i]
    if genre_cell is not None and imdb_column[i] is not None and not math.isnan(imdb_column[i]):
        try:
            cell_genres = genre_cell.split(", ")
            genre_data.append(cell_genres)
            imdb_reviews.append(imdb_column[i])
        except AttributeError:
            continue

        for genre in cell_genres:
            if genre not in genre_list:
                genre_list.append(genre)

  print(genre_list)
  genredict = {k: v for v, k in enumerate(genre_list)}

  # x_data is a 2d array, each row is an array of binary entries
  # each binary entry corresponds to if movie/series is part of a genre 
  # 0 if no and 1 if yes
  x_data = []
  for entry in genre_data:
      entry_genres = [0]*len(genre_list)
      for genre in entry:
          entry_genres[genredict[genre]] = 1
      x_data.append(entry_genres)
  x_data = np.array(x_data)
  y_data = np.array(imdb_reviews)/10

  return x_data, y_data, genre_list


In [6]:
# Perform the multiple regression on the processed data from previous function
def train_genre_regression(x_data, y_data, test_size=0.2, random_state=101, print_output = False):
  X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=test_size, random_state=random_state)

  # creating a regression model
  model = LinearRegression()

  # fitting the model
  model.fit(X_train, y_train)

  # making predictions
  predictions = model.predict(X_test)

  # model evaluation
  print('mean_squared_error : ', mean_squared_error(y_test, predictions))
  # print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
  print(f'model R^2: {model.score(X_test, y_test)}')
  # print(f'model coefficients: {model.coef_}')

  if print_output:
      for i in range(len(y_test)):
        print(f"predicts {predictions[i]}, actual review is {y_test[i]}")

  return model

def model_output(model, genre_list):
  print(model.summary) 
  coef_dic = dict(zip(genre_list, model.coef_))
  for k, v in coef_dic.items():
      print(f"For genre {k}, the coefficient is {v}") 

In [7]:
# put it all together
def genre_regression(dataframe):
  x_data, y_data, genre_list = preprocess_genre_regression(dataframe)
  model = train_genre_regression(x_data, y_data)
  model_output(model, genre_list)
  return model

genre_regression(movie_dataframe)
# genre_regression(series_dataframe)

['Comedy', 'Romance', 'Drama', 'Crime', 'Fantasy', 'Mystery', 'Thriller', 'Short', 'Action', 'Adventure', 'Sci-Fi', 'Music', 'Family', 'Biography', 'Animation', 'War', 'History', 'Documentary', 'Horror', 'Film-Noir', 'Sport', 'Western', 'Musical', 'Reality-TV', 'Adult', 'News', 'Talk-Show']
mean_squared_error :  0.006606608489484805
model R^2: 0.16982544975353442
For genre Comedy, the coefficient is -0.01602164088416835
For genre Romance, the coefficient is -0.012631617961811675
For genre Drama, the coefficient is 0.02420954061188718
For genre Crime, the coefficient is 0.009489141835484157
For genre Fantasy, the coefficient is -0.0009438763731072829
For genre Mystery, the coefficient is 0.003640220577919371
For genre Thriller, the coefficient is -0.02632799540434819
For genre Short, the coefficient is 0.03233820605460993
For genre Action, the coefficient is -0.0168781347073299
For genre Adventure, the coefficient is -0.005170742785502887
For genre Sci-Fi, the coefficient is -0.00447804

LinearRegression()

Another model, determining whether runtime has an affect on the quality of the movie

In [8]:
def runtime_stratification(dataframe):
  runtime_column = dataframe["Runtime"]
  imdb_column = dataframe["IMDb Score"]
  
  # runtime_data is list of 4 arrays, each array has all imdb reviews for one runtime
  # 0 corresponding to <30 mins, 1 for 30-60 mins, 2 for 1-2 hour, 3 for > 2 hrs

  runtime_data = [[], [], [], []]
  # possible values in the runtime cell
  possible_runtimes = {"< 30 minutes": 0, "30-60 mins": 1, "1-2 hour": 2, "> 2 hrs": 3}
  # process data to handle empty/bad rows
  for i in range(len(runtime_column)):
    runtime_cell = runtime_column[i]
    if not pd.isnull(runtime_cell) and not pd.isnull(imdb_column[i]):
    # and runtime_cell is not None and imdb_column[i] is not None \
    # and not math.isnan(imdb_column[i]) and str(runtime_cell) != 'nan':
    # if runtime_cell is not None \
    #  and imdb_column[i] is not None and not math.isnan(imdb_column[i]):
# and not math.isnan(runtime_cell)
        try:
            runtime_data[possible_runtimes[runtime_cell]].append(imdb_column[i])
        except AttributeError:
            continue

  runtime_names = list(possible_runtimes.keys())
  for i in range(4):
    runtime_data[i] = [x for x in runtime_data[i] if np.isnan(x) == False]

  for i in range(4):
    if len(runtime_data[i]) == 0:
      print(f"No entries have runtime {runtime_names[i]}")
    else:
      print(f"For runtime {runtime_names[i]}, average IMDb score is {np.average(runtime_data[i])}, " +
            f"variance in IMDb score is {np.var(runtime_data[i])}")

  return runtime_data


In [9]:
print("Movies:")
movie_runtimes = runtime_stratification(movie_dataframe)
print()
print("Series:")
nan_runtime = runtime_stratification(series_dataframe)

for i in nan_runtime:
  print(i)

Movies:
For runtime < 30 minutes, average IMDb score is 7.076344086021504, variance in IMDb score is 0.46417158052954105
For runtime 30-60 mins, average IMDb score is 7.133088235294117, variance in IMDb score is 0.3467728157439447
For runtime 1-2 hour, average IMDb score is 6.632261768082665, variance in IMDb score is 0.7887142491200274
For runtime > 2 hrs, average IMDb score is 7.1072398190045245, variance in IMDb score is 0.5618221810130247

Series:
For runtime < 30 minutes, average IMDb score is 7.543082021541011, variance in IMDb score is 0.4634711970835687
No entries have runtime 30-60 mins
No entries have runtime 1-2 hour
No entries have runtime > 2 hrs


Let's find out how significant these results are. We can run t-tests on the different runtimes for movies (not on series, there are no different runtimes here)

In [None]:
# 4 t-tests, seeing if each runtime's imdb reviews are significantly different
# from the all runtime imdb reviews

all_reviews = movie_runtimes[0] + movie_runtimes[1] + movie_runtimes[2] + movie_runtimes[3]
ttests = [scipy.stats.ttest_ind(all_reviews, movie_runtimes[i]) for i in range(4)]


Time for something more complex. You know when you read a summary for a show/movie, and you know that it's just gonna be terrible? We could try to predict the IMDb score from the summary alone, using NLP techniques and a neural network.

In [10]:
# encode summary into a 512-dimensional vector, that preserves semantics and meaning
# save in np array
def preprocess_summary_prediction(dataframe, hub_layer):
  summary_column = dataframe["Summary"]
  imdb_column = dataframe["IMDb Score"]

  # a list of imdb reviews for each entry
  imdb_reviews = []

  # a 2d numpy array of the encoded summaries
  summary_data = []
  
  # process data to handle empty/bad rows
  for i in range(len(summary_column)):
    # print(i)
    summary_cell = summary_column[i]
    if not pd.isnull(summary_cell) and not pd.isnull(imdb_column[i]):
        try:
            # encoded_summary = hub_layer([summary_cell])
            # summary_data.append(encoded_summary)
            summary_data.append([summary_cell])
            imdb_reviews.append(imdb_column[i])
        except AttributeError:
            continue

  summary_data = np.array(summary_data)
  imdb_reviews = np.array(imdb_reviews)/10
  return summary_data, imdb_reviews

In [11]:
# preprocess data

embedding = "https://tfhub.dev/google/universal-sentence-encoder/4"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
summary_data, imdb_reviews = preprocess_summary_prediction(movie_dataframe, hub_layer)

In [12]:
print(summary_data.shape)

(6998, 1)


In [13]:
model = tf.keras.Sequential()
# model.add()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='sigmoid'))
model.add(tf.keras.layers.Dense(1, activation='relu'))
model.summary()
config = model.get_config() # Returns pretty much every information about your model
print(config["layers"][0]["config"]["batch_input_shape"]) # returns a tuple of width, height and channels

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 16)                8208      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 256,806,049
Trainable params: 256,806,049
Non-trainable params: 0
_________________________________________________________________
(None,)


In [14]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['accuracy'])

In [15]:
print(imdb_reviews)
print(summary_data)
print(summary_data.shape)

[0.58 0.74 0.67 ... 0.62 0.73 0.78]
[['When nerdy Johanna moves to London, things get out of hand when she reinvents herself as a bad-mouthed music critic to save her poverty-stricken family.']
 ['After her ex-boyfriend cons her out of a large sum of money, a former bank employee tricks a scam artist into helping her swindle him in retaliation.']
 ['An unhappily married farm worker struggling to care for her children reflects on her lost youth and the scandalous moment that cost her true love.']
 ...
 ['Computer users across the globe log onto the virtual world of Second Life. But some users lives are dramatically consumed by this alternate reality.']
 ['In an idyllic port town on Australias west coast in the summer of 1969, carefree 16-year-old Willie enjoys hanging out with his pals and wooing a beautiful singer named Rosie -- until his mom ships him back to a Catholic boarding school in Perth.']
 ['In his third show, Daniël Arends argues that good deeds are a form of self interest, 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    summary_data, imdb_reviews, test_size=.2, random_state=101)
# print(X_train.shape)
# print(X_test.shape)

X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=.2, random_state=101)

print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)

y_train = np.array(y_train).astype('float32')
y_test = np.array(y_test).astype('float32')
y_validation = np.array(y_validation).astype('float32')

print(y_train.shape)
print(y_test.shape)
print(y_validation.shape)


(4478, 1)
(1120, 1)
(1400, 1)
(4478,)
(1400,)
(1120,)


In [None]:
history = model.fit(x=X_train, y=y_train,
                    epochs=5,
                    validation_data=(X_validation, y_validation),
                    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 21/140 [===>..........................] - ETA: 6:05 - loss: 0.4682 - accuracy: 0.0000e+00