In [1]:
# set the seed value for the notebook so results are reproducible
from numpy.random import seed
seed(42)

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
# read in the data
movies = pd.read_csv('moviesClean.csv')
movies_df = movies[["original_title", "year"]]

movies.head()

Unnamed: 0,original_title,year,genre,director,actors,budget,worlwide_gross_income,metascore,revenue_percent,Lalor Roddy,...,Robert Stromberg,Mark Andrews,Jordan Vogt-Roberts,Bradford Lewis,Dan Scanlon,Josh Cooley,Peter Sohn,Angus MacLane,Simon Kinberg,Nathan Greno
0,The Devil's Doorway,2018,Horror,Aislinn Clarke,"Lalor Roddy, Ciaran Flynn, Helena Bereen, Laur...",1.0,516660.0,48.0,51665900.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Reception,2005,"Comedy, Drama, Romance",John G. Young,"Maggie Burkwit, Chris Burmester, Darien Sills-...",5000.0,18389.0,64.0,267.78,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Following,1998,"Crime, Mystery, Thriller",Christopher Nolan,"Jeremy Theobald, Alex Haw, Lucy Russell, John ...",6000.0,48482.0,60.0,708.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,El mariachi,1992,"Action, Crime, Thriller",Robert Rodriguez,"Carlos Gallardo, Consuelo Gómez, Jaime de Hoyo...",7000.0,2040920.0,73.0,29056.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Primer,2004,"Drama, Sci-Fi, Thriller",Shane Carruth,"Shane Carruth, David Sullivan, Casey Gooden, A...",7000.0,545436.0,68.0,7691.94,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# drop text-based columns for the model
movies.drop(columns=['original_title', 'year', 'genre', 'director', 'actors',
                     'budget', 'worlwide_gross_income', 'metascore',
                     'revenue_percent' 
                       ], inplace=True)
movies.shape

(6379, 10477)

In [5]:
# from sklearn.datasets import make_classification

X = movies.drop('success', axis = 1)
y = movies['success']

y = y.values.reshape(-1, 1)

print(X.shape, y.shape)

(6379, 10476) (6379, 1)


In [6]:
# create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#   Data Pre-Processing

In [7]:
# scale the data
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

In [8]:
# scale both training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
from tensorflow.keras.utils import to_categorical

In [10]:
# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

  return f(**kwargs)


# Create the Model

In [12]:
# create a sequential model
from tensorflow.keras.models import Sequential

model = Sequential()

In [13]:
# add the first layer. The number of inputs must be equal to the
# number of columns
from tensorflow.keras.layers import Dense
number_inputs = 10476
number_hidden_nodes = 800
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [14]:
# add the final layer. number_classes is the number of labels to predict.
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

# Model Summary

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 800)               8381600   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1602      
Total params: 8,383,202
Trainable params: 8,383,202
Non-trainable params: 0
_________________________________________________________________


# Compile the Model

In [16]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: your output layer in this example is using software for logistic regression (categorical)
# If your output layer activation was `linear` then you may want to use `mse` for loss
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the Model

In [17]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

Train on 4784 samples
Epoch 1/50
4784/4784 - 8s - loss: 1.3926 - accuracy: 0.6089
Epoch 2/50
4784/4784 - 7s - loss: 0.2231 - accuracy: 0.9041
Epoch 3/50
4784/4784 - 7s - loss: 0.0955 - accuracy: 0.9653
Epoch 4/50
4784/4784 - 7s - loss: 0.0581 - accuracy: 0.9829
Epoch 5/50
4784/4784 - 7s - loss: 0.0357 - accuracy: 0.9910
Epoch 6/50
4784/4784 - 7s - loss: 0.0226 - accuracy: 0.9952
Epoch 7/50
4784/4784 - 7s - loss: 0.0158 - accuracy: 0.9967
Epoch 8/50
4784/4784 - 7s - loss: 0.0104 - accuracy: 0.9971
Epoch 9/50
4784/4784 - 7s - loss: 0.0082 - accuracy: 0.9983
Epoch 10/50
4784/4784 - 7s - loss: 0.0068 - accuracy: 0.9981
Epoch 11/50
4784/4784 - 7s - loss: 0.0064 - accuracy: 0.9979
Epoch 12/50
4784/4784 - 7s - loss: 0.0058 - accuracy: 0.9985
Epoch 13/50
4784/4784 - 7s - loss: 0.0056 - accuracy: 0.9985
Epoch 14/50
4784/4784 - 7s - loss: 0.0053 - accuracy: 0.9983
Epoch 15/50
4784/4784 - 7s - loss: 0.0058 - accuracy: 0.9979
Epoch 16/50
4784/4784 - 7s - loss: 0.0047 - accuracy: 0.9983
Epoch 17/50

<tensorflow.python.keras.callbacks.History at 0x23a52221d30>

# Quantify/Test the Model

In [18]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1595/1595 - 1s - loss: 2.4867 - accuracy: 0.5906
Loss: 2.4867476334018765, Accuracy: 0.5905956029891968


# Save the Model

In [19]:
model.save('classification_neural_network.h5')

# Get data put together for Visualization

In [20]:
# per the shapes above gathered the test observation counts
encoded_predictions = model.predict_classes(X_test_scaled[:1595])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [21]:
# Testing how it looks
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test[:1595]}")

Predicted classes: [0 0 0 ... 0 0 0]
Actual Labels: [[1]
 [1]
 [0]
 ...
 [0]
 [0]
 [0]]


In [22]:
# Put the lists above for Predicted into a DF to merge together
predicted_df = pd.DataFrame(prediction_labels)
predicted_df.reset_index(inplace=True)
predicted_df2 = predicted_df.rename(columns={0:"Predicted", 'index':'key'})
predicted_df2.head()

Unnamed: 0,key,Predicted
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [23]:
# Put the lists above for Actual into a DF to merge together
actual_df = pd.DataFrame(np.concatenate(y_test[:1595]))
actual_df.reset_index(inplace=True)
actual_df2 =actual_df.rename(columns={0:"Actual", 'index':'key'})
actual_df2.head()

Unnamed: 0,key,Actual
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0


In [24]:
# Grabbing X_test2 data to merge back to the original
test_movies_df = X_test
test_movies_df.reset_index(inplace=True)
test_movies_df.reset_index(inplace=True)
test_movies_df2 = test_movies_df.rename(columns={'index':'key','level_0':'key2'}) 
test_movies_df3 = test_movies_df2[["key", "key2"]]
test_movies_df3.head()

Unnamed: 0,key,key2
0,5861,0
1,4506,1
2,4443,2
3,2054,3
4,1497,4


In [25]:
# Grabbing original movies data
movies_df.reset_index(inplace=True)
movies_df1 = movies_df.rename(columns={'index':'key2'})
# movies_df2 = movies_df1[["key2", "original_title", "year", "duration"]]
movies_df1.head()

Unnamed: 0,key2,original_title,year
0,0,The Devil's Doorway,2018
1,1,The Reception,2005
2,2,Following,1998
3,3,El mariachi,1992
4,4,Primer,2004


In [26]:
# Merging the data to get the DF to load to CSV
merged_df0 = pd.merge(movies_df1, test_movies_df3, on="key2")
merged_df1 = pd.merge(merged_df0, actual_df2, on="key")
merged_df2 = pd.merge(merged_df1, predicted_df2, on="key")
merged_df2.head()

Unnamed: 0,key2,original_title,year,key,Actual,Predicted
0,4,Primer,2004,1497,0,1
1,8,Film Geek,2005,503,1,0
2,10,Viola,2012,607,0,1
3,28,Never Been Thawed,2005,354,1,0
4,36,Virgin,2003,1508,0,0


In [27]:
merged_final_df = merged_df2[["key", "key2", "original_title", "year", "Actual", "Predicted" ]]

## Save to CSV

In [28]:
# Save to file
merged_final_df.to_csv('profit_predict_vs_actual.csv', index=False)

# New Model Build On Rating

In [29]:
# read in the data
movies2 = pd.read_csv('moviesClean.csv')
movies_df = movies2[["original_title", "year"]]
movies2.head()

Unnamed: 0,original_title,year,genre,director,actors,budget,worlwide_gross_income,metascore,revenue_percent,Lalor Roddy,...,Robert Stromberg,Mark Andrews,Jordan Vogt-Roberts,Bradford Lewis,Dan Scanlon,Josh Cooley,Peter Sohn,Angus MacLane,Simon Kinberg,Nathan Greno
0,The Devil's Doorway,2018,Horror,Aislinn Clarke,"Lalor Roddy, Ciaran Flynn, Helena Bereen, Laur...",1.0,516660.0,48.0,51665900.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Reception,2005,"Comedy, Drama, Romance",John G. Young,"Maggie Burkwit, Chris Burmester, Darien Sills-...",5000.0,18389.0,64.0,267.78,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Following,1998,"Crime, Mystery, Thriller",Christopher Nolan,"Jeremy Theobald, Alex Haw, Lucy Russell, John ...",6000.0,48482.0,60.0,708.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,El mariachi,1992,"Action, Crime, Thriller",Robert Rodriguez,"Carlos Gallardo, Consuelo Gómez, Jaime de Hoyo...",7000.0,2040920.0,73.0,29056.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Primer,2004,"Drama, Sci-Fi, Thriller",Shane Carruth,"Shane Carruth, David Sullivan, Casey Gooden, A...",7000.0,545436.0,68.0,7691.94,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Categories from: https://www.metacritic.com/about-metascores#:~:text=Metacritic%20designates%20a%20movie%20as,section%20of%20the%20best%20critics..

![title](static/theme_pics/metascore.png)

In [30]:
# Verifying top scores in order to standardize
# test_df = movies.sort_values("metascore", ascending=False)
# test_score_df = test_df['metascore']
# test_score_df.head()

In [31]:
# Calculate Revenue % Column
# movies2['success_score'] = (movies['metascore'] / 100)

# movies2.head()

# Walk down the dataframe, movie by movie to get metascore categories
for index, row in movies2.iterrows():
    
    if row['metascore'] > 60:
        movies2.loc[index, 'Favorable to Great'] = 1
    else: 
        movies2.loc[index, 'Favorable to Great'] = 0

In [32]:
movies2.head()

Unnamed: 0,original_title,year,genre,director,actors,budget,worlwide_gross_income,metascore,revenue_percent,Lalor Roddy,...,Mark Andrews,Jordan Vogt-Roberts,Bradford Lewis,Dan Scanlon,Josh Cooley,Peter Sohn,Angus MacLane,Simon Kinberg,Nathan Greno,Favorable to Great
0,The Devil's Doorway,2018,Horror,Aislinn Clarke,"Lalor Roddy, Ciaran Flynn, Helena Bereen, Laur...",1.0,516660.0,48.0,51665900.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Reception,2005,"Comedy, Drama, Romance",John G. Young,"Maggie Burkwit, Chris Burmester, Darien Sills-...",5000.0,18389.0,64.0,267.78,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Following,1998,"Crime, Mystery, Thriller",Christopher Nolan,"Jeremy Theobald, Alex Haw, Lucy Russell, John ...",6000.0,48482.0,60.0,708.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,El mariachi,1992,"Action, Crime, Thriller",Robert Rodriguez,"Carlos Gallardo, Consuelo Gómez, Jaime de Hoyo...",7000.0,2040920.0,73.0,29056.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Primer,2004,"Drama, Sci-Fi, Thriller",Shane Carruth,"Shane Carruth, David Sullivan, Casey Gooden, A...",7000.0,545436.0,68.0,7691.94,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
movies2.drop(columns=['original_title', 'year', 'genre', 'director', 'actors',
                     'budget', 'worlwide_gross_income', 'metascore', 'success',
                     'revenue_percent' 
                       ], inplace=True)
movies2.shape

(6379, 10477)

In [34]:
# from sklearn.datasets import make_classification

X2 = movies2.drop('Favorable to Great', axis = 1)
y2 = movies2['Favorable to Great']

y2 = y2.values.reshape(-1, 1)

print(X2.shape, y2.shape)

(6379, 10476) (6379, 1)


In [35]:
# create training and testing data
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=1)

In [36]:
print(X_train2.shape, X_test2.shape)

(4784, 10476) (1595, 10476)


## Data Pre-Processing

In [37]:
# scale the data
from sklearn.preprocessing import StandardScaler

X_scaler2 = StandardScaler().fit(X_train2)

In [38]:
# scale both training and testing data
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

In [39]:
from tensorflow.keras.utils import to_categorical

In [40]:
# One-hot encoding
y_train_categorical2 = to_categorical(y_train2)
y_test_categorical2 = to_categorical(y_test2)
y_train_categorical2

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [41]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y2)
encoded_y = label_encoder.transform(y2)


  return f(**kwargs)


## Create the Model

In [42]:
# create a sequential model
from tensorflow.keras.models import Sequential

model2 = Sequential()

In [43]:
# add the first layer. The number of inputs must be equal to the
# number of columns
from tensorflow.keras.layers import Dense
number_inputs = 10476
number_hidden_nodes = 800
model2.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [44]:
# add the final layer. number_classes is the number of labels to predict.
number_classes = 2
model2.add(Dense(units=number_classes, activation='softmax'))

## Model Summary

In [45]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 800)               8381600   
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1602      
Total params: 8,383,202
Trainable params: 8,383,202
Non-trainable params: 0
_________________________________________________________________


## Compile the Model

In [46]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: your output layer in this example is using software for logistic regression (categorical)
# If your output layer activation was `linear` then you may want to use `mse` for loss
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## Train the Model

In [47]:
# Fit (train) the model
model2.fit(
    X_train_scaled2,
    y_train_categorical2,
    epochs=50,
    shuffle=True,
    verbose=2
)

Train on 4784 samples
Epoch 1/50
4784/4784 - 8s - loss: 1.7905 - accuracy: 0.5753
Epoch 2/50
4784/4784 - 7s - loss: 0.3794 - accuracy: 0.8691
Epoch 3/50
4784/4784 - 7s - loss: 0.1167 - accuracy: 0.9634
Epoch 4/50
4784/4784 - 7s - loss: 0.0764 - accuracy: 0.9776
Epoch 5/50
4784/4784 - 7s - loss: 0.0507 - accuracy: 0.9845
Epoch 6/50
4784/4784 - 8s - loss: 0.0330 - accuracy: 0.9900
Epoch 7/50
4784/4784 - 7s - loss: 0.0241 - accuracy: 0.9933
Epoch 8/50
4784/4784 - 7s - loss: 0.0189 - accuracy: 0.9933
Epoch 9/50
4784/4784 - 7s - loss: 0.0158 - accuracy: 0.9952
Epoch 10/50
4784/4784 - 8s - loss: 0.0140 - accuracy: 0.9944
Epoch 11/50
4784/4784 - 7s - loss: 0.0116 - accuracy: 0.9956
Epoch 12/50
4784/4784 - 7s - loss: 0.0117 - accuracy: 0.9950
Epoch 13/50
4784/4784 - 7s - loss: 0.0105 - accuracy: 0.9952
Epoch 14/50
4784/4784 - 7s - loss: 0.0101 - accuracy: 0.9954
Epoch 15/50
4784/4784 - 7s - loss: 0.0095 - accuracy: 0.9956
Epoch 16/50
4784/4784 - 7s - loss: 0.0086 - accuracy: 0.9944
Epoch 17/50

<tensorflow.python.keras.callbacks.History at 0x23a521eee10>

## Quantify/Test the Model

In [48]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model2.evaluate(
    X_test_scaled2, y_test_categorical2, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1595/1595 - 1s - loss: 2.0783 - accuracy: 0.6113
Loss: 2.078326098149099, Accuracy: 0.6112852692604065


## Save the Model

In [49]:
model2.save('classification_neural_network2.h5')

## Gather Predictions and Actuals on test data for visualization

In [50]:
# per the shapes above gathered the test observation counts
encoded_predictions = model2.predict_classes(X_test_scaled2[:1595])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [51]:
# Testing how it looks
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test2[:1595]}")

Predicted classes: [1. 1. 0. ... 1. 0. 0.]
Actual Labels: [[1.]
 [1.]
 [0.]
 ...
 [0.]
 [1.]
 [0.]]


In [52]:
# Put the lists above for Predicted into a DF to merge together
predicted_df = pd.DataFrame(prediction_labels)
predicted_df.reset_index(inplace=True)
predicted_df2 = predicted_df.rename(columns={0:"Predicted", 'index':'key'})
predicted_df2.head()

Unnamed: 0,key,Predicted
0,0,1.0
1,1,1.0
2,2,0.0
3,3,1.0
4,4,1.0


In [53]:
# Put the lists above for Actual into a DF to merge together
actual_df = pd.DataFrame(np.concatenate(y_test2[:1595]))
actual_df.reset_index(inplace=True)
actual_df2 =actual_df.rename(columns={0:"Actual", 'index':'key'})
actual_df2.head()

Unnamed: 0,key,Actual
0,0,1.0
1,1,1.0
2,2,0.0
3,3,0.0
4,4,0.0


In [54]:
# Grabbing X_test2 data to merge back to the original
test_movies_df = X_test2
test_movies_df.reset_index(inplace=True)
test_movies_df.reset_index(inplace=True)
test_movies_df2 = test_movies_df.rename(columns={'index':'key','level_0':'key2'}) 
test_movies_df3 = test_movies_df2[["key", "key2"]]
test_movies_df3.head()

Unnamed: 0,key,key2
0,5861,0
1,4506,1
2,4443,2
3,2054,3
4,1497,4


In [55]:
# Grabbing original movies data
movies_df.reset_index(inplace=True)
movies_df1 = movies_df.rename(columns={'index':'key2'})
# movies_df2 = movies_df1[["key2", "original_title", "year", "duration"]]
movies_df1.head()

Unnamed: 0,key2,original_title,year
0,0,The Devil's Doorway,2018
1,1,The Reception,2005
2,2,Following,1998
3,3,El mariachi,1992
4,4,Primer,2004


In [56]:
# Merging the data to get the DF to load to CSV
merged_df0 = pd.merge(movies_df1, test_movies_df3, on="key2")
merged_df1 = pd.merge(merged_df0, actual_df2, on="key")
merged_df2 = pd.merge(merged_df1, predicted_df2, on="key")
merged_df2.head()

Unnamed: 0,key2,original_title,year,key,Actual,Predicted
0,4,Primer,2004,1497,1.0,1.0
1,8,Film Geek,2005,503,0.0,0.0
2,10,Viola,2012,607,0.0,1.0
3,28,Never Been Thawed,2005,354,1.0,0.0
4,36,Virgin,2003,1508,1.0,0.0


In [57]:
merged_final_df = merged_df2[["key", "key2", "original_title", "year", "Actual", "Predicted" ]]

## Save to CSV

In [58]:
# Save to file
merged_final_df.to_csv('metascore_predict_vs_actual.csv', index=False)