In [None]:
# Set the seed values for the notebook so results are reproducible.
# Seeding NumPy alone is not enough: the Keras models trained below draw
# their initial weights from TensorFlow's random generator, so TensorFlow
# is seeded with the same value as well.
from numpy.random import seed
import tensorflow as tf

seed(42)
tf.random.set_seed(42)

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# read in the data
movies = pd.read_csv('moviesClean.csv')
# Keep the descriptive columns in a separate frame; they are dropped from
# `movies` before modelling and are needed again when results are exported.
movies_df = movies[["original_title", "year", "duration"]]

# preview the first rows of the full data set
movies.head()

In [None]:
# Remove the columns that are not used as model inputs, modifying the
# `movies` frame itself, then report the remaining (rows, columns).
movies.drop(
    labels=[
        'original_title', 'genre', 'country', 'language',
        'revenue_percent', 'budget', 'worlwide_gross_income',
        'director', 'writer', 'production_company', 'actors',
    ],
    axis='columns',
    inplace=True,
)
movies.shape

In [None]:
# Features: every remaining column except the 'success' target.
X = movies.drop(columns='success')

# Target: the 'success' column as an (n, 1) column vector.
y = movies['success'].values.reshape(-1, 1)

print(X.shape, y.shape)

In [None]:
# create training and testing data
# No test_size/train_size is given, so scikit-learn's default split
# proportions apply; random_state=1 makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Data Pre-Processing

In [None]:
# scale the data
# The scaler is fitted on the training features only; the fitted object is
# kept so the same transformation can be applied to the test features.
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

In [None]:
# Apply the already-fitted scaler to the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the training and the testing labels, then display the
# encoded training labels.
y_train_categorical, y_test_categorical = (
    to_categorical(labels) for labels in (y_train, y_test)
)
y_train_categorical

In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# `y` is an (n, 1) column vector, while LabelEncoder works on 1-D label
# arrays, so the labels are flattened before fitting. The fitted encoder is
# reused later to map predicted class indices back to label values.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y.ravel())

# Create the Model

In [None]:
# create a sequential model
# The model starts empty; its layers are added one at a time afterwards.
from tensorflow.keras.models import Sequential

model = Sequential()

In [None]:
# add the first layer. The number of inputs must be equal to the
# number of feature columns, so it is read from the scaled training data
# rather than hard-coded; the layer then stays valid if the set of columns
# in the data changes.
from tensorflow.keras.layers import Dense
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 10
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [None]:
# add the final layer. number_classes is the number of labels to predict.
# The softmax activation produces one probability per class.
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

# Model Summary

In [None]:
# Print the layers of the model together with their parameter counts.
model.summary()

# Compile the Model

In [None]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: the output layer in this example uses softmax, i.e. a categorical (classification) output
# If your output layer activation was `linear` then you may want to use `mse` for loss
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the Model

In [None]:
# Fit (train) the model
# Trains on the scaled training features against the one-hot encoded
# labels for 1000 epochs; shuffle=True reshuffles the training data before
# each epoch and verbose=2 logs one line per epoch.
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

# Quantify/Test the Model

In [None]:
# Evaluate the model using the testing data
# evaluate() returns the loss followed by the metrics the model was
# compiled with, which is unpacked here as loss and accuracy.
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Save the Model

In [None]:
# Write the trained model to an HDF5 (.h5) file in the working directory.
model.save('classification_neural_network.h5')

# Assemble Data for Visualization

In [None]:
# Predict a class index for every test observation.
# `predict_classes` has been removed from Keras models in newer TensorFlow
# releases; taking the argmax of the softmax probabilities returned by
# `predict` gives the same class indices and works on every version.
# The whole scaled test set is used instead of a hard-coded row count.
encoded_predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
# Map the class indices back to the original label values.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
# Testing how it looks
# Print the predicted labels and the first 1595 true test labels for a
# quick visual comparison.
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test[:1595]}")

In [None]:
# Wrap the predicted labels in a DataFrame and expose the 0-based row
# position as a regular column, so it can act as the merge key.
predicted_df = pd.DataFrame(prediction_labels).reset_index()
predicted_df2 = predicted_df.rename(columns={'index': 'key', 0: 'Predicted'})
predicted_df2.head()

In [None]:
# Flatten the true test labels into one column and expose the 0-based row
# position as a regular column, so it can act as the merge key.
actual_df = pd.DataFrame(np.concatenate(y_test[:1595])).reset_index()
actual_df2 = actual_df.rename(columns={'index': 'key', 0: 'Actual'})
actual_df2.head()

In [None]:
# Build the lookup that links each test row back to the original movies.
#   key  - 0-based position of the row inside the test set; this is what
#          the predicted/actual frames use as their "key".
#   key2 - the row's index label inherited from the movies data; this is
#          what the movies frame uses as its "key2".
# Resetting the index twice and renaming 'index' -> 'key' and
# 'level_0' -> 'key2' assigned these the wrong way round (the inherited
# label ended up in "key" and the position in "key2"), which made the merges
# pair titles with the wrong predictions. The lookup is now built
# explicitly, and X_test is no longer modified in place, so the cell can be
# re-run safely.
test_movies_df = X_test
test_movies_df2 = pd.DataFrame({
    "key": np.arange(len(test_movies_df)),
    "key2": test_movies_df.index.to_numpy(),
})
test_movies_df3 = test_movies_df2[["key", "key2"]]
test_movies_df3.head()

In [None]:
# Grabbing original movies data
# Expose the row index of the descriptive movie columns as "key2". A new
# frame is derived instead of resetting `movies_df` in place, so `movies_df`
# stays untouched and re-running this cell always gives the same result
# (the in-place version accumulated extra index columns on every run).
movies_df1 = movies_df.reset_index().rename(columns={'index': 'key2'})
movies_df1.head()

In [None]:
# Join the pieces step by step to get the frame that is exported to CSV:
# movie details -> test-row lookup (on key2) -> actual labels (on key)
# -> predicted labels (on key).
merged_df0 = movies_df1.merge(test_movies_df3, on="key2")
merged_df1 = merged_df0.merge(actual_df2, on="key")
merged_df2 = merged_df1.merge(predicted_df2, on="key")
merged_df2.head()

In [None]:
# Keep only the columns wanted in the export, in their final order.
merged_final_df = merged_df2.loc[
    :, ["key", "key2", "original_title", "year", "duration", "Actual", "Predicted"]
]

## Save to CSV

In [None]:
# Save to file
# index=False leaves the DataFrame's own index out of the CSV.
merged_final_df.to_csv('profit_predict_vs_actual.csv', index=False)

In [None]:
# Optional: reload the saved model from disk (left disabled).
# # Load the model
# from tensorflow.keras.models import load_model
# voice_model = load_model("classification_neural_network.h5")

# New Model Built on Rating

In [None]:
# read in the data
# The CSV is read again into a fresh frame for the second model.
movies2 = pd.read_csv('moviesClean.csv')
# Descriptive columns kept aside for the export; this rebinds `movies_df`.
movies_df = movies2[["original_title", "year", "duration"]]
movies2.head()

Categories from: https://www.metacritic.com/about-metascores#:~:text=Metacritic%20designates%20a%20movie%20as,section%20of%20the%20best%20critics..

![title](static/theme_pics/metascore.png)

In [None]:
# Verifying top scores in order to standardize
# (exploratory check, left disabled)
# test_df = movies.sort_values("metascore", ascending=False)
# test_score_df = test_df['metascore']
# test_score_df.head()

In [None]:
# Derive the metascore category used as the target of the second model:
# 1.0 when the metascore is above 60, otherwise 0.0 (a missing metascore
# also yields 0.0, because the comparison is False for NaN).
# One vectorised comparison replaces the row-by-row iterrows() loop, which
# is very slow on a wide frame; the result is stored as floats, matching
# the values the loop produced.
movies2['Favorable to Great'] = (movies2['metascore'] > 60).astype(float)

In [None]:
# Preview the frame, now including the 'Favorable to Great' column.
movies2.head()

In [None]:
# Remove the columns that are not used as model inputs, modifying the
# `movies2` frame itself, then report the remaining (rows, columns).
movies2.drop(
    labels=[
        'original_title', 'genre', 'country', 'language',
        'revenue_percent', 'metascore', 'worlwide_gross_income',
        'director', 'writer', 'production_company', 'actors',
    ],
    axis='columns',
    inplace=True,
)
movies2.shape

In [None]:
# Features: every remaining column except the 'Favorable to Great' target.
X2 = movies2.drop(columns='Favorable to Great')

# Target: the 'Favorable to Great' column as an (n, 1) column vector.
y2 = movies2['Favorable to Great'].values.reshape(-1, 1)

print(X2.shape, y2.shape)

In [None]:
# create training and testing data
# No test_size/train_size is given, so scikit-learn's default split
# proportions apply; random_state=1 makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=1)

In [None]:
# Show the (rows, columns) of the training and testing feature sets.
print(X_train2.shape, X_test2.shape)

## Data Pre-Processing

In [None]:
# scale the data
# The scaler is fitted on the training features only; the fitted object is
# kept so the same transformation can be applied to the test features.
from sklearn.preprocessing import StandardScaler

X_scaler2 = StandardScaler().fit(X_train2)

In [None]:
# Apply the already-fitted scaler to the training and the testing features.
X_train_scaled2, X_test_scaled2 = (
    X_scaler2.transform(features) for features in (X_train2, X_test2)
)

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the training and the testing labels, then display the
# encoded training labels.
y_train_categorical2, y_test_categorical2 = (
    to_categorical(labels) for labels in (y_train2, y_test2)
)
y_train_categorical2

In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# `y2` is an (n, 1) column vector, while LabelEncoder works on 1-D label
# arrays, so the labels are flattened before fitting. This rebinds
# `label_encoder` to an encoder fitted on the metascore category labels.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y2.ravel())


## Create the Model

In [None]:
# create a sequential model
# The model starts empty; its layers are added one at a time afterwards.
from tensorflow.keras.models import Sequential

model2 = Sequential()

In [None]:
# add the first layer. The number of inputs must be equal to the
# number of feature columns, so it is read from the scaled training data
# rather than hard-coded; the layer then stays valid if the set of columns
# in the data changes.
from tensorflow.keras.layers import Dense
number_inputs = X_train_scaled2.shape[1]
number_hidden_nodes = 10
model2.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [None]:
# add the final layer. number_classes is the number of labels to predict.
# The softmax activation produces one probability per class.
number_classes = 2
model2.add(Dense(units=number_classes, activation='softmax'))

## Model Summary

In [None]:
# Print the layers of the model together with their parameter counts.
model2.summary()

## Compile the Model

In [None]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: the output layer in this example uses softmax, i.e. a categorical (classification) output
# If your output layer activation was `linear` then you may want to use `mse` for loss
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## Train the Model

In [None]:
# Fit (train) the model
# Trains on the scaled training features against the one-hot encoded
# labels for 1000 epochs; shuffle=True reshuffles the training data before
# each epoch and verbose=2 logs one line per epoch.
model2.fit(
    X_train_scaled2,
    y_train_categorical2,
    epochs=1000,
    shuffle=True,
    verbose=2
)

## Quantify/Test the Model

In [None]:
# Evaluate the model using the testing data
# evaluate() returns the loss followed by the metrics the model was
# compiled with, which is unpacked here as loss and accuracy.
# Note: this rebinds model_loss / model_accuracy to the second model's values.
model_loss, model_accuracy = model2.evaluate(
    X_test_scaled2, y_test_categorical2, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Save the Model

In [None]:
# Write the trained model to an HDF5 (.h5) file in the working directory.
model2.save('classification_neural_network2.h5')

## Gather Predictions and Actuals on test data for visualization

In [None]:
# Predict a class index for every test observation.
# `predict_classes` has been removed from Keras models in newer TensorFlow
# releases; taking the argmax of the softmax probabilities returned by
# `predict` gives the same class indices and works on every version.
# The whole scaled test set is used instead of a hard-coded row count.
encoded_predictions = np.argmax(model2.predict(X_test_scaled2), axis=-1)
# Map the class indices back to the original label values.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
# Testing how it looks
# Print the predicted labels and the first 1595 true test labels for a
# quick visual comparison.
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {y_test2[:1595]}")

In [None]:
# Wrap the predicted labels in a DataFrame and expose the 0-based row
# position as a regular column, so it can act as the merge key.
predicted_df = pd.DataFrame(prediction_labels).reset_index()
predicted_df2 = predicted_df.rename(columns={'index': 'key', 0: 'Predicted'})
predicted_df2.head()

In [None]:
# Flatten the true test labels into one column and expose the 0-based row
# position as a regular column, so it can act as the merge key.
actual_df = pd.DataFrame(np.concatenate(y_test2[:1595])).reset_index()
actual_df2 = actual_df.rename(columns={'index': 'key', 0: 'Actual'})
actual_df2.head()

In [None]:
# Build the lookup that links each test row back to the original movies.
#   key  - 0-based position of the row inside the test set; this is what
#          the predicted/actual frames use as their "key".
#   key2 - the row's index label inherited from the movies data; this is
#          what the movies frame uses as its "key2".
# Resetting the index twice and renaming 'index' -> 'key' and
# 'level_0' -> 'key2' assigned these the wrong way round (the inherited
# label ended up in "key" and the position in "key2"), which made the merges
# pair titles with the wrong predictions. The lookup is now built
# explicitly, and X_test2 is no longer modified in place, so the cell can be
# re-run safely.
test_movies_df = X_test2
test_movies_df2 = pd.DataFrame({
    "key": np.arange(len(test_movies_df)),
    "key2": test_movies_df.index.to_numpy(),
})
test_movies_df3 = test_movies_df2[["key", "key2"]]
test_movies_df3.head()

In [None]:
# Grabbing original movies data
# Expose the row index of the descriptive movie columns as "key2". A new
# frame is derived instead of resetting `movies_df` in place, so `movies_df`
# stays untouched and re-running this cell always gives the same result
# (the in-place version accumulated extra index columns on every run).
movies_df1 = movies_df.reset_index().rename(columns={'index': 'key2'})
movies_df1.head()

In [None]:
# Join the pieces step by step to get the frame that is exported to CSV:
# movie details -> test-row lookup (on key2) -> actual labels (on key)
# -> predicted labels (on key).
merged_df0 = movies_df1.merge(test_movies_df3, on="key2")
merged_df1 = merged_df0.merge(actual_df2, on="key")
merged_df2 = merged_df1.merge(predicted_df2, on="key")
merged_df2.head()

In [None]:
# Keep only the columns wanted in the export, in their final order.
merged_final_df = merged_df2.loc[
    :, ["key", "key2", "original_title", "year", "duration", "Actual", "Predicted"]
]

## Save to CSV

In [None]:
# Save to file
# index=False leaves the DataFrame's own index out of the CSV.
merged_final_df.to_csv('metascore_predict_vs_actual.csv', index=False)