# Code to Fine-Tune/Create Traditional Models
## This file aims to produce fine-tuned Logistic Regression, Naive Bayes, and Random Forest models that can predict Rotten Tomatoes scores based on movie scripts.
### Produced by Meghan O'Keefe, Lily Scott, Daisy Li

In [3]:
# library imports
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
import autograd.numpy as np
from autograd import grad 
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
import nltk

# MULTICLASS CLASSIFICATION

## CLEAN AND VECTORIZE DATA

In [2]:
# Load the score table
df = pd.read_csv('modified_all_rt_scores.csv')

# Look up each title's script in the raw-script folders (searched in order, first hit
# wins). Titles with no matching script file get None so they can be discarded below.
script_texts = []
for title in df['IMSDB_Title']:
    script_body = None
    for folder in ['Saved_Scripts_Raw/scripts-1', 'Saved_Scripts_Raw/scripts-2']:
        try:
            with open(f'{folder}/{title}.txt', 'r', encoding='utf-8') as file:
                script_body = file.read()
        except FileNotFoundError:
            # Not in this folder; try the next one.
            continue
        break
    script_texts.append(script_body)

df['Script'] = script_texts

# Drop the rows for which no script file was found.
df = df.dropna(subset=['Script'])

# We used several tools to try and optimize our approach: standardizing all text to lowercase,
# removing stop words and punctuation, and implementing a stemming approach.

# Holds the single (punctuation table, stop-word set, stemmer) tuple once it has been built.
# Building these once instead of on every preprocess_text call avoids reloading the NLTK
# stop-word list and re-creating the stemmer for every script.
_PREPROCESS_RESOURCES = []


def _get_preprocess_resources():
    """Return the shared (punctuation table, stop-word set, stemmer), building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything before storing, so a failure (e.g. the stop-word corpus is
        # missing) never leaves a half-initialized cache behind.
        resources = (
            str.maketrans('', '', string.punctuation),
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
        _PREPROCESS_RESOURCES.append(resources)
    return _PREPROCESS_RESOURCES[0]


def preprocess_text(text):
    """Normalize a raw script into a space-separated string of stemmed tokens.

    The steps run in this order: lowercase the text, delete every character in
    ``string.punctuation``, split on whitespace, drop tokens found in the NLTK
    English stop-word list, and Porter-stem the remaining tokens.

    Args:
        text: The raw script text. (Inside this function the parameter name
            shadows the ``sklearn.feature_extraction.text`` module imported
            at the top of the file.)

    Returns:
        The processed tokens joined by single spaces.
    """
    punctuation_table, stop_words, stemmer = _get_preprocess_resources()
    cleaned = text.lower().translate(punctuation_table)
    # NOTE(review): punctuation is stripped before stop-word filtering, so a contraction
    # such as "don't" is compared as "dont" -- confirm whether such tokens are meant to
    # survive the stop-word filter.
    return ' '.join(stemmer.stem(word) for word in cleaned.split() if word not in stop_words)

# Clean and stem every script once up front to narrow the computational time required later
processed_scripts = df['Script'].apply(preprocess_text)
df['Processed_Script'] = processed_scripts

# Encode the categories as class numbers (rotten=0, fresh=1, certified fresh=2)
# so the labels are not in string form
category_to_label = {
    category: label
    for label, category in enumerate(['rotten', 'fresh', 'certified fresh'])
}
df['CriticScoreLabel'] = df['CriticScoreCategory'].map(category_to_label)


In [7]:
# Display the DataFrame to check that it contains the expected columns and values
df

Unnamed: 0.1,Unnamed: 0,IMSDB_Title,RT_Title,CriticScore,AudienceScore,CriticScoreCategory,Script,Processed_Script,CriticScoreLabel
1,2,alien-3,ALIEN-3,48.0,47.0,rotten,\n\t\t\tAlien III\n\n\t\tScreenplay by John Fa...,alien iii screenplay john fasano stori vincent...,0
2,4,american-milkshake,MILKSHAKE,0.0,42.0,rotten,\n\n\n\n\n\n\n\n\n ...,american milkshak written david andalman cowri...,0
3,6,american-werewolf-in-london,AN-AMERICAN-WEREWOLF-IN-LONDON,89.0,85.0,certified fresh,"\n\n\n\n\n""An American Werewolf in London"" -- ...",american werewolf london john landi fade 1 man...,2
4,10,austin-powers---international-man-of-mystery,AUSTIN-POWERS:-INTERNATIONAL-MAN-OF-MYSTERY,73.0,77.0,fresh,\n\n\n\n\n\n\nAustin Powers: International Man...,austin power intern man mysteri mike myer aust...,1
5,11,austin-powers---the-spy-who-shagged-me,AUSTIN-POWERS:-THE-SPY-WHO-SHAGGED-ME,53.0,71.0,rotten,\n\n\n\n\n\n\nAUSTIN POWERS: THE SPY WHO SHAGG...,austin power spi shag austin power spi shag mi...,0
...,...,...,...,...,...,...,...,...,...
1269,1332,Yes-Man,Yes-Man,46.0,66.0,rotten,\n \n\n \n\n \n\n ...,ye man written nichola stoller base book ye ma...,0
1270,1333,You-Can-Count-On-Me,You-Can-Count-On-Me,95.0,88.0,certified fresh,"\n ""YOU CAN C...",count screenplay kenneth lonergan shoot draft ...,2
1273,1336,Zero-Dark-Thirty,Zero-Dark-Thirty,91.0,80.0,certified fresh,\n\n\n\n\n \n\n ...,zero dark thirti written mark boal octob 3rd 2...,2
1274,1337,Zerophilia,Zerophilia,25.0,61.0,rotten,\n\n\n\n\n\n\n\n\n ...,zerophilia written martin curland revis march ...,0


In [11]:
# Features are the pre-processed scripts; targets are the integer critic-score labels
X, Y = df['Processed_Script'], df['CriticScoreLabel']

# Hold out a test set. We found that 0.33 was the most successful split;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# Build the vectorizer + model pipeline. Despite the step key 'tfidf' and the variable
# name pipeline_rf, this is a CountVectorizer (1- to 4-grams, min_df of 5) feeding a
# one-vs-rest LogisticRegression. The model step can be replaced with MultinomialNB or
# RandomForestClassifier.
ngram_counter = CountVectorizer(ngram_range=(1, 4), min_df=5)
logistic_model = LogisticRegression(
    C=0.03359818286283781,
    max_iter=5000,
    solver='liblinear',
    multi_class='ovr',
)
pipeline_rf = Pipeline(steps=[('tfidf', ngram_counter), ('lr', logistic_model)])

# Train on the training split, then predict labels for the held-out scripts
Y_pred = pipeline_rf.fit(X_train, Y_train).predict(X_test)

# Evaluation: print the classification report for the test split
print(classification_report(Y_test, Y_pred))


              precision    recall  f1-score   support

           0       0.53      0.42      0.47       106
           1       0.17      0.12      0.14        40
           2       0.54      0.68      0.60       126

    accuracy                           0.50       272
   macro avg       0.41      0.41      0.40       272
weighted avg       0.48      0.50      0.48       272

