In [60]:
# import any and all necessary packages
import os
import pandas as pd
import numpy as np
import operator
from scipy.stats import randint
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# load in the dataframe
df = pd.read_csv('/content/drive/MyDrive/NLP/Final Project/processed_char_dialogue.csv')
df.drop(columns=df.columns[0], axis=1, inplace=True)
df.head()

Unnamed: 0,title,character,dialogue
0,Avatar,JAKE,"They can fix a spinal, if you've got the money..."
1,Avatar,JAKE,I became a Marine for the hardship. To be hamm...
2,Avatar,JAKE,Let's get it straight up front. I don't want y...
3,Avatar,JAKE,"You want a fair deal, you're on the wrong plan..."
4,Avatar,JAKE,It's just the way things are. And nobody does ...


In [3]:
# filter to only the top 50 characters by lines of dialogue
table = df.groupby(['character']).count()
table_50 = table.sort_values(by='dialogue', ascending=False).head(50)
df = df[df['character'].isin(table_50.index)].reset_index(drop=True)

In [4]:
# create two dictionaries to convert character names to and from their IDs
char_list = df['character'].unique()
class_ids = []
class_to_id_dict = {}
id_to_class_dict = {}
for i, char in enumerate(char_list):
  class_ids.append(i)
  class_to_id_dict[char] = i
  id_to_class_dict[i] = char

In [5]:
# map character names to IDs
df['character'] = df['character'].map(class_to_id_dict)

In [6]:
# define X and Y
X, y = df['dialogue'].values, df['character'].values

In [7]:
# Vectorize the dialogue using the TF-IDF vectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

X = tfidf.fit_transform(X).toarray()
print("Each of the %d lines of dialogue is represented by %d features (TF-IDF score of unigrams and bigrams)" %(X.shape))

Each of the 13256 lines of dialogue is represented by 2490 features (TF-IDF score of unigrams and bigrams)


In [8]:
# fix class imbalance using SMOTE to create synthetic data
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [9]:
# Find the three most correlated unigrams and bigrams with each of the characters
N = 3
for Class, ID in sorted(class_to_id_dict.items()):
  features_chi2 = chi2(X, y == ID)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names_out())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("===> %s:" %(Class))
  print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
  print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))

===> ALAN:
  * Most Correlated Unigrams are: tracy, vick, doug
  * Most Correlated Bigrams are: chastity smiles, vick nods, oh god
===> ANAKIN:
  * Most Correlated Unigrams are: mom, padme, master
  * Most Correlated Bigrams are: don don, obi wan, sorry master
===> ANNA:
  * Most Correlated Unigrams are: olaf, kristoff, elsa
  * Most Correlated Bigrams are: north mountain, time forever, frozen lee
===> BATMAN:
  * Most Correlated Unigrams are: lucius, alfred, bane
  * Most Correlated Bigrams are: trying kill, wouldn want, bruce wayne
===> BELLA:
  * Most Correlated Unigrams are: prom, mom, edward
  * Most Correlated Bigrams are: trying kill, sounds like, did know
===> BENDER:
  * Most Correlated Unigrams are: question, genius, claire
  * Most Correlated Bigrams are: old man, guy like, yeah got
===> C-3PO:
  * Most Correlated Unigrams are: chewbacca, goodness, artoo
  * Most Correlated Bigrams are: princess leia, oh dear, master luke
===> CARL:
  * Most Correlated Unigrams are: carl, el

In [9]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state = 0)

In [11]:
# define 5 different models to build and test
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    CalibratedClassifierCV(LinearSVC()),
    MultinomialNB(),
    LogisticRegression(random_state=0, max_iter=1000),
]
# perform 5 fold cross-validation on each model
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [12]:
# view the accuracies of each fold
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.180285
1,RandomForestClassifier,1,0.201946
2,RandomForestClassifier,2,0.192477
3,RandomForestClassifier,3,0.202594
4,RandomForestClassifier,4,0.196368
5,CalibratedClassifierCV,0,0.501816
6,CalibratedClassifierCV,1,0.528275
7,CalibratedClassifierCV,2,0.59572
8,CalibratedClassifierCV,3,0.60013
9,CalibratedClassifierCV,4,0.613489


In [10]:
# once more, fit the data to the highest performing model
model = CalibratedClassifierCV(LinearSVC())
trained_model = model.fit(X_train, y_train)

In [43]:
import pickle

# Save the model to a file using Pickle
with open('char_class_model.pkl', 'wb') as f:
    pickle.dump(trained_model, f)