In [63]:
import pandas as pd
import re
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support
from collections import defaultdict
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [64]:
bbt_bert_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Big Bang Theory/test_BBT_Hidden.csv')
bbt_bert_test

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
0,0,"What do you see, what do you see.",LEONARD,SHELDON
1,1,"So, what happens next?",OTHERS,LEONARD
2,2,"Okay. 1, 2, 1, 2, 3 and…",HOWARD,LEONARD
3,3,"Sheldon, six bucks.",LEONARD,LEONARD
4,4,But it’s the safest crib you can buy. And if y...,HOWARD,RAJ
...,...,...,...,...
4613,4613,What?,LEONARD,LEONARD
4614,4614,Okay. How about that one.,RAJ,LEONARD
4615,4615,Hey.,OTHERS,LEONARD
4616,4616,"Okay, okay. Guys, what are we doing here?",LEONARD,PENNY


In [65]:
precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, average='macro')

(0.361348206637943, 0.329852267024663, 0.31654807778691213, None)

In [66]:
precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, average='weighted')

(0.3747828443918449, 0.3852317020355132, 0.3511515449455602, None)

In [67]:
names={0:'SHELDON', 1:'LEONARD', 2:'PENNY', 3:'HOWARD',4:'RAJ',5:"OTHERS"}
names_rev={'SHELDON':0, 'LEONARD':1, 'PENNY':2, 'HOWARD':3,'RAJ':4,"OTHERS":5}

In [68]:
models = ['DistilBERT', 'Logistic Regression', 'Naive Bayes', 'Support Vector Machine']

In [69]:
bbt_names_keys = list(names_rev.keys())

In [70]:
def print_none_prf1(bert_test):
  res = precision_recall_fscore_support(bert_test.labels, bert_test.predicted_label, labels=list(names_rev.keys()), average=None)
  class_count = len(res[0])
  print('class  precision  recall  f1_score')
  for i in range(class_count):
    print(f'{names[i]}  {res[0][i]}  {res[1][i]}  {res[2][i]}')

In [71]:
precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, labels=list(names_rev.keys()), average=None)

(array([0.47924957, 0.30803907, 0.35849057, 0.30693069, 0.30456853,
        0.41081081]),
 array([0.74077329, 0.42931937, 0.35897436, 0.21830986, 0.13186813,
        0.09986859]),
 array([0.58198136, 0.35870516, 0.3587323 , 0.25514403, 0.18404908,
        0.16067653]),
 array([1138,  955,  741,  568,  455,  761]))

In [72]:
print_none_prf1(bbt_bert_test)

class  precision  recall  f1_score
SHELDON  0.47924957362137577  0.7407732864674869  0.5819813600276148
LEONARD  0.3080390683696469  0.4293193717277487  0.35870516185476814
PENNY  0.3584905660377358  0.358974358974359  0.358732299393122
HOWARD  0.3069306930693069  0.21830985915492956  0.2551440329218107
RAJ  0.30456852791878175  0.13186813186813187  0.18404907975460125
OTHERS  0.41081081081081083  0.09986859395532194  0.160676532769556


In [73]:
def print_none_prf1(trad_test):
  res = precision_recall_fscore_support(trad_test.label, trad_test.predict_label, labels=list(names_rev.keys()), average=None)
  class_count = len(res[0])
  print('class  precision  recall  f1_score')
  for i in range(class_count):
    print(f'{names[i]}  {res[0][i]}  {res[1][i]}  {res[2][i]}')

In [74]:
bbt_tfidf_lr_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/Baseline_Results/BBT_LR_TFIDF.csv')
bbt_tfidf_lr_test

Unnamed: 0.1,Unnamed: 0,dialogue,label,predict_label
0,0,the… picture’s breaking,RAJ,HOWARD
1,1,sharp thanks machine saw earlier,PENNY,SHELDON
2,2,go oh,HOWARD,LEONARD
3,3,keep thinking cool would call mom tell get ten...,LEONARD,OTHERS
4,4,sigma particle,SHELDON,SHELDON
...,...,...,...,...
5995,5995,five movie two hour apiece it’s start,LEONARD,LEONARD
5996,5996,okay argue,PENNY,HOWARD
5997,5997,fun story meanwhile still don’t car,HOWARD,SHELDON
5998,5998,dragon fall sky crash volcano,HOWARD,HOWARD


In [75]:
precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, average='weighted')

(0.35750248030313103, 0.37333333333333335, 0.33670217764447685, None)

In [76]:
print_none_prf1(bbt_tfidf_lr_test)

class  precision  recall  f1_score
SHELDON  0.4380358534684334  0.7200512491992312  0.5447055972861643
LEONARD  0.3020774845592364  0.4125766871165644  0.3487844408427877
PENNY  0.37910798122065725  0.3340227507755946  0.35514018691588783
HOWARD  0.30952380952380953  0.11772315653298836  0.17057169634489222
RAJ  0.3719512195121951  0.1026936026936027  0.16094986807387862
OTHERS  0.30029154518950435  0.1285892634207241  0.18006993006993008


In [77]:
bbt_tfidf_nb_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/Baseline_Results/BBT_NB_TFIDF.csv')
bbt_tfidf_nb_test

Unnamed: 0.1,Unnamed: 0,dialogue,label,predict_label
0,0,the… picture’s breaking,RAJ,OTHERS
1,1,sharp thanks machine saw earlier,PENNY,SHELDON
2,2,go oh,HOWARD,LEONARD
3,3,keep thinking cool would call mom tell get ten...,LEONARD,SHELDON
4,4,sigma particle,SHELDON,SHELDON
...,...,...,...,...
5995,5995,five movie two hour apiece it’s start,LEONARD,LEONARD
5996,5996,okay argue,PENNY,SHELDON
5997,5997,fun story meanwhile still don’t car,HOWARD,SHELDON
5998,5998,dragon fall sky crash volcano,HOWARD,HOWARD


In [78]:
precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, average='weighted')

(0.3980827626292342, 0.36616666666666664, 0.3067919224532932, None)

In [79]:
print_none_prf1(bbt_tfidf_nb_test)

class  precision  recall  f1_score
SHELDON  0.410507880910683  0.7508007687379885  0.5307971014492754
LEONARD  0.2933208780943484  0.4815950920245399  0.36458635703918724
PENNY  0.382183908045977  0.2750775594622544  0.3199037883343356
HOWARD  0.3902439024390244  0.062095730918499355  0.10714285714285714
RAJ  0.6216216216216216  0.03872053872053872  0.07290015847860538
OTHERS  0.40540540540540543  0.0749063670411985  0.12644889357218125


In [80]:
bbt_tfidf_svm_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/Baseline_Results/BBT_SVM_TFIDF.csv')
bbt_tfidf_svm_test

Unnamed: 0.1,Unnamed: 0,dialogue,label,predict_label
0,0,the… picture’s breaking,RAJ,SHELDON
1,1,sharp thanks machine saw earlier,PENNY,SHELDON
2,2,go oh,HOWARD,LEONARD
3,3,keep thinking cool would call mom tell get ten...,LEONARD,HOWARD
4,4,sigma particle,SHELDON,SHELDON
...,...,...,...,...
5995,5995,five movie two hour apiece it’s start,LEONARD,LEONARD
5996,5996,okay argue,PENNY,PENNY
5997,5997,fun story meanwhile still don’t car,HOWARD,SHELDON
5998,5998,dragon fall sky crash volcano,HOWARD,SHELDON


In [81]:
precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, average='weighted')

(0.3746431838659545, 0.37266666666666665, 0.3218610330788008, None)

In [82]:
print_none_prf1(bbt_tfidf_svm_test)

class  precision  recall  f1_score
SHELDON  0.4124391938846421  0.7604099935938501  0.5348051362919577
LEONARD  0.30267379679144385  0.4340490797546012  0.35664776307498425
PENNY  0.3934640522875817  0.3112719751809721  0.3475750577367206
HOWARD  0.3651685393258427  0.08408796895213454  0.13669821240799157
RAJ  0.43902439024390244  0.06060606060606061  0.10650887573964497
OTHERS  0.3568281938325991  0.10112359550561797  0.15758754863813226


In [83]:
frnd_bert_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Friends Data/test_Friends_Hidden.csv')
frnd_bert_test

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
0,0,"No, I didn’t!",JOEY,JOEY
1,1,Go get ‘em Treeger.,JOEY,JOEY
2,2,Hey!,JOEY,JOEY
3,3,Yeah. I think you have to draw him out. And th...,MONICA,ROSS
4,4,What the hell is that dog doing here?!,MONICA,JOEY
...,...,...,...,...
5120,5120,"Hey, cut him some slack. It was Chandler's idea.",JOEY,JOEY
5121,5121,Ooh! That’s good! Wow! But now if you were pre...,PHOEBE,PHOEBE
5122,5122,"Yeah, you have to go fight for her!",MONICA,JOEY
5123,5123,Oh I a lot of stuff!,ROSS,JOEY


In [84]:
precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, average='macro')

(0.29754261962078654, 0.2938889985588247, 0.2896304470682044, None)

In [85]:
precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, average='weighted')

(0.2988847570053169, 0.29521951219512194, 0.2910958918870876, None)

In [86]:
names={0:'MONICA', 1:'JOEY', 2:'CHANDLER', 3:'PHOEBE',4:'RACHEL',5:'ROSS',6:'Others'}
names_rev={'MONICA':0, 'JOEY':1, 'CHANDLER':2, 'PHOEBE':3,'RACHEL':4,'ROSS':5,'Others':6}

In [87]:
frnd_names_keys = list(names_rev.keys())

In [88]:
frnd_names_keys

['MONICA', 'JOEY', 'CHANDLER', 'PHOEBE', 'RACHEL', 'ROSS', 'Others']

In [89]:
def print_none_prf1(bert_test):
  res = precision_recall_fscore_support(bert_test.labels, bert_test.predicted_label, labels=list(names_rev.keys()), average=None)
  class_count = len(res[0])
  print('class  precision  recall  f1_score')
  for i in range(class_count):
    print(f'{names[i]}  {res[0][i]}  {res[1][i]}  {res[2][i]}')

In [90]:
print_none_prf1(frnd_bert_test)

class  precision  recall  f1_score
MONICA  0.2860520094562648  0.327027027027027  0.3051702395964691
JOEY  0.266025641025641  0.46111111111111114  0.33739837398373984
CHANDLER  0.2982456140350877  0.22941970310391363  0.2593440122044241
PHOEBE  0.3167808219178082  0.2811550151975684  0.29790660225442833
RACHEL  0.33236994219653176  0.2787878787878788  0.30323005932762026
ROSS  0.3223593964334705  0.2952261306532663  0.3081967213114754
Others  0.26096491228070173  0.18449612403100776  0.2161671207992734


In [91]:
data_clean = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/Datasets/Preprocessed Data/Friends/data_clean.csv')
data_clean.sample(frac=1)
data_clean

Unnamed: 0.1,Unnamed: 0,Episodes,Episode_Names,Characters,Lines,Season
0,0,101,Monica Gets A Roommate,MONICA,there's nothing tell he's guy work,1
1,1,101,Monica Gets A Roommate,JOEY,c'mon go guy there's gotta something wrong,1
2,2,101,Monica Gets A Roommate,CHANDLER,right joey nice hump hump hairpiece,1
3,3,101,Monica Gets A Roommate,PHOEBE,wait eat chalk,1
4,4,101,Monica Gets A Roommate,PHOEBE,cause want go go carl oh,1
...,...,...,...,...,...,...
49734,51761,1017-1018,"The Last One, Part I & II",CHANDLER,oh gonna okay,10
49735,51762,1017-1018,"The Last One, Part I & II",RACHEL,guy go new house right away time,10
49736,51763,1017-1018,"The Last One, Part I & II",MONICA,get time,10
49737,51764,1017-1018,"The Last One, Part I & II",RACHEL,okay get coffee,10


In [92]:
threshold = 7
names={0:'MONICA', 1:'JOEY', 2:'CHANDLER', 3:'PHOEBE',4:'RACHEL',5:'ROSS',6:'Others'}
names_rev={'MONICA':0, 'JOEY':1, 'CHANDLER':2, 'PHOEBE':3,'RACHEL':4,'ROSS':5,'Others':6}

data_thres = data_clean
data_thres['Characters']= data_clean['Characters'].apply(lambda x: 'Others' if (x not in names.values()) else x)

data_thres['target'] = data_thres.apply(lambda x: names_rev.get(x.Characters),axis=1)
data_thres[:10]

Unnamed: 0.1,Unnamed: 0,Episodes,Episode_Names,Characters,Lines,Season,target
0,0,101,Monica Gets A Roommate,MONICA,there's nothing tell he's guy work,1,0
1,1,101,Monica Gets A Roommate,JOEY,c'mon go guy there's gotta something wrong,1,1
2,2,101,Monica Gets A Roommate,CHANDLER,right joey nice hump hump hairpiece,1,2
3,3,101,Monica Gets A Roommate,PHOEBE,wait eat chalk,1,3
4,4,101,Monica Gets A Roommate,PHOEBE,cause want go go carl oh,1,3
5,5,101,Monica Gets A Roommate,MONICA,okay everybody relax even date two people go d...,1,0
6,6,101,Monica Gets A Roommate,CHANDLER,sound like date,1,2
7,7,101,Monica Gets A Roommate,CHANDLER,alright i'm back high school i'm stand middle ...,1,2
8,8,101,Monica Gets A Roommate,Others,oh yeah dream,1,6
9,9,101,Monica Gets A Roommate,CHANDLER,look realize there's phone,1,2


In [93]:
# split the clean text in the rows  into list of words
tokenized_data = data_thres["Lines"].apply(lambda text: re.split(' ',text))

target_thres = data_thres['target'] 
my_tags = data_thres['Characters'].unique()
my_tags

array(['MONICA', 'JOEY', 'CHANDLER', 'PHOEBE', 'Others', 'ROSS', 'RACHEL'],
      dtype=object)

In [94]:
# Create a BoW with Count Vectorizer 


def count_vectorizer(data):
    vectorizer = CountVectorizer(ngram_range = (1,2), min_df=10)
    # call `fit` to build the vocabulary
    vectorizer.fit(data)

    # call `transform` to convert text to a bag of words
    count_vectorizer_result  = vectorizer.transform(data)

    #convert to a numpy array to visualize as dataframe
    count_vectorizer_result = count_vectorizer_result.toarray()
    count_vectorizer_features = pd.DataFrame(count_vectorizer_result , columns = vectorizer.get_feature_names())
    count_vectorizer_features.index = data.index
    return count_vectorizer_features

count_vectorizer_df = count_vectorizer(data_thres['Lines'])
count_vectorizer_df.shape



(49739, 4406)

In [95]:
Count_vectorizer_x_train, Count_vectorizer_x_test, Count_vectorizer_y_train, Count_vectorizer_y_test = train_test_split(count_vectorizer_df,target_thres,test_size = 0.2,random_state =42)

Count_vectorizer_x_train.shape

(39791, 4406)

In [96]:
y_true = Count_vectorizer_y_test.apply(lambda x: names.get(x))
y_true

11296        ROSS
12037    CHANDLER
29567        JOEY
13370    CHANDLER
48113      PHOEBE
           ...   
42769        JOEY
44847      MONICA
48731      Others
37859        JOEY
21244      RACHEL
Name: target, Length: 9948, dtype: object

In [97]:
loaded_model = pickle.load(open('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/CV-Frnds/cv-lr-frnds.sav', 'rb'))

In [98]:
y_pred = pd.Series(loaded_model.predict(Count_vectorizer_x_test)).apply(lambda x: names.get(x))
y_pred

0         MONICA
1       CHANDLER
2       CHANDLER
3           JOEY
4       CHANDLER
          ...   
9943        JOEY
9944      RACHEL
9945      Others
9946        JOEY
9947      RACHEL
Length: 9948, dtype: object

In [99]:
frnd_cv_lr_test = pd.DataFrame({'label': y_true.values, 'predict_label': y_pred.values})
frnd_cv_lr_test

Unnamed: 0,label,predict_label
0,ROSS,MONICA
1,CHANDLER,CHANDLER
2,JOEY,CHANDLER
3,CHANDLER,JOEY
4,PHOEBE,CHANDLER
...,...,...
9943,JOEY,JOEY
9944,MONICA,RACHEL
9945,Others,Others
9946,JOEY,JOEY


In [100]:
precision_recall_fscore_support(y_true, y_pred, average='weighted')

(0.24654287594217836, 0.24708484117410534, 0.24530826700194694, None)

In [101]:
precision_recall_fscore_support(y_true, y_pred, labels=frnd_names_keys, average=None)

(array([0.21139706, 0.27127319, 0.24343832, 0.25989305, 0.27714286,
        0.2615597 , 0.19269406]),
 array([0.25555556, 0.30285714, 0.25962211, 0.18881119, 0.29919803,
        0.24967062, 0.15711095]),
 array([0.23138833, 0.28619642, 0.2512699 , 0.21872187, 0.28774844,
        0.25547691, 0.1730927 ]),
 array([1350, 1400, 1429, 1287, 1621, 1518, 1343]))

In [102]:
loaded_model = pickle.load(open('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/CV-Frnds/cv-nb-frnds.sav', 'rb'))

In [103]:
y_pred = pd.Series(loaded_model.predict(Count_vectorizer_x_test)).apply(lambda x: names.get(x))
y_pred

0         RACHEL
1       CHANDLER
2           ROSS
3           JOEY
4       CHANDLER
          ...   
9943        JOEY
9944      RACHEL
9945    CHANDLER
9946        ROSS
9947      RACHEL
Length: 9948, dtype: object

In [104]:
frnd_cv_nb_test = pd.DataFrame({'label': y_true.values, 'predict_label': y_pred.values})
frnd_cv_nb_test

Unnamed: 0,label,predict_label
0,ROSS,RACHEL
1,CHANDLER,CHANDLER
2,JOEY,ROSS
3,CHANDLER,JOEY
4,PHOEBE,CHANDLER
...,...,...
9943,JOEY,JOEY
9944,MONICA,RACHEL
9945,Others,CHANDLER
9946,JOEY,ROSS


In [105]:
frnd_tfidf_svc_test = pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/Baseline Models/Baseline_Results/FRNDS_SVM_TFIDF.csv')
frnd_tfidf_svc_test

Unnamed: 0.1,Unnamed: 0,dialogue,label,predict_label
0,0,none less,PHOEBE,RACHEL
1,1,hey guy know new year's gee wrong new year's,RACHEL,CHANDLER
2,2,that’s great hey excellent,Others,Others
3,3,happen,MONICA,CHANDLER
4,4,we’re gonna paint sword replace baguette,PHOEBE,MONICA
...,...,...,...,...
3995,3995,uh rach come dental floss hair,MONICA,ROSS
3996,3996,it’s beautiful it’s like first bathroom floor ...,MONICA,JOEY
3997,3997,we’re,ROSS,MONICA
3998,3998,um refer bobo sperm guy,Others,MONICA


In [106]:
precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, average='weighted')

(0.24666673602048902, 0.24525, 0.24073038407922745, None)

In [107]:
def print_none_prf1(trad_test):
  res = precision_recall_fscore_support(trad_test.label, trad_test.predict_label, labels=list(names_rev.keys()), average=None)
  class_count = len(res[0])
  print('class  precision  recall  f1_score')
  for i in range(class_count):
    print(f'{names[i]}  {res[0][i]}  {res[1][i]}  {res[2][i]}')

In [108]:
print_none_prf1(frnd_tfidf_svc_test)

class  precision  recall  f1_score
MONICA  0.22617124394184168  0.25594149908592323  0.24013722126929676
JOEY  0.24848484848484848  0.15073529411764705  0.18764302059496568
CHANDLER  0.23510466988727857  0.2584070796460177  0.24620573355817876
PHOEBE  0.2826855123674912  0.16427104722792607  0.20779220779220778
RACHEL  0.271513353115727  0.305  0.2872841444270016
ROSS  0.2520045819014891  0.35313001605136435  0.2941176470588235
Others  0.21666666666666667  0.20504731861198738  0.2106969205834684


In [109]:
prf1_df = pd.DataFrame(
    {
     'model': [models[0] for _ in range(6)] + [models[1] for _ in range(6)] + [models[2] for _ in range(6)] + [models[3] for _ in range(6)] + [models[0] for _ in range(7)] + [models[1] for _ in range(7)] + [models[2] for _ in range(7)] + [models[3] for _ in range(7)],
     'data': ['BBT'] + bbt_names_keys[:-1] + ['BBT'] + bbt_names_keys[:-1] + ['BBT'] + bbt_names_keys[:-1] + ['BBT'] + bbt_names_keys[:-1] + ['Friends'] + frnd_names_keys[:-1] + ['Friends'] + frnd_names_keys[:-1] + ['Friends'] + frnd_names_keys[:-1] + ['Friends'] + frnd_names_keys[:-1],
     'precision': [precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, average='weighted')[0]] + list(precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, labels=bbt_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, labels=bbt_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, labels=bbt_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, labels=bbt_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, average='weighted')[0]] + list(precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, labels=frnd_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, labels=frnd_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, labels=frnd_names_keys, average=None)[0][:-1]) + [precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, average='weighted')[0]] + list(precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, labels=frnd_names_keys, average=None)[0][:-1]),
     'recall':    [precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, average='weighted')[1]] + list(precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, labels=bbt_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, labels=bbt_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, labels=bbt_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, labels=bbt_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, average='weighted')[1]] + list(precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, labels=frnd_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, labels=frnd_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, labels=frnd_names_keys, average=None)[1][:-1]) + [precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, average='weighted')[1]] + list(precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, labels=frnd_names_keys, average=None)[1][:-1]),
     'f1':        [precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, average='weighted')[2]] + list(precision_recall_fscore_support(bbt_bert_test.labels, bbt_bert_test.predicted_label, labels=bbt_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(bbt_tfidf_lr_test.label, bbt_tfidf_lr_test.predict_label, labels=bbt_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(bbt_tfidf_nb_test.label, bbt_tfidf_nb_test.predict_label, labels=bbt_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(bbt_tfidf_svm_test.label, bbt_tfidf_svm_test.predict_label, labels=bbt_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, average='weighted')[2]] + list(precision_recall_fscore_support(frnd_bert_test.labels, frnd_bert_test.predicted_label, labels=frnd_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(frnd_cv_lr_test.label, frnd_cv_lr_test.predict_label, labels=frnd_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(frnd_cv_nb_test.label, frnd_cv_nb_test.predict_label, labels=frnd_names_keys, average=None)[2][:-1]) + [precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, average='weighted')[2]] + list(precision_recall_fscore_support(frnd_tfidf_svc_test.label, frnd_tfidf_svc_test.predict_label, labels=frnd_names_keys, average=None)[2][:-1])
    }
)
prf1_df

Unnamed: 0,model,data,precision,recall,f1
0,DistilBERT,BBT,0.374783,0.385232,0.351152
1,DistilBERT,SHELDON,0.47925,0.740773,0.581981
2,DistilBERT,LEONARD,0.308039,0.429319,0.358705
3,DistilBERT,PENNY,0.358491,0.358974,0.358732
4,DistilBERT,HOWARD,0.306931,0.21831,0.255144
5,DistilBERT,RAJ,0.304569,0.131868,0.184049
6,Logistic Regression,BBT,0.357502,0.373333,0.336702
7,Logistic Regression,SHELDON,0.438036,0.720051,0.544706
8,Logistic Regression,LEONARD,0.302077,0.412577,0.348784
9,Logistic Regression,PENNY,0.379108,0.334023,0.35514


In [110]:
prf1_df.set_index(['model', 'data'])

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1
model,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DistilBERT,BBT,0.374783,0.385232,0.351152
DistilBERT,SHELDON,0.47925,0.740773,0.581981
DistilBERT,LEONARD,0.308039,0.429319,0.358705
DistilBERT,PENNY,0.358491,0.358974,0.358732
DistilBERT,HOWARD,0.306931,0.21831,0.255144
DistilBERT,RAJ,0.304569,0.131868,0.184049
Logistic Regression,BBT,0.357502,0.373333,0.336702
Logistic Regression,SHELDON,0.438036,0.720051,0.544706
Logistic Regression,LEONARD,0.302077,0.412577,0.348784
Logistic Regression,PENNY,0.379108,0.334023,0.35514


In [111]:
prf1_df.set_index(['model', 'data']).to_csv('precision_recall_f1_scores.csv')

In [112]:
bbt_bert = pd.concat(
    [
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Big Bang Theory/train_BBT_Hidden.csv'), 
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Big Bang Theory/val_BBT_Hidden.csv'),
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Big Bang Theory/test_BBT_Hidden.csv')
    ]
        )
bbt_bert

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
0,0,How’s it going?,HOWARD,PENNY
1,1,"Yum. Well, enjoy your big evening.",PENNY,LEONARD
2,2,"You’re sweet. Good night, Dr. Koothrappali.",OTHERS,SHELDON
3,3,"But once you open the box, you’ve voided the w...",SHELDON,OTHERS
4,4,"All right, so it sounds like we need a way to ...",LEONARD,LEONARD
...,...,...,...,...
4613,4613,What?,LEONARD,LEONARD
4614,4614,Okay. How about that one.,RAJ,LEONARD
4615,4615,Hey.,OTHERS,LEONARD
4616,4616,"Okay, okay. Guys, what are we doing here?",LEONARD,PENNY


In [113]:
bbt_bert[bbt_bert.text.str.contains('bazinga', flags=re.I)]

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
3069,3069,Bazinga! None of you ever see my practical jok...,SHELDON,LEONARD
4671,4671,"Agreed. Amy, I find myself wondering if we sho...",SHELDON,LEONARD
6104,6104,Bazinga.,SHELDON,HOWARD
11678,11678,Sorry. I was waiting for the bazinga.,PENNY,SHELDON
12183,12183,"Thanks, shorty,I’ll take it from here. All rig...",SHELDON,LEONARD
13019,13019,"A, I rarely kid. And B, when I do kid, you wil...",SHELDON,RAJ
15130,15130,"That’s a bazinga, right?",HOWARD,SHELDON
17384,17384,"Obviously, waitressing at the Cheesecake Facto...",SHELDON,SHELDON
20530,20530,Not to worry. I hid it. Bazinga! You’re in my ...,SHELDON,HOWARD
20886,20886,Of course not. Even in my sleep-deprived state...,SHELDON,PENNY


In [114]:
accuracy_score(bbt_bert[bbt_bert.text.str.contains('bazinga', flags=re.I)].labels, bbt_bert[bbt_bert.text.str.contains('bazinga', flags=re.I)].predicted_label)

0.3888888888888889

In [115]:
bbt_bert[bbt_bert.text.str.contains('Sweetie', flags=re.I, regex=False)]

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
893,893,"Sweetie, every night you don’t kill him in his...",PENNY,LEONARD
1123,1123,"Sweetie, I’m so sorry. I wish there was someth...",PENNY,HOWARD
1140,1140,"Sweetie, you are weird. Everyone knows you’re ...",PENNY,SHELDON
1915,1915,"Sweetie, I can buy my own stuff. I have a good...",PENNY,PENNY
3093,3093,"Sweetie, put the pants on.",PENNY,PENNY
9601,9601,"Sweetie, you can be any pastry you want.",PENNY,LEONARD
12594,12594,"Sweetie, did you have a bad dream?",PENNY,PENNY
12798,12798,"Sweetie, can I just be the girl tonight?",PENNY,HOWARD
20977,20977,"Sweetie, we don’t have to do this now.",HOWARD,PENNY
27805,27805,"Sweetie, once you stop paying rent, none of th...",PENNY,LEONARD


In [116]:
accuracy_score(bbt_bert[bbt_bert.text.str.contains('Sweetie', flags=re.I)].labels, bbt_bert[bbt_bert.text.str.contains('Sweetie', flags=re.I)].predicted_label)

0.43333333333333335

In [117]:
bbt_bert[bbt_bert.text.str.contains('rock.*paper.*scissor.*lizard', flags=re.I, regex=True)]

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
650,650,Then I believe we’ve arrived at another quinte...,SHELDON,RAJ
14534,14534,How about you decide this with Rock-Paper-Scis...,RAJ,RAJ
29066,29066,"Ooh, I don’t think so. No, anecdotal evidence ...",SHELDON,SHELDON
2607,2607,Rock-Paper-Scissors-Lizard-Spock was created b...,SHELDON,SHELDON
4057,4057,Rock-Paper-Scissors-Lizard-Spock?,HOWARD,SHELDON


In [118]:
accuracy_score(bbt_bert[bbt_bert.text.str.contains('rock.*paper.*scissor.*lizard', flags=re.I, regex=True)].labels, bbt_bert[bbt_bert.text.str.contains('rock.*paper.*scissor.*lizard', flags=re.I, regex=True)].predicted_label)

0.6

In [119]:
frnd_bert = pd.concat(
    [
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Friends Data/train_Friends_Hidden.csv'),
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Friends Data/val_Friends_Hidden.csv'),
        pd.read_csv('/content/drive/MyDrive/CSCI 544/CSCI 544 Project/ML Models/BERT/model data/Friends Data/test_Friends_Hidden.csv'),
    ]
)
frnd_bert

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
0,0,A couple hours.,PHOEBE,PHOEBE
1,1,Hello,RACHEL,JOEY
2,2,Harder than it sounds. Isn't it?,MONICA,CHANDLER
3,3,"Please, we're trying to have a conversation.",JOEY,MONICA
4,4,"Yes! Yes! A thousand times, yes!",CHANDLER,ROSS
...,...,...,...,...
5120,5120,"Hey, cut him some slack. It was Chandler's idea.",JOEY,JOEY
5121,5121,Ooh! That’s good! Wow! But now if you were pre...,PHOEBE,PHOEBE
5122,5122,"Yeah, you have to go fight for her!",MONICA,JOEY
5123,5123,Oh I a lot of stuff!,ROSS,JOEY


In [120]:
frnd_bert[frnd_bert.text.str.contains(r'could.*be.*more', flags=re.I, regex=True)]

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
5168,5168,"Oh hey, Ross. Umm, see, I was thinking maybe y...",Others,PHOEBE
5458,5458,Couldn't be more out.,CHANDLER,CHANDLER
8939,8939,Oh I was just doing Chandler's side of the con...,MONICA,JOEY
11281,11281,Could I be more sorry.,CHANDLER,RACHEL
14447,14447,Could we be more white trash?,CHANDLER,ROSS
17079,17079,"You’re right. You’re right, he’s just embracin...",PHOEBE,PHOEBE
22801,22801,"Uh, well yeah-yeah, I've got all of that going...",JOEY,ROSS
25402,25402,Look at me! I'm Chandler! Could I be wearing a...,JOEY,JOEY
28946,28946,"Ah no. I don’t, but it could not have been mor...",ROSS,PHOEBE
29432,29432,Could there be more Kims?,CHANDLER,CHANDLER


In [121]:
accuracy_score(frnd_bert[frnd_bert.text.str.contains(r'could.*be.*more', flags=re.I, regex=True)].labels, frnd_bert[frnd_bert.text.str.contains(r'could.*be.*more', flags=re.I, regex=True)].predicted_label)

0.35714285714285715

In [122]:
frnd_bert[frnd_bert.text.str.contains('how you doin', flags=re.I, regex=False)]

Unnamed: 0.1,Unnamed: 0,text,labels,predicted_label
6989,6989,So how you doing today? Did you sleep okay? Ta...,MONICA,RACHEL
10931,10931,"Hey, how you doin’?",JOEY,JOEY
18938,18938,"Hey, how you doin’?",JOEY,MONICA
22754,22754,"Oh-oh-oh-oh, how I do it is, I look a woman up...",JOEY,CHANDLER
24887,24887,"Cassie, how you-how you doin’ on that…hot dog.",ROSS,JOEY
26154,26154,"Yeah! I'm fine! Thanks! Hey Rach, how you doin'?",JOEY,RACHEL
29072,29072,Hey Phoebe... how you doin'? You feelin' better?,MONICA,JOEY
35433,35433,"Hey, Rach, how you doing with The Shining?",JOEY,Others
2938,2938,"Oh-oh-oh-oh, how I do it is, I look a woman up...",JOEY,JOEY
9243,9243,"So uh, how you doin’?",JOEY,JOEY


In [123]:
accuracy_score(frnd_bert[frnd_bert.text.str.contains('how you doin', flags=re.I, regex=False)].labels, frnd_bert[frnd_bert.text.str.contains('how you doin', flags=re.I, regex=False)].predicted_label)

0.4166666666666667