In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


def get_lines_by_character(character_name, data):
    """Return the ``'line'`` entries of ``data`` spoken by one character.

    Parameters
    ----------
    character_name
        Value compared (with ``==``) against the ``'name'`` column.
    data
        DataFrame providing ``'name'`` and ``'line'`` columns.

    Returns
    -------
    pandas.Series
        The ``'line'`` values of the matching rows, keeping their original
        index labels. Empty when no row matches.
    """
    # Boolean mask over the rows whose speaker equals the requested name.
    is_speaker = data['name'] == character_name
    return data.loc[is_speaker, 'line']

# Load the dialogue dataset. The rest of the notebook relies on two columns:
# 'name' (the speaker, used as the label) and 'line' (the spoken text).
data = pd.read_csv('https://raw.githubusercontent.com/nickkatsy/python_ml_ect_/master/arbiter.csv')

# Normalise the text: lower-case it, then strip every character that is
# neither a word character nor whitespace (i.e. punctuation).
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave all punctuation in place. The raw string keeps `\w` / `\s` from being
# parsed as (invalid) Python string escapes.
data['line'] = data['line'].str.lower()
data['line'] = data['line'].str.replace(r'[^\w\s]', '', regex=True)

# NOTE(review): the saved output of the per-character listing shows identical
# lines at index i and i + 104 (e.g. 0/104, 1/105), so the CSV appears to
# contain every row twice. If confirmed, copies of the same line can land on
# both sides of the split below and inflate the reported accuracy; consider
# `data.drop_duplicates()` before splitting. TODO confirm against the CSV.
# NOTE(review): the same output suggests the 'name' values carry surrounding
# whitespace; verify and consider `.str.strip()` if exact-name lookups matter.
X = data['line']
y = data['name']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training text only, then reuse that fitted
# vectorizer for the test text so no test-set statistics leak into training.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Candidate classifiers compared on the TF-IDF features.
# Each estimator is built exactly once (the earlier duplicate construction of
# GBC was dead code). The stochastic estimators are seeded with the same value
# as the train/test split so repeated runs give the same results.
GBC = GradientBoostingClassifier(random_state=42)
MNM = MultinomialNB()
# probability=True enables predict_proba; random_state seeds the internal
# shuffling used for those probability estimates.
svc = SVC(probability=True, random_state=42)

def evaluate(X_train_tfidf, X_test_tfidf, y_train, y_test, model):
    """Fit ``model`` on the training split, score it on the test split, print a summary.

    Parameters
    ----------
    X_train_tfidf, X_test_tfidf
        Feature matrices handed to ``model.fit`` and ``model.predict``.
    y_train, y_test
        Labels belonging to the training and test features respectively.
    model
        Estimator exposing ``fit`` and ``predict``; the object returned by
        ``fit`` is the one used for prediction and for the printed name.

    Returns
    -------
    The predictions ``predict`` produced for ``X_test_tfidf``.
    """
    fitted = model.fit(X_train_tfidf, y_train)
    predictions = fitted.predict(X_test_tfidf)

    # Compute both metrics before printing anything.
    report = classification_report(y_test, predictions)
    accuracy_pct = accuracy_score(y_test, predictions) * 100

    label = fitted.__class__.__name__
    print(f'{label}, --Classification Report--\n{report}')
    print(f'{label}, --Accuracy-- {accuracy_pct:.2f}%')
    return predictions

# Evaluate every candidate on the same split, in the order GBC -> MNM -> svc,
# keeping each model's test-set predictions under its own name.
GBC_pred, MNM_pred, svc_pred = (
    evaluate(X_train_tfidf, X_test_tfidf, y_train, y_test, candidate)
    for candidate in (GBC, MNM, svc)
)


GradientBoostingClassifier, --Classification Report--
                        precision    recall  f1-score   support

              Arbiter        0.83      1.00      0.91        10
              Cortana        1.00      1.00      1.00         1
           Counselman        1.00      1.00      1.00         1
            Gravemind        1.00      1.00      1.00         1
        Miranda Keyes        1.00      1.00      1.00         2
     Prophet of Mercy        1.00      0.50      0.67         4
    Prophet of Regret        1.00      0.33      0.50         3
     Prophet of Truth        0.67      1.00      0.80         4
       SpecOps Elites        0.50      1.00      0.67         2
       SpecOps Leader        1.00      0.78      0.88         9
             Tartarus        1.00      1.00      1.00         5

              accuracy                           0.86        42
             macro avg       0.91      0.87      0.86        42
          weighted avg       0.90      0.86     

In [2]:
# Distinct speaker names found in the 'name' column.
unique_characters = data['name'].unique()
for character in unique_characters:
    lines_spoken = get_lines_by_character(character, data)
    # Header first, then that character's lines on the following line(s).
    print(f"Lines spoken by {character}:", lines_spoken, sep="\n")

Lines spoken by     Cortana :
0       after seeing gravemind  what is that
104     after seeing gravemind  what is that
Name: line, dtype: object
Lines spoken by     Gravemind :
1                     i i am a monument to all your sins
4       this one is machine and nerve and has its min...
6       there is much talk and i have listened throug...
10      this ones containment and this ones great jou...
13      if you will not hear the truth then i will sh...
14                       you will search one likely spot
15      and you will search another fate may have pla...
105                   i i am a monument to all your sins
108     this one is machine and nerve and has its min...
110     there is much talk and i have listened throug...
114     this ones containment and this ones great jou...
117     if you will not hear the truth then i will sh...
118                      you will search one likely spot
119     and you will search another fate may have pla...
Name: line, dtype: objec