In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pandas as pd
import numpy as np
import re
import unicodedata
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Exercises

Do your work for this exercise in a file named model.

Take the work we did in the lessons further:

   - What other types of models (i.e. different classifcation algorithms) could you use?
   - How do the models compare when trained on term frequency data alone, instead of TF-IDF values?


In [2]:
df = pd.read_csv("tng.csv")

top_15_characters = df.character.value_counts().index[0:15]

top_15 = df[df.character.isin(top_15_characters)]
top_15

Unnamed: 0,episode_name,line,character
0,Encounter at Farpoint,Difficult? Simply solve the mystery of Farpoi...,DATA
1,Encounter at Farpoint,As simple as that.,PICARD
2,Encounter at Farpoint,Farpoint Station. Even the name sounds myster...,TROI
3,Encounter at Farpoint,"It's hardly simple, Data, to negotiate a frie...",PICARD
4,Encounter at Farpoint,Inquiry. The word snoop?,DATA
...,...,...,...
51983,All Good Things,Of course. Have a seat.,RIKER
51984,All Good Things,"Would you care to deal, sir?",DATA
51985,All Good Things,"Oh, er, thank you, Mister Data. Actually, I u...",PICARD
51986,All Good Things,You were always welcome.,TROI


In [3]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

def clean(text):
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return " ".join([wnl.lemmatize(word) for word in words if word not in stopwords])

In [4]:
df.line = df.line.apply(clean)
df.head()

Unnamed: 0,episode_name,line,character
0,Encounter at Farpoint,difficult simply solve mystery farpoint station,DATA
1,Encounter at Farpoint,simple,PICARD
2,Encounter at Farpoint,farpoint station even name sound mysterious,TROI
3,Encounter at Farpoint,hardly simple data negotiate friendly agreemen...,PICARD
4,Encounter at Farpoint,inquiry word snoop,DATA


In [5]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None):
    """
    3 way split for train, validate, and test datasets
    To stratify, send in a column name
    """
    
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
    
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])
    
    return train, validate, test

In [6]:
train, validate, test = split(top_15, 'character')

In [7]:
# Setup our X variables
X_train = train.line
X_validate = validate.line
X_test = test.line

In [8]:
# Setup our y variables
y_train = train.character
y_validate = validate.character
y_test = test.character

In [9]:
# Create the tfidf vectorizer object
tfidf = TfidfVectorizer()

# Fit on the training data
tfidf.fit(X_train)

# Use the object
X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
# Now you have a vactorized dataset and its fit on the clasification model.
# Now you have a vactorized dataset and its fit on the clasification model.
lm = KNeighborsClassifier().fit(X_train_vectorized, y_train)

In [14]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [15]:
train['predicted'] = lm.predict(X_train_vectorized)
validate["predicted"] = lm.predict(X_validate_vectorized)
test['predicted'] = lm.predict(X_test_vectorized)

In [16]:
# Train Accuracy
(train.actual == train.predicted).mean()

0.34224770642201835

In [17]:
(validate.actual == validate.predicted).mean()

0.2837114726027397

In [18]:
# Now that we have a trained model, lets use our model to predict the charecter of any given line.
lines = pd.Series([
    "we have a responsibility", 
    "set phasers to stun", 
    "the warp drive is about to go critical", 
    "What does it mean to be human? I cannot calculate feelings", 
    "Romulan bird of prey decloaking off the port bow"
])

# apply clean 
lines = lines.apply(clean)

# We have to vectorize these inputs if we'regoing to be able to use the classification model.
lines = tfidf.transform(lines)
lines

<5x13346 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [19]:
lm.predict(lines)

array(['PICARD', 'PICARD', 'PICARD', 'PICARD', 'PICARD'], dtype=object)

In [28]:
from sklearn.metrics import classification_report
print(classification_report(train.actual, train.predicted))

              precision    recall  f1-score   support

    COMPUTER       0.64      0.23      0.34       272
     CRUSHER       0.34      0.13      0.19      1521
        DATA       0.46      0.14      0.22      3008
      GUINAN       0.38      0.10      0.16       234
     LAFORGE       0.49      0.14      0.22      2048
     LWAXANA       0.53      0.04      0.08       201
      PICARD       0.31      0.93      0.47      5984
     PULASKI       0.67      0.02      0.05       256
           Q       0.43      0.01      0.02       274
       RIKER       0.52      0.13      0.21      3421
          RO       0.50      0.08      0.13       194
       TASHA       0.61      0.04      0.08       247
        TROI       0.51      0.08      0.14      1612
      WESLEY       0.34      0.11      0.16       693
        WORF       0.62      0.12      0.20      1835

    accuracy                           0.34     21800
   macro avg       0.49      0.15      0.18     21800
weighted avg       0.44   

In [29]:
characters = train.actual.value_counts().index.tolist()

In [30]:
for character in characters:
    character_lines = train[train.actual == character]
    accuracy = (character_lines.actual == character_lines.predicted).mean()
    print(f'Predicting {character} has {round(accuracy, 2)}')

Predicting PICARD has 0.93
Predicting RIKER has 0.13
Predicting DATA has 0.14
Predicting LAFORGE has 0.14
Predicting WORF has 0.12
Predicting TROI has 0.08
Predicting CRUSHER has 0.13
Predicting WESLEY has 0.11
Predicting Q has 0.01
Predicting COMPUTER has 0.23
Predicting PULASKI has 0.02
Predicting TASHA has 0.04
Predicting GUINAN has 0.1
Predicting LWAXANA has 0.04
Predicting RO has 0.08


- KNearest Neibors is more accurate twords PICCARD than the LogisticRegression Model. This model overall is less accurate but is also less over fit.


In [95]:
df = pd.read_csv("tng.csv")

top_15_characters = df.character.value_counts().index[0:15]

top_15 = df[df.character.isin(top_15_characters)]
top_15

Unnamed: 0,episode_name,line,character
0,Encounter at Farpoint,Difficult? Simply solve the mystery of Farpoi...,DATA
1,Encounter at Farpoint,As simple as that.,PICARD
2,Encounter at Farpoint,Farpoint Station. Even the name sounds myster...,TROI
3,Encounter at Farpoint,"It's hardly simple, Data, to negotiate a frie...",PICARD
4,Encounter at Farpoint,Inquiry. The word snoop?,DATA
...,...,...,...
51983,All Good Things,Of course. Have a seat.,RIKER
51984,All Good Things,"Would you care to deal, sir?",DATA
51985,All Good Things,"Oh, er, thank you, Mister Data. Actually, I u...",PICARD
51986,All Good Things,You were always welcome.,TROI


In [96]:
df.line = df.line.apply(clean)
df.head()

Unnamed: 0,episode_name,line,character
0,Encounter at Farpoint,difficult simply solve mystery farpoint station,DATA
1,Encounter at Farpoint,simple,PICARD
2,Encounter at Farpoint,farpoint station even name sound mysterious,TROI
3,Encounter at Farpoint,hardly simple data negotiate friendly agreemen...,PICARD
4,Encounter at Farpoint,inquiry word snoop,DATA


In [97]:
train, validate, test = split(top_15, 'character')

In [98]:
def set_xy(df):
    
    # Setup our X variables
    X_train = train.line
    X_validate = validate.line
    X_test = test.line
    
    # Setup our y variables
    y_train = train.character
    y_validate = validate.character
    y_test = test.character
    
    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [99]:
X_train, X_validate, X_test, y_train, y_validate, y_test = set_xy(df)

In [100]:
def Vectorize(X_train, X_validate, X_test):
    
    # Create the tfidf vectorizer object
    tfidf = TfidfVectorizer()

    # Fit on the training data
    tfidf.fit(X_train)

    # Use the object
    X_train_vectorized = tfidf.transform(X_train)
    X_validate_vectorized = tfidf.transform(X_validate)
    X_test_vectorized = tfidf.transform(X_test)
    
    return X_train_vectorized, X_validate_vectorized, X_test_vectorized

In [101]:
X_train_vectorized, X_validate_vectorized, X_test_vectorized = Vectorize(X_train, X_validate, X_test)

In [118]:
from sklearn.tree import DecisionTreeClassifier

In [119]:
def Model(X_train_vectorized, X_validate_vectorized, X_test_vectorized, y_train, y_validate, y_test):
    
    # Now you have a vactorized dataset and its fit on the clasification model.

    lm = DecisionTreeClassifier().fit(X_train_vectorized, y_train)

    train = pd.DataFrame(dict(actual=y_train))
    validate = pd.DataFrame(dict(actual=y_validate))
    test = pd.DataFrame(dict(actual=y_test))
    
    train['predicted'] = lm.predict(X_train_vectorized)
    validate["predicted"] = lm.predict(X_validate_vectorized)
    test['predicted'] = lm.predict(X_test_vectorized)
    
    # Train Accuracy
    aT = (train.actual == train.predicted).mean()
    
    aV = (validate.actual == validate.predicted).mean()
    
    return aT, aV

In [120]:
aT, aV = Model(X_train_vectorized, X_validate_vectorized, X_test_vectorized, y_train, y_validate, y_test)

In [121]:
aT, aV

(0.9518348623853211, 0.2736515410958904)