# ASOIAF/GoT Reddit Posts - Pt. 3

># Feature Engineering and Baseline Model 

### Import Libraries

In [1]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import spacy
from spacy.util import minibatch, compounding
from spacy.pipeline import TextCategorizer


%matplotlib inline

### Reload Cleaned Data

In [2]:
## loading our data
data = pd.read_csv('./data/unique_data.csv')

In [3]:
## showing first 5 rows
data.head()

Unnamed: 0,post_text,subreddit,parsed_post
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf,Welcome to the Weekly Q &amp; A! Feel free to ...
1,It's happened to all of us.\n\nYou come across...,asoiaf,It's happened to all of us.\n\nYou come across...
2,Something thats always bothered me is Tywin's ...,asoiaf,Something thats always bothered me is Tywin's ...
3,Apologies if this has been posted before.\n\nI...,asoiaf,Apologies if this has been posted before.\n\nI...
4,One of the things I was sorry not to get more ...,asoiaf,One of the things I was sorry not to get more ...


In [4]:
## binarizing our subreddit category
data['subreddit'] = data['subreddit'].map({'asoiaf':0,
                               'gameofthrones':1})
## showing success
data['subreddit'].value_counts()

0    939
1    603
Name: subreddit, dtype: int64

In [5]:
## importing our custom stop words
stop_list = pd.read_csv('./data/stop_list.csv')
stop_list.head()

Unnamed: 0,term
0,welcome
1,q
2,welcome to
3,the weekly
4,weekly q


In [6]:
## creating our stop word/phrase list
custom_stop = []
for word in range(len(stop_list['term'])):
    custom_stop.append(stop_list['term'][word])

In [7]:
## printing our first 5
custom_stop[:5]

['welcome', 'q', 'welcome to', 'the weekly', 'weekly q']

### Baseline Model

In [8]:
## setting our X and target(y) variables
X = data['parsed_post']
y = data['subreddit']

In [9]:
## showing our baseline percentage
y.value_counts(normalize=True)

0    0.608949
1    0.391051
Name: subreddit, dtype: float64

>## Building a Model with spaCy

In [10]:
## loading spaCy
nlp = spacy.load('en_core_web_sm')

In [11]:
## using a lambda function to turn all parsed_posts and subreddits into a Series containing tuples
data['tuples'] = data.apply(
    lambda row: (row['parsed_post'],row['subreddit']), axis=1)

## turning the tuples into a list we'll use for training
train = data['tuples'].tolist()

## showing our first tuple
train[0]

("Welcome to the Weekly Q &amp; A! Feel free to ask any questions you may have about the world of ASOIAF. No need to be bashful. Book and show questions are welcome; please say in your question if you would prefer to focus on the BOOKS, the SHOW, or BOTH.  And if you think you've got an answer to someone's question, feel free to lend them a hand!",
 0)

In [145]:
## SOURCE: functions mainly manufactured from spacy documentation

## creating a function that will load are training size and explicitly call our limit
def load_data(limit=0, train_size=0.8):
    
    train_data = train  ## setting our training data
    np.random.shuffle(train_data)  ## randomly shuffling our training data
    
    ## setting the limit based on function being passed
    ## with current code it will be set to n_texts value below
    train_data = train_data[-limit:]
    
    ## unzipping our tuple into two separate lists (texts and subreddit value/type)
    texts, bool_val = zip(*train_data)
    
    ## setting our subreddit values to True or False
    ## this will be used in the scoring portion of the 'evaluate' function
    cats = [{'Game of Thrones': bool(y)} for y in bool_val]
    split = int(len(train_data) * train_size)  ## splits our training data based on argument passed
    
    ## returns our text and category (True/False vals) test/train split
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])



## creating a function that will score our data after instantiating our optimizer
def evaluate(tokenizer, textcat, texts, cats):
    
    ## setting our docs for 
    docs = (tokenizer(text) for text in texts)
    
    ## setting empty values for our confusion matrix and scores
    tp = 1e-8 # True positives , 1e-8 is there to ensure we never divide by zero
    fp = 0 # False positives
    fn = 0  # False negatives
    tn = 0  # True negatives
    
    ## using our textcat.pipe on our tokenized docs to pull a number and it's associated value
    for i, doc in enumerate(textcat.pipe(docs)):
        
        gold = cats[i] ## setting our 'gold_parse' to the indexed dict value of {'GoT': T or F} 
        
        ## for_loop to extract boolean values of 'gold_parse' and doc 'score' from textcat.pipe
        for bool_val, score in doc.cats.items():
            
            ## a series of if_statements that will increase count on our confusion matrix
            ## when the each parameter for 'score' and 'bool_val' are met
            if bool_val not in gold:
                continue
            if score >= 0.5 and gold[bool_val] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[bool_val] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[bool_val] < 0.5:
                tn += 1
            elif score < 0.5 and gold[bool_val] >= 0.5:
                fn += 1
    
    ## calculate and return the following scores
    accuracy = (tp + tn) / (tp + tn + fp + fn)            
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    
    return {'textcat_p': precision, 
            'textcat_r': recall, 
            'textcat_f': f_score, 
            'textcat_a': accuracy}

## number of texts to train
n_texts=len(data)

## number of training iterations
n_iter=20

In [146]:
## this is the protocol for calling spaCy's TextCategorizer'textcat'
## Code adapted from spaCy's documentation to create or add the necessary pipeline
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

## adding label to text classifier, used for Boolean comparison
textcat.add_label('Game of Thrones')

## this creates our train/test split from the 'load_data' function
(train_texts, train_cats), (test_texts, test_cats) = load_data(limit=n_texts)  

## prints details of line above, samples size and split
print(f'Using {n_texts} examples ({len(train_texts)} training, {len(test_texts)} evaluation)')

## takes 'train_text' and 'bool_val' from load_data function
## this will get evaluated in the next block of code
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Using 1542 examples (1233 training, 309 evaluation)


In [147]:
## Some more code adapted from spaCy
## this block ensures that we are only calling and training one pipeline 'textcat'
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    
    ## spaCy trains a model with begin_training() using thinc.neural
    optimizer = nlp.begin_training()
    print("Training the model...")
    
    ## redo as an f-string // Header column
    print(f'{"LOSS":^8}{"P":^8}{"R":^8}{"F":^8}{"A":^8}')
    
    for i in range(n_iter):  ## computing our set number of iterations
        losses = {}  ## creating a dictionary to store our losses
        
        ## using spaCy's minibatching to create a smaller batches of our data 
        ## for gradient descent optimization
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        
        ## looping through the smaller batches
        for batch in batches:
            ## unzipping our batches and assigning the post text to 'texts' 
            ## and the 'cats' and 'bool_val' to 'annotations'
            texts, annotations = zip(*batch)
            
            ## updating our model by using our 'optimizer' for gradient descent
            ## setting a dropout rate of .2 to prevent overfitting our model
            ## reevaluating our losses
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        
          ## encoding the parameters of thinc.neural on 'textcat'
        with textcat.model.use_params(optimizer.averages):
            
            # evaluate on the dev data split off in load_data()
            ## evaluate is the function from above
            scores = evaluate(nlp.tokenizer, textcat, test_texts, test_cats)
          
        ## printing results (rounded, with centering and 8 spaces)
        print(f'{round(losses["textcat"],2):^8}'  
              f'{round(scores["textcat_p"],3):^8}' 
              f'{round(scores["textcat_r"],3):^8}'
              f'{round(scores["textcat_f"],3):^8}'
              f'{round(scores["textcat_a"],3):^8}'
             )

Training the model...
  LOSS     P       R       F       A    
 19.42   0.941   0.926   0.933   0.948  
 10.22   0.932   0.909   0.921   0.939  
  8.43   0.922   0.884   0.903   0.926  
  6.36   0.939   0.884   0.911   0.932  
  4.55   0.946   0.868   0.905   0.929  
  4.38    0.93   0.884   0.907   0.929  
  4.66   0.922   0.876   0.898   0.922  
  3.9    0.915   0.893   0.904   0.926  
  4.22   0.902   0.917    0.91   0.929  
  4.17   0.901   0.901   0.901   0.922  
  3.5     0.91   0.917   0.914   0.932  
  3.23   0.899   0.884   0.892   0.916  
  3.19   0.893   0.893   0.893   0.916  
  4.62   0.897   0.868   0.882   0.909  
  3.14    0.9    0.893   0.896   0.919  
  3.61   0.892   0.884   0.888   0.913  
  3.59   0.876   0.876   0.876   0.903  
  3.56   0.877   0.884   0.881   0.906  
  3.13   0.873   0.851   0.862   0.893  
  3.33   0.873   0.851   0.862   0.893  



>## Modeling via Sci-kit Learn

In [117]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn

In [125]:
## reminder of our baseline
y.value_counts(normalize=True)

0    0.608949
1    0.391051
Name: subreddit, dtype: float64

In [126]:
## creating our train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=42, 
                                                    stratify=y)

In [134]:
## creating a list of our two chosen vectorizers to iterate through in our grid search
vectorizer = [CountVectorizer(), TfidfVectorizer()]

## creating variables to accept tuning parameters
max_feat = [300, 500]  
ngram_range = [(1, 3), (1, 2)] 
stop_words = [None, 'english', custom_stop] 
max_df = [0.9, 0.8] 

## creating any empty results list to capture our cv_results_ at the end of each iteration
results = []

## looping through both vectorizers
for vect in vectorizer:
    
    ## creating a pipeline for our vectorizer and classifier models
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', LogisticRegression())
    ])
    
    ## generating our parameters for vectorizers
    vect_params = {'vect__max_features': max_feat,
                    'vect__stop_words': stop_words,
                    'vect__ngram_range': ngram_range,
                    'vect__max_df': max_df
                  }
    parameters = [
        {
            ## Logistic Regression
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (LogisticRegression(solver='liblinear'), ), ## setting our first classifier model
            'clf__penalty': ('l1', 'l2'), #2
            'clf__C': (.5, 1.0), #4 288*2
        }, 
        {
            ## Multinomial Bayes
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (MultinomialNB(), ),  ## setting our second classifier model
            'clf__alpha': (.5, 1.0)  #2 72*2
        },
        {
            ## SVC
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (SVC(gamma='scale', ), ),
            'clf__kernel': ('rbf', 'poly') 
        },
        {
            ## RandomForestClassifier
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (RandomForestClassifier(n_estimators=50, min_samples_split=5), ),
        },
        {
            ## putting together an ensemble model
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                 ('rf', RandomForestClassifier()), 
                                                 ('mnb', MultinomialNB()), 
                                                 ('svc', SVC())],                                           
                                            voting='hard'), )
        }
        
    ]
    
    ## performing our grid search with the inherited pipeline and parameters
    grid_search = GridSearchCV(pipeline, 
                               parameters,
                               cv=3,
                               n_jobs=-1,
                               verbose=1,
                               return_train_score=True
                              )
    
    ## running an if statement to print the type of vectorizer used
    if vect == vectorizer[0]:
        vect_string = "CountVectorizer"
    
    else:
        vect_string = "Tf-IDF Vectorizer"
    
    ## fitting our model and printing our best scores and parameters
    grid_search.fit(X_train, y_train)
    print(f'''Best score for {vect_string} is: 
    {round(grid_search.best_score_, 4)}
    ''')
    print(grid_search.best_params_)
    print("")
    
    ## appending our cv_results_ to the end of results
    results.append(grid_search.cv_results_)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.7min finished


Best score for CountVectorizer is: 
    0.7822
    
{'clf': MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True), 'clf__alpha': 0.5, 'vect__max_df': 0.9, 'vect__max_features': 500, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 13.4min finished


Best score for Tf-IDF Vectorizer is: 
    0.7674
    
{'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'vect__max_df': 0.9, 'vect__max_features': 500, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}



In [135]:
## turning our results into a CV
results_df = pd.DataFrame(grid_search.cv_results_)

In [142]:
## printing out our results
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_clf__penalty,param_vect__max_df,param_vect__max_features,param_vect__ngram_range,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,2.373203,0.078691,0.369766,0.041348,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 3)",...,0.669444,0.704735,0.689527,0.014818,215,0.719054,0.714882,0.729167,0.721034,0.005997
1,1.164617,0.076181,0.227521,0.027277,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 3)",...,0.708333,0.724234,0.719184,0.007685,158,0.734353,0.739917,0.761111,0.745127,0.011528
2,2.729589,0.108484,0.385576,0.066235,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 3)",...,0.669444,0.704735,0.690454,0.015179,214,0.720445,0.714882,0.727778,0.721035,0.005281
3,1.024234,0.063128,0.223400,0.020190,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 2)",...,0.669444,0.701950,0.688601,0.013897,217,0.719054,0.712100,0.729167,0.720107,0.007007
4,0.740609,0.031643,0.164100,0.033541,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 2)",...,0.708333,0.724234,0.719184,0.007685,158,0.734353,0.738526,0.762500,0.745126,0.012403
5,1.354643,0.232952,0.278203,0.049495,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,300,"(1, 2)",...,0.669444,0.704735,0.689527,0.014818,215,0.719054,0.713491,0.730556,0.721034,0.007106
6,3.469879,1.016552,1.125345,0.443995,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,500,"(1, 3)",...,0.666667,0.701950,0.683040,0.014508,231,0.712100,0.699583,0.719444,0.710376,0.008200
7,3.909729,0.736120,0.681134,0.532587,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,500,"(1, 3)",...,0.702778,0.718663,0.708990,0.006924,182,0.728790,0.720445,0.751389,0.733541,0.013072
8,3.704087,0.991593,0.755768,0.225352,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,500,"(1, 3)",...,0.666667,0.699164,0.683040,0.013266,231,0.710709,0.698192,0.719444,0.709449,0.008722
9,2.030254,0.490545,0.438581,0.083735,"LogisticRegression(C=1.0, class_weight=None, d...",0.5,l1,0.9,500,"(1, 2)",...,0.666667,0.699164,0.681186,0.013482,234,0.712100,0.699583,0.719444,0.710376,0.008200
