# `spaCy`

In this notebook, we explore using the Python library `spacy` to enhance our analysis.

We ran into implementation issues, so there is no useful modelling in this notebook. Feel free to skip forward to the next.

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer,\
                                            TfidfVectorizer


from sklearn.base import TransformerMixin

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [4]:
def save_obj(obj, filename):
    """Pickle `obj` to the file `filename + '.pkl'`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The highest available pickle protocol is used.
    """
    target_path = filename + '.pkl'
    with open(target_path, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(filename):
    """Return the object unpickled from the file `filename + '.pkl'`.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    source_path = filename + '.pkl'
    with open(source_path, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

In [5]:
# Directory holding the pickled data files. Set the PROJECT_3_DATA_DIR
# environment variable to point somewhere else; when it is unset (or empty)
# the original local path is used, so existing setups keep working.
_DEFAULT_DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\data\\"
_env_dir = os.environ.get("PROJECT_3_DATA_DIR")
# File names are appended to DIR by plain string concatenation, so an override
# must end with a path separator; os.path.join(path, "") guarantees that.
DIR = os.path.join(_env_dir, "") if _env_dir else _DEFAULT_DIR

# Base names of the pickled datasets (suffixes are appended when loading).
FILE1 = "theonion"
FILE2 = "nottheonion"
FILE3 = "onionheadlines"

# Data Import

In [6]:
# Build the base paths of the two `_df_clean` pickles, then load each frame
# (load_obj appends the ".pkl" extension itself).
theonion_path = f"{DIR}{FILE1}_df_clean"
nottheonion_path = f"{DIR}{FILE2}_df_clean"

X_theonion = load_obj(theonion_path)
X_nottheonion = load_obj(nottheonion_path)

In [7]:
# Add the target column in place: 1 for X_theonion rows, -1 for X_nottheonion rows.
for frame, label in ((X_theonion, 1), (X_nottheonion, -1)):
    frame["is_onion"] = label

# Tokenizing with `spacy`

We follow along with the tutorial found here:
https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

In [8]:
import spacy

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm, en_core_web_lg

import string
# Punctuation characters used by spacy_tokenizer when pruning tokens.
# NOTE(review): the name is misspelled ("puntuations"); it is kept as-is
# because spacy_tokenizer refers to it by this name.
puntuations = string.punctuation


In [9]:
# Load the small and the large English spaCy models from their installed packages.
# NOTE(review): only nlp_lg is referenced in the cells visible here; nlp_sm
# appears unused — confirm before removing.
nlp_sm = en_core_web_sm.load()
nlp_lg = en_core_web_lg.load()

The following code, which merges tokens that are part of the same spacy `entity`, comes from https://stackoverflow.com/questions/54640715/tokenizing-named-entities-in-spacy

In [10]:
class EntityRetokenizeComponent:
    """Callable component that merges every span in `doc.ents` into a single token."""

    def __init__(self, nlp):
        """The `nlp` argument is accepted but neither stored nor used."""

    def __call__(self, doc):
        """Merge each entity span of `doc` and return the same `doc` object.

        Each merged token gets its LEMMA attribute set to the string form
        of the span it was built from.
        """
        with doc.retokenize() as merger:
            for entity in doc.ents:
                entity_span = doc[entity.start:entity.end]
                merger.merge(entity_span, attrs={"LEMMA": str(entity_span)})
        return doc

retokenizer = EntityRetokenizeComponent(nlp_lg)

# Register the entity-merging component at the end of the large pipeline.
# The guard keeps this cell re-runnable: adding a second component under the
# same name to the same pipeline raises an error.
if 'merge_phrases' not in nlp_lg.pipe_names:
    nlp_lg.add_pipe(retokenizer, name='merge_phrases', last=True)

# doc = nlp_lg("German Chancellor Angela Merkel and US President Barack Obama "
#           "converse in the Oval Office inside the White House in Washington, D.C.")

# [tok for tok in doc]

The following code, which implements some lemmatization, comes from https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

In [14]:
# Stop-word collection consulted by spacy_tokenizer; this is the same
# STOP_WORDS object imported from spacy.lang.en.stop_words.
stop_words = STOP_WORDS

# English language object.
# NOTE(review): not referenced by any other cell visible here.
parser = English()

def spacy_tokenizer(sentence):
    """Tokenize `sentence` with `nlp_lg` and return a list of cleaned lemmas.

    Relies on the module-level names `nlp_lg`, `stop_words` and `puntuations`.

    Steps:
      1. Each token is replaced by its lemma, lower-cased and stripped of
         surrounding whitespace. Tokens whose lemma is the placeholder
         "-PRON-" use their lower-cased surface form instead.
      2. Lemmas found in `stop_words` are dropped, as are lemmas that occur
         as a substring of `puntuations` (this also drops empty strings).

    Returns:
        list[str]: the remaining lemmas, in document order.
    """
    doc = nlp_lg(sentence)

    # Lemmatize.
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in doc
    ]

    # Prune stop words and punctuation. The result must be returned: without
    # it the function yields None and cannot be used as a vectorizer tokenizer.
    return [
        lemma for lemma in lemmas
        if lemma not in stop_words and lemma not in puntuations
    ]


class predictors(TransformerMixin):
    """Stateless pipeline step that applies `clean_text` to every input text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding `clean_text(item)` for each item of `X`."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict."""
        return {}
    
def clean_text(text):
    """Return `text` with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

From now on, we can pass `tokenizer=spacy_tokenizer` to a vectorizer and add a `("cleaner", predictors())` preprocessing step to our pipeline.

In [27]:
# Peek at the first entry of the "title" column.
X_theonion["title"].head(1)

id
79mcv    Microsoft Ad Campaign Crashing Nations Televis...
Name: title, dtype: object

# Taking spacy for a spin
Now that we have a preprocessing workflow, let's see if it improves anything.

## Generating our samples and holdout

In [20]:
N = 4000    # rows drawn from each frame for the modelling sample
SEED = 42   # fixed seed so the shuffles are reproducible across runs


def _shuffle_and_split(frame, n, seed):
    """Shuffle `frame` and return (shuffled, first n rows, remaining rows)."""
    shuffled = frame.sample(frac=1, random_state=seed)
    # iloc slicing stays correct when the frame has fewer than n rows (the
    # holdout is then empty); tail(len - n) with a negative count would
    # instead return rows that are already part of the sample.
    return shuffled, shuffled.iloc[:n], shuffled.iloc[n:]


X_theonion_shuffled, theonion_sample, theonion_holdout = _shuffle_and_split(
    X_theonion, N, SEED
)
X_nottheonion_shuffled, nottheonion_sample, nottheonion_holdout = _shuffle_and_split(
    X_nottheonion, N, SEED
)

# Balanced modelling sample (n rows from each frame) and the full combined data.
X_sample = pd.concat([theonion_sample, nottheonion_sample])
X = pd.concat([X_theonion, X_nottheonion])

# TF-IDF + Logistic Regression

In [32]:
# Fix the split seed so the train/test partition is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample["title"], X_sample["is_onion"], random_state=42
)

# Only the text-cleaning step is active; the vectorizer and classifier steps
# below are left commented out.
pipe = Pipeline([
    ("cleaner", predictors()),
#     ("tfidf", TfidfVectorizer(tokenizer=spacy_tokenizer)),
#     ("logreg", LogisticRegressionCV(Cs=np.logspace(-2,2,100), max_iter=1000))
])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner',
                 <__main__.predictors object at 0x0000025600C71B88>)],
         verbose=False)

In [33]:
# Show the pipeline's output for the training titles (with only the "cleaner"
# step active, this is the list of cleaned strings).
pipe.transform(X_train)

['swedish government launches major study into sex lives of its citizens',
 'parent of middle school students amid shooting: ‘this happens in high school, not here’',
 'parents trying to gauge if son complete idiot before deciding whether to move to better school district',
 'the last of us 2 to be released in 2019',
 'pregnant woman kicked out of flight after plane deemed too heavy',
 'deal alert: your parents have promised to buy you lets go pikachu if you can make it through rosh hashanah this year without biting anyone',
 'tim duncan proudly reflects on all the degrees he accumulated during 19-year nba career',
 "burger king 'sorry' for offering burgers to women who get pregnant to world cup players",
 'female friend group fails in one duty of providing good gynecologist recommendation',
 'gold coast police hunting mullet man with face tatts over alleged sex toy theft',
 'feds say chinese ownership of grindr is a ‘national security risk’',
 'man accused of burgling home, painting d

In [None]:
# Score the pipeline on the train/test split and on both holdout sets.
# Raw title strings are passed in: the "cleaner" step calls str methods on each
# item, and spacy_tokenizer runs nlp_lg itself, so pre-parsing the titles into
# spaCy docs here would break the pipeline.
evaluation_sets = [
    (X_train, y_train),
    (X_test, y_test),
    (theonion_holdout["title"], theonion_holdout["is_onion"]),
    (nottheonion_holdout["title"], nottheonion_holdout["is_onion"]),
]

# Pipeline only exposes `score` when its final step provides one; with just the
# "cleaner" transformer there is nothing to score, so report that instead of
# raising.
if hasattr(pipe, "score"):
    for titles, labels in evaluation_sets:
        display(pipe.score(titles, labels))
else:
    print("Pipeline has no final estimator with a score method; skipping evaluation.")