## Classifier comparison

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import re

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from spacy.en import English
import spacy

In [3]:
parser = English()
nlp = spacy.load('en')

In [4]:
data_text = pd.read_csv('data_text.csv')

In [5]:
# Build a list of stopwords, remove 'the', because I'm interested in specificity. 

STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
#STOPLIST.remove('the')
# Remove symbols that are not alpha-numeric, replace them with a space
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

In [6]:
# transformer that cleans text with spaCy
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [7]:
# function that cleans text:
def cleanText(text):
    # get rid of newlines, and non alpha-numeric characters
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = re.sub(r'([^\s\w]|_)+', ' ', text)
    text = re.sub(' +',' ',text)   



    #parse it
    parsed_text = nlp(text)

    # get rid of proper nouns
    proper = []
    token_isoov = [token.is_oov for token in parsed_text]
    token_text = [token.orth_ for token in parsed_text]
    token_pos = [token.pos_ for token in parsed_text]
    for pos, word, oov in zip(token_pos,token_text,token_isoov):
        if pos == 'PROPN' and oov==True:
            proper.append(str(word))
    for pro in proper:
        text = text.replace(pro,' PROPN ')


    #recode entities
    ents = {}
    for num,entity in enumerate(parsed_text.ents):
        ents[entity.label_] = entity.orth_
    for code, entity in ents.items():
        text = text.replace(entity,str(' '+code+' '))
        
    
    # lowercase
    text = text.lower()
    text = re.sub(' +',' ',text)  
    
    return text

In [8]:
# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

In [9]:
vectorizer = TfidfVectorizer(tokenizer=tokenizeText, ngram_range=(1,2))

In [10]:
train, test, labels_train, labels_test = train_test_split(data_text.X,data_text.Y, test_size=0.20, random_state=42)

In [11]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
          "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
#    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf_p = clf
    pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf_p)])
    pipe.fit(train, labels_train)
    preds_bow = pipe.predict(test)
    print("----------------------------------------------------------------------------------------------")
    print(name,accuracy_score(labels_test, preds_bow))

----------------------------------------------------------------------------------------------
Nearest Neighbors 0.528183716075
----------------------------------------------------------------------------------------------
Linear SVM 0.434237995825
----------------------------------------------------------------------------------------------
RBF SVM 0.615866388309
----------------------------------------------------------------------------------------------
Decision Tree 0.441196938065
----------------------------------------------------------------------------------------------
Random Forest 0.443980514962
----------------------------------------------------------------------------------------------
Neural Net 0.6040361865
----------------------------------------------------------------------------------------------
AdaBoost 0.523312456507


TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [26]:
C_2d_range = [i/10 for i in range(1,11)]
gamma_2d_range = [i for i in range(1,6)]
classifiers = []
for C in C_2d_range:
    for gamma in gamma_2d_range:
        clf_p = SVC(C=C, gamma=gamma)
        pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf_p)])
        pipe.fit(train, labels_train)
        preds_bow = pipe.predict(test)
        classifiers.append((C, gamma, accuracy_score(labels_test, preds_bow)))
        print("----------------------------------------------------------------------------------------------")
        print(C,gamma,accuracy_score(labels_test, preds_bow))

----------------------------------------------------------------------------------------------
0.1 1 0.459290187891
----------------------------------------------------------------------------------------------
0.1 2 0.453027139875
----------------------------------------------------------------------------------------------
0.1 3 0.443980514962
----------------------------------------------------------------------------------------------
0.1 4 0.442588726514
----------------------------------------------------------------------------------------------
0.1 5 0.439805149617
----------------------------------------------------------------------------------------------
0.2 1 0.494780793319
----------------------------------------------------------------------------------------------
0.2 2 0.480167014614
----------------------------------------------------------------------------------------------
0.2 3 0.473903966597
------------------------------------------------------------------------

In [42]:
max_ = 0
for (k, (C, gamma, score)) in enumerate(classifiers):
    if score>max_:
        max_ = score
        C_ = C
        Gamma = gamma
print(max_,C_,Gamma)

0.650661099513 1.3 1


In [43]:
C_2d_range = [1+i/10 for i in range(1,11)]
gamma_2d_range = [i/10 for i in range(1,11)]
classifiers = []
for C in C_2d_range:
    for gamma in gamma_2d_range:
        clf_p = SVC(C=C, gamma=gamma)
        pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf_p)])
        pipe.fit(train, labels_train)
        preds_bow = pipe.predict(test)
        classifiers.append((C, gamma, accuracy_score(labels_test, preds_bow)))
        print("----------------------------------------------------------------------------------------------")
        print(C,gamma,accuracy_score(labels_test, preds_bow))

----------------------------------------------------------------------------------------------
1.1 0.1 0.551844119694
----------------------------------------------------------------------------------------------
1.1 0.2 0.593597773138
----------------------------------------------------------------------------------------------
1.1 0.3 0.610299234516
----------------------------------------------------------------------------------------------
1.1 0.4 0.620737647878
----------------------------------------------------------------------------------------------
1.1 0.5 0.627000695894
----------------------------------------------------------------------------------------------
1.1 0.6 0.633263743911
----------------------------------------------------------------------------------------------
1.1 0.7 0.640222686152
----------------------------------------------------------------------------------------------
1.1 0.8 0.642310368824
--------------------------------------------------------