# SVM
SVMs are supposed to be realtively well-performing on high-dimensional data. We'll fit a few and see how we feel.

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [16]:
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB
# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer,\
                                            TfidfVectorizer
from sklearn.svm import SVC

In [17]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [18]:
def save_obj(obj, filename):
    with open(filename + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename):
    with open(filename + '.pkl', 'rb') as f:
        return pickle.load(f)

In [19]:
DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\data\\"
FILE1 = "theonion"
FILE2 = "nottheonion"
FILE3 = "onionheadlines"

# Data Import

In [6]:
X_theonion = load_obj(DIR+FILE1+"_df_clean")
X_nottheonion = load_obj(DIR+FILE2+"_df_clean")

In [7]:
X_theonion["is_onion"] = 1
X_nottheonion["is_onion"] = -1

## Generating our samples and holdout

In [8]:
N=4000
X_theonion_shuffled = X_theonion.sample(len(X_theonion))
theonion_sample = X_theonion_shuffled.head(N)
theonion_holdout = X_theonion_shuffled.tail(len(X_theonion_shuffled) - N)

X_nottheonion_shuffled = X_nottheonion.sample(len(X_nottheonion))
nottheonion_sample = X_nottheonion_shuffled.head(N)
nottheonion_holdout = X_nottheonion_shuffled.tail(len(X_nottheonion_shuffled)-N)
X_sample = pd.concat([theonion_sample, nottheonion_sample])
X = pd.concat([X_theonion, X_nottheonion])

## Preparing our stop-words list

In [11]:
N = 250

In [12]:
theonion_top_words = cvec_df.sum().sort_values(ascending=False).index[:N]

In [13]:
nottheonion_top_words = cvec2_df.sum().sort_values(ascending=False).index[:N]

# TF-IDF + SVC

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_sample["title"], X_sample["is_onion"])
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC())])
     
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', deg

In [10]:
display(pipe.score(X_train, y_train))
display(pipe.score(X_test, y_test))
display(pipe.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(pipe.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9968333333333333

0.7965

0.8040540540540541

0.8117335093110857

In [12]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(strip_accents="unicode")),
#     ("norm", Normalizer()),
    ("svc", SVC())
])

rand_search = RandomizedSearchCV(pipe,
                                 n_jobs=-2,
                                 n_iter=50,
                                 param_distributions={
#     "logreg__C": np.logspace(-1,1,100),
    "tfidf__max_df": [.99, 1],
    "tfidf__min_df": [0, 0.01],
#     "tfidf__stop_words": [None, [i for i in theonion_top_words if i in nottheonion_top_words]],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [None, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000]
})
rand_search.fit(X_train, y_train);

In [13]:
rand_search.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.99,
                                 max_features=20000, min_df=0,
                                 ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0

In [14]:
display(rand_search.score(X_train, y_train))
display(rand_search.score(X_test, y_test))
display(rand_search.score(theonion_holdout["title"], theonion_holdout["is_onion"]))
display(rand_search.score(nottheonion_holdout["title"], nottheonion_holdout["is_onion"]))

0.9965

0.796

0.8038360941586749

0.8118322559512914

Lord would you look at those execution times! 5 minutes, on average. Presumably if I reduced my feature space SVM would be happier.