In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [84]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer,\
                                            TfidfVectorizer

In [33]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [34]:
def save_obj(obj, filename):
    with open(filename + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename):
    with open(filename + '.pkl', 'rb') as f:
        return pickle.load(f)

In [35]:
DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\data\\"
FILE1 = "theonion"
FILE2 = "nottheonion"
FILE3 = "onionheadlines"

# Data Import
We previously cleaned our data, so lets get it:

In [36]:
X_theonion = load_obj(DIR+FILE1+"_df_clean")
X_nottheonion = load_obj(DIR+FILE2+"_df_clean")

In [37]:
X_theonion["is_onion"] = 1
X_nottheonion["is_onion"] = -1

In [41]:
len(X_theonion)

8588

In [42]:
len(X_nottheonion)

419204

## Subsample our data

In [88]:
N=3000
theonion_sample = X_theonion.sample(N)
nottheonion_sample = X_nottheonion.sample(N)
X = pd.concat([theonion_sample, nottheonion_sample])

In [89]:
cvec = CountVectorizer(ngram_range=(1,3))
cvec.fit(theonion_sample["title"])
cvec.transform(theonion_sample["title"])
cvec_df = pd.DataFrame(cvec.transform(theonion_sample["title"]).toarray(),
                      columns=cvec.get_feature_names())

In [90]:
cvec2 = CountVectorizer(ngram_range=(1,3))
cvec2.fit(nottheonion_sample["title"])
cvec2.transform(nottheonion_sample["title"])
cvec2_df = pd.DataFrame(cvec2.transform(nottheonion_sample["title"]).toarray(),
                      columns=cvec2.get_feature_names())

In [91]:
N = 250

In [92]:
theonion_top_words = cvec_df.sum().sort_values(ascending=False).index[:N]

In [93]:
with pd.option_context("display.max_rows", None):
    display(theonion_top_words)

Index(['to', 'of', 'in', 'for', 'the', 'on', 'new', 'with', 'man', 'trump',
       ...
       'trying', 'players', 'court', 'nfl', 'trying to', 'few', 'will be',
       'show', 'releases', 'in pictures week'],
      dtype='object', length=250)

In [94]:
nottheonion_top_words = cvec2_df.sum().sort_values(ascending=False).index[:N]

In [95]:
with pd.option_context("display.max_rows", None):
    display(nottheonion_top_words)

Index(['to', 'in', 'of', 'for', 'the', 'on', 'man', 'after', 'with', 'and',
       ...
       'officer', 'to stop', 'giant', 'sign', 'use', 'tells', 'bans', 'uk',
       'mcdonald', 'my'],
      dtype='object', length=250)

## Words that are unique to `/r/theonion`

In [96]:
[(cvec_df[i].sum(),i) for i in theonion_top_words if i not in nottheonion_top_words]

[(64, 'week'),
 (60, 'nation'),
 (58, 'life'),
 (56, 'americans'),
 (52, 'before'),
 (51, 'going'),
 (50, 'still'),
 (45, 'god'),
 (45, 'last'),
 (44, 'area'),
 (43, 'know'),
 (42, 'every'),
 (39, 'things'),
 (38, 'announces'),
 (38, 'through'),
 (37, 'getting'),
 (37, 'next'),
 (36, 'white house'),
 (36, 'going to'),
 (35, 'work'),
 (35, 'campaign'),
 (34, 'around'),
 (34, 'week of'),
 (34, '2018'),
 (33, 'where'),
 (32, 'really'),
 (32, 'to know'),
 (31, 'hes'),
 (31, 'cant'),
 (30, 'tips'),
 (30, 'tips for'),
 (30, 'other'),
 (30, 'reveals'),
 (29, 'do'),
 (29, 'self'),
 (28, 'way'),
 (28, 'finally'),
 (28, 'nations'),
 (27, 'to know about'),
 (27, 'know about'),
 (27, 'already'),
 (27, 'some'),
 (26, 'good'),
 (26, 'office'),
 (26, 'things to'),
 (26, 'big'),
 (26, 'national'),
 (26, 'see'),
 (25, 'any'),
 (25, 'children'),
 (25, 'unveils'),
 (25, 'enough'),
 (25, 'study finds'),
 (25, 'things to know'),
 (25, 'another'),
 (25, 'how to'),
 (24, 'area man'),
 (24, 'believe'),
 (24, 

## Words that are unique to `/r/nottheonion`

In [97]:
[(cvec2_df[i].sum(),i) for i in nottheonion_top_words if i not in theonion_top_words]

[(74, 'because'),
 (72, 'arrested'),
 (62, 'sex'),
 (55, 'donald'),
 (55, 'gets'),
 (52, 'donald trump'),
 (52, 'florida'),
 (43, 'dead'),
 (43, 'us'),
 (39, 'men'),
 (39, 'claims'),
 (37, 'stop'),
 (37, 'dog'),
 (36, 'charged'),
 (36, 'accused'),
 (36, 'gun'),
 (35, 'fire'),
 (33, 'video'),
 (33, 'arrested for'),
 (32, 'sues'),
 (32, 'found'),
 (31, 'news'),
 (31, 'using'),
 (30, 'porn'),
 (30, 'boy'),
 (30, 'two'),
 (29, 'accused of'),
 (29, 'north'),
 (29, 'face'),
 (28, 'teen'),
 (28, 'post'),
 (28, 'wife'),
 (28, 'baby'),
 (28, 'city'),
 (28, 'don'),
 (28, 'student'),
 (27, 'free'),
 (27, 'cops'),
 (27, 'wants'),
 (26, 'church'),
 (26, 'facebook'),
 (26, 'on the'),
 (26, 'russian'),
 (26, 'himself'),
 (26, 'should'),
 (25, 'head'),
 (25, 'named'),
 (25, 'shot'),
 (25, 'killed'),
 (24, 'russia'),
 (24, 'gay'),
 (24, 'why'),
 (24, 'dies'),
 (24, 'girl'),
 (24, 'murder'),
 (24, 'flight'),
 (24, 'kill'),
 (23, 'attack'),
 (23, 'calls'),
 (23, 'won'),
 (23, 'to the'),
 (23, 'california

## Words that are common to both

In [98]:
[(cvec2_df[i].sum(),i) for i in theonion_top_words if i in nottheonion_top_words]

[(1012, 'to'),
 (527, 'of'),
 (684, 'in'),
 (523, 'for'),
 (501, 'the'),
 (368, 'on'),
 (101, 'new'),
 (263, 'with'),
 (326, 'man'),
 (165, 'trump'),
 (201, 'by'),
 (268, 'after'),
 (178, 'from'),
 (54, 'about'),
 (245, 'and'),
 (179, 'at'),
 (123, 'he'),
 (102, 'be'),
 (81, 'up'),
 (119, 'that'),
 (28, 'report'),
 (40, 'just'),
 (143, 'as'),
 (82, 'out'),
 (144, 'it'),
 (37, 'all'),
 (79, 'into'),
 (124, 'who'),
 (92, 'not'),
 (56, 'will'),
 (30, 'how'),
 (63, 'has'),
 (66, 'you'),
 (24, 'time'),
 (47, 'have'),
 (35, 'day'),
 (30, 'one'),
 (66, 'year'),
 (58, 'this'),
 (152, 'woman'),
 (114, 'over'),
 (51, 'they'),
 (57, 'can'),
 (43, 'off'),
 (231, 'is'),
 (37, 'down'),
 (54, 'him'),
 (30, 'if'),
 (22, 'study'),
 (39, 'first'),
 (158, 'his'),
 (32, 'house'),
 (39, 'get'),
 (36, 'more'),
 (59, 'old'),
 (32, 'only'),
 (35, 'your'),
 (52, 'people'),
 (38, 'white'),
 (29, 'could'),
 (21, 'finds'),
 (42, 'like'),
 (35, 'back'),
 (50, 'no'),
 (53, 'their'),
 (24, 'what'),
 (138, 'says'),
 

# A Logistic Regression

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X["title"], X["is_onion"])

In [105]:
pipe = Pipeline([
    ("cvec", CountVectorizer()),
    ("logreg", LogisticRegression())
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8013333333333333

In [106]:
pipe = Pipeline([
    ("cvec", CountVectorizer(stop_words=[(cvec2_df[i].sum(),i) for i in theonion_top_words if i in nottheonion_top_words])),
    ("logreg", LogisticRegression())
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8013333333333333

In [107]:
pipe = Pipeline([
    ("cvec", CountVectorizer(stop_words=None)),
    ("mnb", MultinomialNB())
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.808

In [108]:
pipe = Pipeline([
    ("cvec", CountVectorizer(stop_words=[(cvec2_df[i].sum(),i) for i in theonion_top_words if i in nottheonion_top_words])),
    ("mnb", MultinomialNB())
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.808

In [103]:
# logreg_list = [LogisticRegression(C=i) for i in np.logspace(-2, 2, 100)]

# for logreg in logreg_list:
#     logreg.fit(X_train, y_train)
#     break