In [1]:
import pickle

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as x
import xgboost.sklearn as xgb
from matplotlib import style, rcParams
from nltk.corpus import stopwords
from sklearn.decomposition import NMF, PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline

import nlp_pipeline as nlp

# Global plot styling: ggplot theme plus font-size overrides applied in one call.
style.use('ggplot')
rcParams.update({
    'font.size': 14,
    'legend.fontsize': 'small',
    'figure.titlesize': 'large',
})

In [2]:
# Read the training set, drop every row whose label is 4, and keep the
# remaining labels as the modelling target.
df = pd.read_csv('../data/train.csv')
df_5class = df.loc[df['Labels'].ne(4)].copy()
y = df_5class['Labels']

In [3]:
# Turn each raw text entry into its token list via the project helper,
# then preview the first ten results.
corpus = df_5class['Text'].tolist()
bow = list(map(nlp.extract_bow_from_raw_text, corpus))
bow[:10]

[['say',
  'annies',
  'list',
  'political',
  'group',
  'support',
  'third-trimester',
  'abortion',
  'demand'],
 ['decline',
  'coal',
  'start',
  'started',
  'natural',
  'gas',
  'took',
  'started',
  'begin',
  'president',
  'george',
  'w.',
  'bush',
  'administration'],
 ['hillary',
  'clinton',
  'agrees',
  'john',
  'mccain',
  '``',
  'voting',
  'give',
  'george',
  'bush',
  'benefit',
  'doubt',
  'iran',
  "''"],
 ['health',
  'care',
  'reform',
  'legislation',
  'likely',
  'mandate',
  'free',
  'sex',
  'change',
  'surgery'],
 ['economic', 'turnaround', 'started', 'end', 'term'],
 ['chicago',
  'bear',
  'starting',
  'quarterback',
  'last',
  '10',
  'year',
  'total',
  'number',
  'tenured',
  'uw',
  'faculty',
  'fired',
  'last',
  'two',
  'decade'],
 ['jim', 'dunnam', 'lived', 'district', 'represents', 'year'],
 ["'m",
  'person',
  'stage',
  'worked',
  'actively',
  'last',
  'year',
  'passing',
  'along',
  'rus',
  'feingold',
  'toughest',

In [None]:
# Dimensionality of each word vector; reused when building document vectors.
size = 100

# Train Word2Vec on the tokenised corpus, keeping every token (min_count=1).
# `seed` alone does not make training repeatable: gensim also needs a single
# worker thread (and a fixed PYTHONHASHSEED) to give identical vectors from
# run to run, hence workers=1.
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4); it is kept because this notebook also uses the 3.x `wv.vocab`
# attribute -- confirm the pinned gensim version.
model = gensim.models.Word2Vec(bow, min_count=1,
                               size=size, window=5, seed=2,
                               workers=1)

In [None]:
# Display the vocabulary mapping held by the trained model's word vectors.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute -- confirm the installed version.
model.wv.vocab

In [None]:
# Look up the learned vector for the token 'health'.
model.wv['health']

In [None]:
# One row per document: the element-wise sum of the vectors of its tokens.
# The float64 zero start value means a document with no tokens stays all-zero
# and the accumulation is carried out in float64, token by token.
corpus_vec = np.zeros((len(bow), size))

for doc_idx, tokens in enumerate(bow):
    corpus_vec[doc_idx] = sum((model.wv[token] for token in tokens), np.zeros(size))

In [None]:
# Shortest and longest document, measured in tokens.
doc_lengths = [len(tokens) for tokens in bow]
min(doc_lengths), max(doc_lengths)

In [None]:
# Median number of tokens per document.
lst = list(map(len, bow))
np.median(lst)

In [None]:
# Display the document-vector matrix (one row per entry of `bow`).
corpus_vec


In [21]:
# Hold out 20% of the documents, stratified by label, for evaluation.
bow_train, bow_test, y_train, y_test = train_test_split(bow, y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=42)

# Sweep the embedding dimensionality and record the weighted F1 of a random
# forest on the held-out split. Both `sizes` and `scores` must be populated
# here: the plotting cell that follows reads them and would otherwise fail
# with a NameError on a fresh kernel.
sizes = range(300, 401, 10)
scores = []
# The loop variable is deliberately not called `size`, so the global `size`
# used by the Word2Vec cells is not overwritten.
for n_dims in sizes:
    sweep_train, sweep_test = nlp.word_embed(bow_train, bow_test, n_dims)
    # Fixed random_state so the score curve is reproducible between runs.
    forest = RandomForestClassifier(random_state=42).fit(sweep_train, y_train)
    scores.append(f1_score(y_test, forest.predict(sweep_test), average='weighted'))

In [None]:
# Line chart of held-out score against embedding size, written to disk.
fig, ax = plt.subplots()
ax.plot(sizes, scores)
ax.set(xlabel='Sizes', ylabel='Test Score', title="RF scores by Vector Sizes")
plt.savefig('../images/wv_size_tuning2.png');

In [None]:
# Display the list of recorded scores.
scores

In [48]:
# Embed both splits with the project helper (third argument 50), then compare
# training document 1435 against every training document.
# `cosine_similarity` comes from the notebook's top-level import block.
vec_train, vec_test = nlp.word_embed(bow_train, bow_test, 50)

# reshape(1, -1) turns the single row into the 2-D input the function expects.
cosim = cosine_similarity(vec_train[1435].reshape(1, -1), vec_train)

# Smallest similarity between the query document and any training document.
np.min(cosim)



0.977076275982616

In [49]:
# Index (into the flattened similarity array) of the smallest similarity.
np.argmin(cosim)

393

In [50]:
# Show the tokens of training document 1435, then those of the training
# document with the smallest similarity in `cosim`.
query_tokens = bow_train[1435]
print(query_tokens)
least_similar_tokens = bow_train[np.argmin(cosim)]
print(least_similar_tokens)

['say', 'rob', 'portman', 'even', 'voted', 'allow', 'people', 'terrorism', 'watch', 'list', 'buy', 'gun']
['socialized', 'medicine']


In [51]:
# Repeat the comparison with training document 4781 as the query row.
# (The bare `vec_train.shape` and mid-cell `np.min(cosim)` expressions were
# dropped: neither was the cell's last expression, so nothing was displayed.)
cosim = cosine_similarity(vec_train[4781].reshape(1, -1), vec_train)

# Tokens of the query document, then of its least similar training document.
print(bow_train[4781])
print(bow_train[np.argmin(cosim)])


['nearly', '180,000', 'illegal', 'immigrant', 'criminal', 'record', 'ordered', 'deported', 'country', 'tonight', 'roaming', 'free', 'threaten', 'peaceful', 'citizen']
['socialized', 'medicine']


In [52]:
# Smallest value in the current similarity array.
np.min(cosim)

0.9782040707898372

In [53]:
# Display the current similarity array.
cosim

array([[0.99966784, 0.999687  , 0.99989965, ..., 0.99685302, 0.99775535,
        0.99991449]])

In [56]:
# (row, column) positions in `cosim` whose similarity is below 0.99.
np.argwhere(cosim < 0.99)

array([[   0,  393],
       [   0,  404],
       [   0, 6134]])

In [55]:
# Print the token lists of training documents 393, 404 and 6134, in that order.
for doc_idx in (393, 404, 6134):
    print(bow_train[doc_idx])

['socialized', 'medicine']
['definition', 'taker', 'maker']
['torture']


In [24]:
# Sanity check: number of documents and labels in the train and test splits.
tuple(len(part) for part in (bow_train, bow_test, y_train, y_test))

(7520, 1881, 7520, 1881)

In [57]:
# Display the training-split document vectors.
vec_train

array([[-0.771783  ,  0.38459969,  0.45658033, ...,  0.39073921,
        -0.06967998, -0.801142  ],
       [-0.16903634,  0.08255738,  0.09142965, ...,  0.0862899 ,
        -0.02181181, -0.16743236],
       [-0.39466765,  0.1946841 ,  0.22455869, ...,  0.20796908,
        -0.04206179, -0.4047093 ],
       ...,
       [-0.62234215,  0.30225181,  0.38838601, ...,  0.2746676 ,
        -0.01648996, -0.64899872],
       [-0.71058397,  0.34122111,  0.4392433 , ...,  0.32818729,
        -0.02748924, -0.73743534],
       [-0.67034803,  0.33753395,  0.38848971, ...,  0.3595038 ,
        -0.07204736, -0.69507749]])

In [60]:
# Repeat the comparison with training document 393 as the query row.
# (The bare `vec_train.shape` and mid-cell `np.min(cosim)` expressions were
# dropped: neither was the cell's last expression, so nothing was displayed.)
cosim = cosine_similarity(vec_train[393].reshape(1, -1), vec_train)

# Tokens of the query document, then of its least similar training document.
print(bow_train[393])
print(bow_train[np.argmin(cosim)])


['socialized', 'medicine']
['definition', 'taker', 'maker']


In [61]:
# Smallest value in the current similarity array.
np.min(cosim)

0.9648142369396984

In [62]:
# Display the current similarity array.
cosim

array([[0.97667493, 0.97566719, 0.97781566, ..., 0.97197083, 0.97322621,
        0.97758678]])

In [63]:
# Display the vector row for training document 393.
vec_train[393]

array([-0.03144798,  0.01419266,  0.01512869, -0.00390199, -0.00676349,
        0.00603408, -0.02803763,  0.00202296,  0.00964426,  0.02018273,
        0.00612537, -0.03439556,  0.00569673,  0.00765855,  0.03869031,
       -0.02984392, -0.01036862,  0.02628409,  0.0440286 , -0.01699016,
       -0.03131832, -0.04742092, -0.03998781, -0.01508568, -0.01709605,
        0.01426086,  0.00622103,  0.00468528,  0.02227516,  0.00899867,
        0.02053362, -0.00054347, -0.00083126, -0.02419918, -0.00610747,
        0.00975832,  0.0085134 ,  0.0292564 , -0.0147164 , -0.00588328,
       -0.02473221,  0.02414072, -0.00642683, -0.03757386, -0.0114057 ,
       -0.0088074 , -0.01530516,  0.02356858, -0.00444644, -0.02766982])

In [64]:
# Display the vector row for training document 4781.
vec_train[4781]

array([-0.40484088,  0.20406661,  0.23171003,  0.04395007, -0.02542637,
       -0.03115412, -0.26059326,  0.02403478,  0.11233664,  0.26499851,
        0.00263696, -0.47280949,  0.02473226,  0.04285309,  0.45731367,
       -0.29291791, -0.14167692,  0.29921088,  0.56122067, -0.26750691,
       -0.46821912, -0.61623138, -0.48988023, -0.11955904, -0.31794851,
        0.16421704,  0.12861977,  0.11427963,  0.29194579,  0.11079982,
        0.22657431, -0.12765786,  0.00989515, -0.26325847, -0.04754418,
        0.16013421,  0.10301451,  0.29159822, -0.11391065, -0.0085085 ,
       -0.45504891,  0.30443351, -0.02504027, -0.47038468, -0.14851224,
       -0.15607522, -0.19667819,  0.22099297, -0.04509797, -0.41201732])