In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("a2a_train_final.tsv", sep="\t") 
#data = pd.read_csv("a2a_train_round1.tsv", sep="\t") 
data.columns = ["Class", "Comment"]

In [3]:
# the actual classification algorithm
from sklearn.svm import LinearSVC

# for converting training and test datasets into matrices
# TfidfVectorizer does this specifically for documents
from sklearn.feature_extraction.text import TfidfVectorizer

# for bundling the vectorizer and the classifier as a single "package"
from sklearn.pipeline import make_pipeline

# for splitting the dataset into training and test sets 
from sklearn.model_selection import train_test_split

# for evaluating the quality of the classifier
from sklearn.metrics import accuracy_score

In [4]:
data

Unnamed: 0,Class,Comment
0,0/0,Being a member of the European Union is a bit ...
1,0/0,Brexit is bad. Immigrants make Britain great. ...
2,0/0,Britain is basically Pompeii if the Pompeii ha...
3,1/1,Britain's exit is a huge blow to the dream of ...
4,1/-1,"Bye, Bye EU, Bye, Bye...Fireworks are going of..."
...,...,...
13511,1/1,‚ÄùFake news! UK will prosper as soon as Bojo si...
13512,0/1,‚Ä† I know exactly how it will end. ‚Ä†
13513,1/1,üá¨üáß Hard Brexit all the way üá¨üáß
13514,0/0,"üòÜ it's funny, when the brits sees the raise in..."


In [6]:
from collections import Counter
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

33781

# 1. Text data preprocessing

# 1.1 Drop inconsistent annotation 

Drop those comments with inconsistent annotation. 81.6% comments are filtered out after this step, containing 50.4% pro-brexit(1) and 49.6% anti-brexit(0).

In [138]:
#def find_majority(k):
#    myMap = {}
#    maximum = ( '', 0 ) # (occurring element, occurrences)
#    for n in k:
#        if n in myMap: myMap[n] += 1
#        else: myMap[n] = 1
#
#        # Keep track of maximum on the go
#        if myMap[n] > maximum[1]: maximum = (n,myMap[n])
#
#    return maximum

for index, row in data.iterrows():
    arr = row["Class"].split("/")
    if(len(set(arr)) > 1):
        #print(arr)
        data = data.drop(index, axis=0)
    #else:
        #data.loc[index]["Class"]=arr[0]
#data.reset_index()

#for index, row in data.iterrows():
#    arr = row["Class"].split("/")
#    #if(len(set(arr)) > 1):
#    if(arr.count('1') > arr.count('0')):
#        #print(arr)
#        data.set_value(index, "Class", 1)
#    elif(arr.count('0') > arr.count('1')):
#        data.set_value(index, "Class", 0)
#    else:
#        data.set_value(index, "Class", 1)
#        #data = data.drop(index, axis=0)
#data = data.reset_index()
data

Unnamed: 0,Class,Comment
0,0/0,Being a member of the European Union is a bit ...
1,0/0,Brexit is bad. Immigrants make Britain great. ...
2,0/0,Britain is basically Pompeii if the Pompeii ha...
3,1/1,Britain's exit is a huge blow to the dream of ...
6,1/1,"Death to the EU, Death to the EU!"
...,...,...
13510,1/1,‚Äúwe have made our choice‚Äù
13511,1/1,‚ÄùFake news! UK will prosper as soon as Bojo si...
13513,1/1,üá¨üáß Hard Brexit all the way üá¨üáß
13514,0/0,"üòÜ it's funny, when the brits sees the raise in..."


In [139]:
from collections import Counter
cnt=Counter(data["Class"])
print("pro-brexit = %0.4f and anti-brexit=%0.4f" %(cnt['0']/(cnt['0']+cnt['1']), cnt['1']/(cnt['0']+cnt['1'])))


#test = Counter(" ".join(data[data["Class"] == '1']["Comment"]).split()).most_common()[:-1000-1:-1]
#test2 = Counter(" ".join(data[data["Class"] == '0']["Comment"]).split()).most_common()[:-1000-1:-1]
#test = Counter(" ".join(data["Comment"]).split()).most_common()[:-1000-1:-1]
test = Counter(" ".join(data["Comment"]).split())

pro-brexit = 0.4969 and anti-brexit=0.5031


In [140]:
len(test.keys())

29687

# 1.2 Lowercasing

In [142]:
data['Comment']=data['Comment'].str.lower()
data

Unnamed: 0,Class,Comment
0,0/0,being a member of the european union is a bit ...
1,0/0,brexit is bad. immigrants make britain great. ...
2,0/0,britain is basically pompeii if the pompeii ha...
3,1/1,britain's exit is a huge blow to the dream of ...
6,1/1,"death to the eu, death to the eu!"
...,...,...
13510,1/1,‚Äúwe have made our choice‚Äù
13511,1/1,‚Äùfake news! uk will prosper as soon as bojo si...
13513,1/1,üá¨üáß hard brexit all the way üá¨üáß
13514,0/0,"üòÜ it's funny, when the brits sees the raise in..."


In [147]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

26031

# 1.3 Tokenization

Tokenization is a step which splits longer strings of text into smaller pieces, or tokens. 

In [148]:
import re, string, unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

for index, row in data.iterrows():
       data.loc[index]['Comment'] = " ".join(nltk.word_tokenize(data['Comment'][index]))
data

Unnamed: 0,Class,Comment
0,0/0,being a member of the european union is a bit ...
1,0/0,brexit is bad . immigrants make britain great ...
2,0/0,britain is basically pompeii if the pompeii ha...
3,1/1,britain 's exit is a huge blow to the dream of...
6,1/1,"death to the eu , death to the eu !"
...,...,...
13510,1/1,‚Äú we have made our choice ‚Äù
13511,1/1,‚Äù fake news ! uk will prosper as soon as bojo ...
13513,1/1,üá¨üáß hard brexit all the way üá¨üáß
13514,0/0,"üòÜ it 's funny , when the brits sees the raise ..."


In [149]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

16071

# 1.4 Remove Punctuation

In [168]:
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

#print(index)
#data['Comment'][13514].split()
#" ".join(remove_punctuation(data['Comment'][13514].split()))
for index, row in data.iterrows():
    data.loc[index]['Comment'] = " ".join(remove_punctuation(data['Comment'][index].split()))
data  

Unnamed: 0,Class,Comment
0,0/0,being a member of the european union is a bit ...
1,0/0,brexit is bad immigrants make britain great th...
2,0/0,britain is basically pompeii if the pompeii ha...
3,1/1,britain s exit is a huge blow to the dream of ...
6,1/1,death to the eu death to the eu
...,...,...
13510,1/1,we have made our choice
13511,1/1,fake news uk will prosper as soon as bojo sign...
13513,1/1,hard brexit all the way
13514,0/0,it s funny when the brits sees the raise in ta...


In [169]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

15105

# 1.5 replace_numbers
Replace all interger occurrences in list of tokenized words with textual representation

In [171]:
import inflect
def replace_numbers(words):
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

for index, row in data.iterrows():
    data.loc[index]['Comment'] = " ".join(replace_numbers(data['Comment'][index].split()))
data  


Unnamed: 0,Class,Comment
0,0/0,being a member of the european union is a bit ...
1,0/0,brexit is bad immigrants make britain great th...
2,0/0,britain is basically pompeii if the pompeii ha...
3,1/1,britain s exit is a huge blow to the dream of ...
6,1/1,death to the eu death to the eu
...,...,...
13510,1/1,we have made our choice
13511,1/1,fake news uk will prosper as soon as bojo sign...
13513,1/1,hard brexit all the way
13514,0/0,it s funny when the brits sees the raise in ta...


In [173]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

14905

# 1.6 remove_non_ascii

In [182]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words
for index, row in data.iterrows():
    data.loc[index]['Comment'] = " ".join(remove_non_ascii(data['Comment'][index].split()))
data  



Unnamed: 0,Class,Comment
0,0/0,being a member of the european union is a bit ...
1,0/0,brexit is bad immigrants make britain great th...
2,0/0,britain is basically pompeii if the pompeii ha...
3,1/1,britain s exit is a huge blow to the dream of ...
6,1/1,death to the eu death to the eu
...,...,...
13510,1/1,we have made our choice
13511,1/1,fake news uk will prosper as soon as bojo sign...
13513,1/1,hard brexit all the way
13514,0/0,it s funny when the brits sees the raise in ta...


In [183]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

14897

# 1.7 Remove stop words

In [186]:
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

for index, row in data.iterrows():
    data.loc[index]['Comment'] = " ".join(remove_stopwords(data['Comment'][index].split()))
data

Unnamed: 0,Class,Comment
0,0/0,member european union bit like going sandwich ...
1,0/0,brexit bad immigrants make britain great also ...
2,0/0,britain basically pompeii pompeii voted volcan...
3,1/1,britain exit huge blow dream united europe end...
6,1/1,death eu death eu
...,...,...
13510,1/1,made choice
13511,1/1,fake news uk prosper soon bojo signes fantasti...
13513,1/1,hard brexit way
13514,0/0,funny brits sees raise taxes commercial negoti...


In [187]:
test = Counter(" ".join(data["Comment"]).split())
len(test.keys())

14749

# 1.8 Stemming and Lemmatization

In [None]:
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

for index, row in data.iterrows():
    data.loc[index]['Comment'] = " ".join(replace_numbers(data['Comment'][index].split()))
data

In [174]:
print([el for el in test.keys() if test[el] >= 15])

['being', 'a', 'member', 'of', 'the', 'european', 'union', 'is', 'bit', 'like', 'going', 'to', 'and', 'three', 'with', 'five', 'getting', 'back', 'over', 'one', 'thousand', 'in', 'change', 'brexit', 'bad', 'immigrants', 'make', 'britain', 'great', 'they', 'also', 'your', 'food', 'london', 'anti', 'on', 'bill', 'basically', 'if', 'had', 'voted', 'for', 'i', 'm', 'dead', 's', 'exit', 'huge', 'blow', 'dream', 'united', 'europe', 'no', 'it', 'end', 'an', 'globalist', 'system', 'eu', 'its', 'power', 'death', 'lost', 'more', 'now', 'has', 'decided', 're', 'happy', 'leave', 'as', 'long', 'can', 'keep', 'our', 'cash', 'use', 'us', 'military', 'willing', 'hand', 'down', 'rules', 'forever', 'by', 'any', 'form', 'british', 'was', 'major', 'still', 'world', 'what', 'me', 'march', 'two', 'nineteen', 'set', 'deal', 'worked', 'does', 'not', 'serve', 'but', 'brussels', 'ca', 'nt', 'begin', 'we', 'public', 'are', 'upon', 'politicians', 'at', 'next', 'general', 'election', 'sell', 'short', 'will', 'be',

In [286]:
#test

In [55]:
#test2

In [56]:
stop_words = []
for x,y in test:
    stop_words.append(x)

ValueError: too many values to unpack (expected 2)

In [49]:
stop_words

NameError: name 'stop_words' is not defined

In [54]:
custom_stop_words = frozenset(stop_words)

In [307]:
#top_words
len(top_words)

5000

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
#stop_words = text.ENGLISH_STOP_WORDS.union(custom_stop_words)
stop_words = text.ENGLISH_STOP_WORDS.union(top_words)

count = CountVectorizer()
bag = count.fit_transform(list(data["Comment"]))

NameError: name 'top_words' is not defined

In [239]:
d = count.vocabulary_
#sorted(d.items(), key=lambda x: x[1], reverse=True)[:50]

AttributeError: 'list' object has no attribute 'split'

In [335]:
(data[data["Class"] == '1']).count()

Class      5695
Comment    5695
dtype: int64

In [9]:
data["Comment"] = data["Comment"].str.replace('[^\w\s]','')

In [95]:
data

Unnamed: 0,Class,Comment
0,0,Being a member of the European Union is a bit ...
1,0,Brexit is bad. Immigrants make Britain great. ...
2,0,Britain is basically Pompeii if the Pompeii ha...
3,1,Britain's exit is a huge blow to the dream of ...
6,1,"Death to the EU, Death to the EU!"
...,...,...
13510,1,‚Äúwe have made our choice‚Äù
13511,1,‚ÄùFake news! UK will prosper as soon as Bojo si...
13513,1,üá¨üáß Hard Brexit all the way üá¨üáß
13514,0,"üòÜ it's funny, when the brits sees the raise in..."


In [82]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

NameError: name 're' is not defined

In [688]:
Y = data["Class"]
X = data["Comment"]

In [689]:
Xtrain, Xeval, Ytrain, Yeval = train_test_split(X, Y, test_size=0.2)

In [690]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer
v = TfidfVectorizer(max_features=500, strip_accents="unicode", stop_words=stop_words, lowercase=True)
#v = CountVectorizer(stop_words=stop_words, lowercase=True)
X = v.fit_transform(X)
Xtrain = v.transform(Xtrain)
Xeval = v.transform(Xeval)
Ytrain=Ytrain.astype('int')
Yeval=Yeval.astype('int')
Y=Y.astype('int')

#print(vectorizer.get_feature_names())

print(Xtrain.shape)

  'stop_words.' % sorted(inconsistent))


(8803, 500)


In [691]:
from sklearn.svm import SVC
svc = SVC(random_state=0, tol=1e-5)
svc.fit(Xtrain, Ytrain)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=1e-05,
    verbose=False)

In [692]:
from sklearn.metrics import accuracy_score
accuracy_score(svc.predict(Xeval), Yeval)

0.756928668786915

In [693]:
from sklearn.svm import LinearSVC
svc = LinearSVC(random_state=1, tol=1e-5, C=0.25)
#svc.fit(Xtrain, Ytrain)

In [694]:
from sklearn.model_selection import cross_val_score
#accuracy_score(svc.predict(Xeval), Yeval)
cross_val_score(svc, X, Y)

array([0.7678328 , 0.72875965, 0.72648796, 0.73739209, 0.74409091])

In [626]:
from sklearn.metrics import confusion_matrix
confusion_matrix(svc.predict(Xeval), Yeval)

array([[828, 230],
       [218, 925]])

In [333]:
#sorted(svc.coef_, reverse=True)
len(svc.coef_[0])

8997

In [468]:
from sklearn.metrics import accuracy_score
accuracy_score(svc.predict(Xeval), Yeval)

0.7991821899136756

In [317]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':np.arange(0.01,100,10)}
svc = GridSearchCV(LinearSVC(),param_grid,cv=5,return_train_score=True)
svc.fit(Xtrain,Ytrain)



GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.000e-02, 1.001e+01, 2.001e+01, 3.001e+01, 4.001e+01, 5.001e+01,
       6.001e+01, 7.001e+01, 8.001e+01, 9.001e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [318]:
print(svc.best_params_)

{'C': 0.01}


In [579]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier()
dummy.fit(Xtrain, Ytrain)



DummyClassifier(constant=None, random_state=None, strategy='warn')

In [580]:
accuracy_score(dummy.predict(Xeval), Yeval)

0.49977283053157656

In [581]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(Xtrain, Ytrain)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [582]:
accuracy_score(gbc.predict(Xeval), Yeval)

0.7233075874602454

In [583]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(Xtrain, Ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [584]:
accuracy_score(nb.predict(Xeval), Yeval)

0.7905497501135847

In [422]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6000)
knn.fit(Xtrain, Ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6000, p=2,
                     weights='uniform')

In [423]:
accuracy_score(knn.predict(Xeval), Yeval)

0.6106315311222171

In [590]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [591]:
lr.fit(Xtrain, Ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [592]:
accuracy_score(lr.predict(Xeval), Yeval)

0.7960018173557474

In [504]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=100, n_estimators=500)
rf.fit(Xtrain, Ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [505]:
rf.feature_names_

AttributeError: 'RandomForestClassifier' object has no attribute 'feature_names_'

In [397]:
accuracy_score(rf.predict(Xeval), Yeval)

0.7737392094502499

In [507]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=80)
tree.fit(Xtrain, Ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=80, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [729]:
#sorted(tree.feature_importances_, reverse=True)
#tree.feature_names_
#en(tree.feature_importances_)

In [388]:
accuracy_score(tree.predict(Xeval), Yeval)

0.6855974557019536

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(50, 50, 50), random_state=1)
mlp.fit(Xtrain, Ytrain)

In [746]:
accuracy_score(mlp.predict(Xeval), Yeval)

0.761017719218537

In [561]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=25, n_estimators=100)
clf.fit(Xtrain, Ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [562]:
importances = clf.feature_importances_
indicies = np.argsort(importances)
print(indicies)

[   0 8325 8327 ... 7296 4573 1767]


In [563]:
feature_names = v.get_feature_names()
top_words = []

for i in range(2000):
    top_words.append(feature_names[indicies[i]])
top_words

['00',
 'nationality',
 'nationals',
 'nationwide',
 'native',
 'naturally',
 'naughty',
 'nauseous',
 'naval',
 'navel',
 'navigate',
 'navigation',
 'naysayers',
 'nazi',
 'nazis',
 'na√Øve',
 'ne',
 'neanderthals',
 'nationalities',
 'nearer',
 'nationalists',
 'nationalisation',
 'nafta',
 'nah',
 'nailed',
 'nails',
 'naivety',
 'naked',
 'names',
 'nancy',
 'nannies',
 'napoleon',
 'narcisists',
 'narcissists',
 'narcotic',
 'narratives',
 'nastier',
 'nasties',
 'nastiest',
 'nationalise',
 'nears',
 'necessary',
 'neck',
 'neighbourhood',
 'neighbouring',
 'neil',
 'neill',
 'nemesis',
 'nemo',
 'neo',
 'neoliberal',
 'neoliberalism',
 'nephews',
 'nepotism',
 'nerve',
 'nerves',
 'nervewracking',
 'ness',
 'nest',
 'netflix',
 'neighbour',
 'neighbors',
 'neighbor',
 'neigbours',
 'necks',
 'necrosis',
 'nedxit',
 'needing',
 'needles',
 'nefarious',
 'negation',
 'negatives',
 'myths',
 'negativity',
 'neglect',
 'negligible',
 'negociate',
 'negotations',
 'negotiable',
 'ne

In [362]:
#from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

m = LinearSVC()
m.fit(Xtrain, Ytrain)

# The estimated coefficients will all be around 1:
#indicies = np.flip(np.argsort(np.abs(m.coef_)))[0]
indicies = np.argsort(np.abs(m.coef_))[0]

# Those values, however, will show that the second parameter
# is more influential
print(np.flip(np.argsort(np.std(Xtrain.todense()) * m.coef_))[0])

[4465 4453 1138 ...  966 4248  772]


In [365]:
feature_names = v.get_feature_names()
top_words = []

for i in range(4000):
    top_words.append(feature_names[indicies[i]])
#top_words