In [1]:
#https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewzimolzak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble


In [290]:
df = pd.read_csv("WikiLarge_Train.csv")

In [141]:
df.sample(15)

Unnamed: 0,original_text,label
252983,bar :1981 at :29985 fontsize : XS text : 29985...,0
71873,They subsequently studied with the Alban Berg ...,1
373203,Construction of the World Trade Center involve...,0
27659,Aubigny is a commune in the Calvados departmen...,1
124976,Carcans is a commune in the Gironde department...,1
349395,"Users assign themselves a user name , log into...",0
3628,1965 & ndash ; Vietnam War : Just miles from D...,1
257252,Candice Michelle Beckman-Ehrlich -LRB- born Se...,0
335170,Noircourt is a commune . It is found in the re...,0
390171,It is also known as La Charte . This was the s...,0


In [142]:
df['label'].value_counts()

0    208384
1    208384
Name: label, dtype: int64

In [346]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(df[["original_text"]],df["label"], \
                                                                   random_state = 0)

In [7]:
train_y

296266    0
354373    0
156421    1
366119    0
193549    1
         ..
359783    0
358083    0
152315    1
117952    1
305711    0
Name: label, Length: 312576, dtype: int64

In [8]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [9]:
len(test_y)

104192

# Word characters, removed stop words

In [10]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
count_vect.fit(train_X)

CountVectorizer(stop_words='english', token_pattern='\\w{1,}')

In [11]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

In [12]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
tfidf_vect.fit(train_X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)


In [13]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', \
                                   ngram_range=(2,3), stop_words="english")
tfidf_vect_ngram.fit(train_X)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

In [14]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', \
                                         ngram_range=(2,3), stop_words="english")
tfidf_vect_ngram_chars.fit(train_X)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 



In [15]:
multi_nb = naive_bayes.MultinomialNB()

In [16]:
multi_nb.fit(xtrain_count,train_y)

MultinomialNB()

In [17]:
prediction_1 = multi_nb.predict(xtest_count)

In [18]:
metrics.accuracy_score(prediction_1, test_y)

#0.5757159858722358

0.5757159858722358

In [19]:
multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
metrics.accuracy_score(prediction_2, test_y)

#0.5789216062653563

0.5105574324324325

In [20]:
multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
metrics.accuracy_score(prediction_3, test_y)

#0.6339066339066339

0.6310561271498771

In [21]:
multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
metrics.accuracy_score(prediction_4, test_y)

#0.5122274262899262

0.5769828777641277

In [22]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)

In [24]:
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6581503378378378
#0.4988770730958231
#0.4982532248157248

0.6581503378378378
0.4988770730958231
0.4982532248157248


In [25]:
clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)

lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)

print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6734490171990172
#0.4988770730958231
#0.4982532248157248

0.6628339987714987
0.4988770730958231
0.4982532248157248


In [26]:
clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)

lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)

print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.5879434121621622
#0.4988770730958231
#0.4982532248157248

0.5452721898034398
0.4988770730958231
0.4982532248157248


In [27]:
clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.7056395884520884
#0.4988770730958231
#0.4982532248157248

0.706512976044226
0.4988770730958231
0.4982532248157248


In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

In [29]:
rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

In [30]:
print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

#0.6214296683046683
#0.6199996160933661
#0.5333998771498771
#0.6966657708845209

0.6214296683046683
0.6262189035626535
0.5389281326781327
0.6858204084766585


# White space, removed stop words

In [31]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*', stop_words="english")
count_vect.fit(train_X)
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', stop_words="english")
tfidf_vect.fit(train_X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3), stop_words="english")
tfidf_vect_ngram.fit(train_X)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3), stop_words="english")
tfidf_vect_ngram_chars.fit(train_X)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 



In [32]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

#0.594700168918919
#0.6422182125307125
#0.6339066339066339
#0.4944045608108108

0.594700168918919
0.620690648034398
0.6310561271498771
0.5915809275184275


In [33]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6806568642506142
#0.4988770730958231
#0.4982532248157248
#0.6882678132678133
#0.4988770730958231
#0.4982532248157248
#0.6896786701474201
#0.4988770730958231
#0.4982532248157248
#0.7056395884520884
#0.4988770730958231
#0.4982532248157248

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6806568642506142
0.4988770730958231
0.4982532248157248
0.6860315571253072
0.4988770730958231
0.4982532248157248
0.7055628071253072
0.4988770730958231
0.4982532248157248
0.706512976044226
0.4988770730958231
0.4982532248157248


In [34]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

#0.6214296683046683
#0.6199996160933661
#0.5333998771498771
#0.6966657708845209

#0.6443680896805897
#0.6655885288697788
#0.6625172757985258
#0.6966657708845209

0.6443680896805897
0.6555493703931204
0.6688421375921376
0.6858204084766585


# Word characters, keep stop words

In [35]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_X)
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(train_X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 



In [36]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

#0.5947961455773956
#0.6427364864864865
#0.6339066339066339
#0.5069199170761671

0.5947961455773956
0.5754376535626535
0.6310561271498771
0.5950744778869779


In [37]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6740344748157249
#0.4988770730958231
#0.4982532248157248
#0.6916173986486487
#0.4988770730958231
#0.4982532248157248
#0.6605401566339066
#0.4988770730958231
#0.4982532248157248
#0.7056395884520884
#0.4988770730958231
#0.4982532248157248

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6740344748157249
0.4988770730958231
0.4982532248157248
0.6853213298525799
0.4988770730958231
0.4982532248157248
0.6590525184275184
0.4988770730958231
0.4982532248157248
0.706512976044226
0.4988770730958231
0.4982532248157248


In [38]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

#0.6400011517199017
#0.655731726044226
#0.601504914004914
#0.6966657708845209

0.6400011517199017
0.6480439957002457
0.6019464066339066
0.6858204084766585


# White space, keep stop words

In [39]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*')
count_vect.fit(train_X)
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*')
tfidf_vect.fit(train_X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 



In [40]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

#0.6096149416461917
#0.6602042383292384
#0.6339066339066339
#0.4916884213759214

0.6096149416461917
0.6354710534398035
0.6310561271498771
0.6089623003685504


In [41]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6935465294840295
#0.4988770730958231
#0.4982532248157248
#0.7001497235872236
#0.4988770730958231
#0.4982532248157248
#0.7017813267813268
#0.4988770730958231
#0.4982532248157248
#0.7056395884520884
#0.4988770730958231
#0.4982532248157248

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6935465294840295
0.4988770730958231
0.4982532248157248
0.7027986793611793
0.4988770730958231
0.4982532248157248
0.718318105036855
0.4988770730958231
0.4982532248157248
0.706512976044226
0.4988770730958231
0.4982532248157248


In [42]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

#0.6505681818181818
#0.6663275491400491
#0.6749270577395577
#0.6966657708845209

0.6505681818181818
0.657228961916462
0.6626132524570024
0.6858204084766585


# Dale-Chall

In [43]:
file = open('dale_chall.txt', 'r')
dc_vocab = []
for line in file:
    dc_vocab.append(line.strip())

In [44]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*', vocabulary = set(dc_vocab))
count_vect.fit(train_X)
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', vocabulary = set(dc_vocab))
tfidf_vect.fit(train_X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3), vocabulary = set(dc_vocab))
tfidf_vect_ngram.fit(train_X)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3), vocabulary = set(dc_vocab))
tfidf_vect_ngram_chars.fit(train_X)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 



In [45]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

#0.6187615171990172
#0.4982532248157248
#0.5761382831695332
#0.4978117321867322

0.6187615171990172
0.4982532248157248
0.5761382831695332
0.6200476044226044


In [46]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

#0.6511920300982801
#0.4988770730958231
#0.4982532248157248
#0.6513167997542998
#0.4988770730958231
#0.4982532248157248
#0.4982532248157248
#0.4988770730958231
#0.4982532248157248
#0.6352407094594594
#0.4988770730958231
#0.4982532248157248

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6511920300982801
0.4988770730958231
0.4982532248157248
0.6513167997542998
0.4988770730958231
0.4982532248157248
0.4982532248157248
0.4988770730958231
0.4982532248157248
0.6352407094594594
0.4988770730958231
0.4982532248157248


In [47]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

#0.6377457002457002
#0.6379568488943489
#0.4982532248157248
#0.6438594133906634

0.6377457002457002
0.6379568488943489
0.4982532248157248
0.6438594133906634


In [48]:
from collections import Counter

In [292]:
#df['dc_proportion'] = 0


#for i in range(len(df)):
    #split = df['original_text'][i].split()
    #counter = Counter(split)

    #counts = 0
    #for word in counter.keys():
        #if word in dc_vocab:
            #counts += 1
    #df['dc_proportion'].iloc[i] = counts/len(split) #0.3023255813953488
    
def dc_percentage(dc_counter, s):
    split = s.split()
    counter = Counter(split)
    both = counter & dc_counter
    counts = 0
    for word in both.keys():
        counts += counter[word]
    return counts/len(split)

dc_counter = Counter(dc_vocab)
result = [dc_percentage(dc_counter, s) for s in df['original_text']]
df['dc_proportion'] = result

In [144]:
df

Unnamed: 0,original_text,label,dc_proportion
0,There is manuscript evidence that Austen conti...,1,0.465116
1,"In a remarkable comparative analysis , Mandaea...",1,0.217391
2,"Before Persephone was released to Hermes , who...",1,0.543478
3,Cogeneration plants are commonly found in dist...,1,0.256410
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,0.388889
...,...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0,0.352941
416764,"However , it is becoming replaced as a method ...",0,0.550000
416765,There are hand gestures in both Hindu and Budd...,0,0.454545
416766,"If it is necessary to use colors , try to choo...",0,0.595238


In [145]:
train_X, test_X, train_y, test_y = \
model_selection.train_test_split(df[["original_text", 'dc_proportion']],df["label"], random_state = 0)

In [146]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*')
count_vect.fit(train_X['original_text'])
xtrain_count =  count_vect.transform(train_X['original_text'])
xtest_count =  count_vect.transform(test_X['original_text'])

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*')
tfidf_vect.fit(train_X['original_text'])
xtrain_tfidf =  tfidf_vect.transform(train_X['original_text'])
xtest_tfidf =  tfidf_vect.transform(test_X['original_text'])

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X['original_text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X['original_text'])
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X['original_text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X['original_text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X['original_text']) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X['original_text']) 



In [147]:
import scipy.sparse
from scipy.sparse import hstack

xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

In [148]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

0.6113905098280098
0.6361908783783784
0.6317567567567568
0.6094709766584766


In [149]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6941703777641277
0.4988770730958231
0.4982532248157248
0.7028850583538083
0.4988770730958231
0.4982532248157248
0.7177230497542998
0.4988770730958231
0.4982532248157248
0.7062922297297297
0.4988770730958231
0.4982532248157248


In [150]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

0.6532747235872236
0.6657804821867321
0.6645903716216216
0.6865594287469288


In [328]:
AoA = pd.read_csv('AoA_51715_words.csv', encoding = 'unicode_escape')
AoA_vocab = AoA['Word'].unique()

In [293]:
AoA_counter = Counter(AoA_vocab)
result = [dc_percentage(AoA_counter, s) for s in df['original_text']]
df['AoA_proportion'] = result

In [178]:
df

Unnamed: 0,original_text,label,dc_proportion,AoA_proportion
0,There is manuscript evidence that Austen conti...,1,0.465116,0.651163
1,"In a remarkable comparative analysis , Mandaea...",1,0.217391,0.521739
2,"Before Persephone was released to Hermes , who...",1,0.543478,0.782609
3,Cogeneration plants are commonly found in dist...,1,0.256410,0.692308
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,0.388889,0.416667
...,...,...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0,0.352941,0.352941
416764,"However , it is becoming replaced as a method ...",0,0.550000,0.750000
416765,There are hand gestures in both Hindu and Budd...,0,0.454545,0.636364
416766,"If it is necessary to use colors , try to choo...",0,0.595238,0.809524


In [179]:
train_X, test_X, train_y, test_y = \
model_selection.train_test_split( \
df[["original_text", 'dc_proportion', 'AoA_proportion']],df["label"], random_state = 0)

In [180]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*')
count_vect.fit(train_X['original_text'])
xtrain_count =  count_vect.transform(train_X['original_text'])
xtest_count =  count_vect.transform(test_X['original_text'])

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*')
tfidf_vect.fit(train_X['original_text'])
xtrain_tfidf =  tfidf_vect.transform(train_X['original_text'])
xtest_tfidf =  tfidf_vect.transform(test_X['original_text'])

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X['original_text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X['original_text'])
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X['original_text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X['original_text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X['original_text']) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X['original_text']) 



In [183]:
xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

In [184]:
xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

In [185]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

0.6142506142506142
0.6361620853808354
0.6332827856265356
0.6094805743243243


In [186]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6944295147420148
0.4988770730958231
0.4982532248157248
0.7038736179361179
0.4988770730958231
0.4982532248157248
0.7194794226044227
0.4988770730958231
0.4982532248157248
0.7070408476658476
0.4988770730958231
0.4982532248157248


In [187]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

0.6509041001228502
0.6555109797297297
0.6602618243243243
0.6840928286240786


In [189]:
#multinomial naive bayes - training data, check for overfitting
prediction_1 = multi_nb.predict(xtrain_count)
print(metrics.accuracy_score(prediction_1, train_y))

prediction_2 = multi_nb2.predict(xtrain_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, train_y))

prediction_3 = multi_nb3.predict(xtrain_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, train_y))

prediction_4 = multi_nb4.predict(xtrain_tfidf)
print(metrics.accuracy_score(prediction_4, train_y))

0.7346309377559378
0.7998694717444718
0.6546983773546273
0.7424338400900901


In [190]:
clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtrain_count)
print(metrics.accuracy_score(lr_preds, train_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtrain_tfidf)
print(metrics.accuracy_score(lr_preds, train_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtrain_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, train_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtrain_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, train_y))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7464456644144144
0.7486659244471745
0.8076883701883701
0.7247485411547911


In [191]:
rf_count_preds = rf_clf_count.predict(xtrain_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtrain_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtrain_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtrain_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, train_y))
print(metrics.accuracy_score(rf_tfidf_preds, train_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, train_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, train_y))

0.6573249385749386
0.6623253224815725
0.671196764946765
0.7317356418918919


In [192]:
from sklearn.decomposition import LatentDirichletAllocation

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
tf_documents = count_vect.fit_transform(df['original_text'])
tf_feature_names = count_vect.get_feature_names()

In [193]:
lda = LatentDirichletAllocation(n_components = 10, random_state = 0)
lda.fit(tf_documents)
topic_models = lda.components_

In [194]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        term_list = [feature_names[i]
                    for i in topic.argsort()[:-no_top_words - 1:-1]]
        print("topic %d:" % (topic_idx), term_list)

display_topics(lda, tf_feature_names, 10)

topic 0: ['s', 'united', 'university', 'states', 'state', 'school', 'government', 'new', 'president', 'college']
topic 1: ['team', 'football', 'national', 'league', 'season', 'lrb', 'rrb', 'world', 'played', 'club']
topic 2: ['lrb', 'rrb', 'used', 's', 'use', 'called', 'water', 'number', 'usually', 'term']
topic 3: ['music', 'band', 'album', 'new', 'released', 's', 'rock', 'song', 'tropical', 'hurricane']
topic 4: ['lrb', 'rrb', 'war', 's', 'world', 'years', 'species', 'people', 'century', 'family']
topic 5: ['france', 'department', 'region', 'commune', 'calais', 'pas', 'north', 'la', 'saint', 'northern']
topic 6: ['rrb', 'lrb', 'born', '1', 'football', 'player', 'american', '2', '4', 'january']
topic 7: ['s', 'series', 'lrb', 'rrb', 'film', 'book', 'television', 'known', 'written', 'john']
topic 8: ['lrb', 'rrb', 'district', 'o', 'province', 'located', 'municipality', 'town', 'language', 'city']
topic 9: ['city', 'states', 'united', 'county', 's', 'river', 'game', 'people', 'north', '

In [196]:
from sklearn.decomposition import NMF

nmf = NMF(n_components = 10, random_state = 0, init = "nndsvd")
W = nmf.fit_transform(tf_documents)
H = nmf.components_

In [197]:
for topic_index in range(0, 10):
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = []
    for term_index in top_indices[0:10]:
        top_terms.append(tf_feature_names[term_index])
    print("topic %d:" % (topic_index), top_terms)

topic 0: ['rrb', 'lrb', 'o', 'd', 'â', 'known', 'called', 'km', 'german', 'english']
topic 1: ['s', 'u', 'album', 'women', 'state', 'band', 'second', 'film', 'children', 'death']
topic 2: ['france', 'department', 'commune', 'region', 'la', 'calvados', 'normandie', 'basse', 'aisne', 'northern']
topic 3: ['united', 'states', 'county', 'kingdom', 'president', 'iowa', 'state', 'canada', 'america', 'nations']
topic 4: ['born', 'american', 'player', 'january', 'september', 'march', 'july', 'april', 'february', 'footballer']
topic 5: ['1', '4', '2', 'ð', '3', 'â', 'î', '5', '0', 'ñ']
topic 6: ['city', 'county', 'capital', 'state', 'north', 'located', 'district', 'largest', 'south', 'area']
topic 7: ['pas', 'calais', 'nord', 'region', 'department', 'france', 'north', 'commune', 'l', 'ã']
topic 8: ['new', 'world', 'known', 'york', 'used', 'war', 'time', 'called', 'south', 'best']
topic 9: ['football', 'national', 'team', 'player', 'league', 'club', 'played', 'japanese', 'plays', 'hockey']


In [199]:
lda = LatentDirichletAllocation(n_components = 20, random_state = 0)
lda.fit(tf_documents)
topic_models = lda.components_

display_topics(lda, tf_feature_names, 10)

topic 0: ['united', 's', 'states', 'school', 'state', 'president', 'university', 'government', 'college', 'war']
topic 1: ['lrb', 'rrb', 'won', 'formula', 'world', 'championship', 'grand', 'race', 'season', 'car']
topic 2: ['lrb', 'rrb', 'university', 's', 'used', 'energy', 'called', 'earth', 'known', 'light']
topic 3: ['band', 'music', 'album', 'released', 'rock', 'song', 's', 'american', 'singer', 'lrb']
topic 4: ['lrb', 'rrb', 'species', 's', 'family', 'long', 'war', '000', 'like', 'years']
topic 5: ['france', 'department', 'commune', 'region', 'la', 'saint', 'northern', 'north', 'ã', 'aisne']
topic 6: ['rrb', 'lrb', 'born', '1', '2', '4', '3', 'â', 'january', 'american']
topic 7: ['s', 'series', 'film', 'lrb', 'rrb', 'book', 'television', 'match', 't', 'written']
topic 8: ['lrb', 'rrb', 'district', 'province', 'germany', 'municipality', 'pakistan', 'capital', 'city', 'switzerland']
topic 9: ['states', 'united', 'city', 'county', 'game', 's', 'state', 'video', 'florida', 'u']
topic 

In [200]:
nmf = NMF(n_components = 20, random_state = 0, init = "nndsvd")
W = nmf.fit_transform(tf_documents)
H = nmf.components_

for topic_index in range(0, 20):
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = []
    for term_index in top_indices[0:10]:
        top_terms.append(tf_feature_names[term_index])
    print("topic %d:" % (topic_index), top_terms)



topic 0: ['rrb', 'lrb', 'd', 'km', 'german', 'e', 'english', 'french', 'c', 'ndash']
topic 1: ['s', 'u', 'women', 'state', 'children', 'st', 'death', 'father', 'book', 'king']
topic 2: ['france', 'department', 'commune', 'region', 'la', 'calvados', 'normandie', 'basse', 'aisne', 'northern']
topic 3: ['united', 'states', 'county', 'kingdom', 'president', 'iowa', 'state', 'canada', 'nations', 'government']
topic 4: ['born', 'player', 'football', 'january', 'footballer', 'september', 'july', 'march', 'april', 'plays']
topic 5: ['1', '2', '4', 'î', '3', '5', '0', 'ï', '6', '000']
topic 6: ['city', 'county', 'capital', 'largest', 'state', 'located', 'population', 'area', 'iowa', 'seat']
topic 7: ['pas', 'calais', 'nord', 'region', 'department', 'france', 'commune', 'north', 'l', 'ã']
topic 8: ['called', 'people', 'usually', 'group', 'language', 'family', 'area', 'small', 'like', 'english']
topic 9: ['football', 'national', 'team', 'league', 'player', 'club', 'played', 'plays', 'hockey', 'ja

In [209]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english", \
                            max_df = 0.9)
tf_documents = count_vect.fit_transform(df['original_text'])
tf_feature_names = count_vect.get_feature_names()

lda = LatentDirichletAllocation(n_components = 10, random_state = 0)
lda.fit(tf_documents)
topic_models = lda.components_

display_topics(lda, tf_feature_names, 10)

topic 0: ['s', 'united', 'university', 'states', 'state', 'school', 'government', 'new', 'president', 'college']
topic 1: ['team', 'football', 'national', 'league', 'season', 'lrb', 'rrb', 'world', 'played', 'club']
topic 2: ['lrb', 'rrb', 'used', 's', 'use', 'called', 'water', 'number', 'usually', 'term']
topic 3: ['music', 'band', 'album', 'new', 'released', 's', 'rock', 'song', 'tropical', 'hurricane']
topic 4: ['lrb', 'rrb', 'war', 's', 'world', 'years', 'species', 'people', 'century', 'family']
topic 5: ['france', 'department', 'region', 'commune', 'calais', 'pas', 'north', 'la', 'saint', 'northern']
topic 6: ['rrb', 'lrb', 'born', '1', 'football', 'player', 'american', '2', '4', 'january']
topic 7: ['s', 'series', 'lrb', 'rrb', 'film', 'book', 'television', 'known', 'written', 'john']
topic 8: ['lrb', 'rrb', 'district', 'o', 'province', 'located', 'municipality', 'town', 'language', 'city']
topic 9: ['city', 'states', 'united', 'county', 's', 'river', 'game', 'people', 'north', '

In [210]:
nmf = NMF(n_components = 10, random_state = 0, init = "nndsvd")
W = nmf.fit_transform(tf_documents)
H = nmf.components_

for topic_index in range(0, 10):
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = []
    for term_index in top_indices[0:10]:
        top_terms.append(tf_feature_names[term_index])
    print("topic %d:" % (topic_index), top_terms)

topic 0: ['rrb', 'lrb', 'o', 'd', 'â', 'known', 'called', 'km', 'german', 'english']
topic 1: ['s', 'u', 'album', 'women', 'state', 'band', 'second', 'film', 'children', 'death']
topic 2: ['france', 'department', 'commune', 'region', 'la', 'calvados', 'normandie', 'basse', 'aisne', 'northern']
topic 3: ['united', 'states', 'county', 'kingdom', 'president', 'iowa', 'state', 'canada', 'america', 'nations']
topic 4: ['born', 'american', 'player', 'january', 'september', 'march', 'july', 'april', 'february', 'footballer']
topic 5: ['1', '4', '2', 'ð', '3', 'â', 'î', '5', '0', 'ñ']
topic 6: ['city', 'county', 'capital', 'state', 'north', 'located', 'district', 'largest', 'south', 'area']
topic 7: ['pas', 'calais', 'nord', 'region', 'department', 'france', 'north', 'commune', 'l', 'ã']
topic 8: ['new', 'world', 'known', 'york', 'used', 'war', 'time', 'called', 'south', 'best']
topic 9: ['football', 'national', 'team', 'player', 'league', 'club', 'played', 'japanese', 'plays', 'hockey']


In [211]:
lda = LatentDirichletAllocation(n_components = 20, random_state = 0)
lda.fit(tf_documents)
topic_models = lda.components_

display_topics(lda, tf_feature_names, 10)

topic 0: ['united', 's', 'states', 'school', 'state', 'president', 'university', 'government', 'college', 'war']
topic 1: ['lrb', 'rrb', 'won', 'formula', 'world', 'championship', 'grand', 'race', 'season', 'car']
topic 2: ['lrb', 'rrb', 'university', 's', 'used', 'energy', 'called', 'earth', 'known', 'light']
topic 3: ['band', 'music', 'album', 'released', 'rock', 'song', 's', 'american', 'singer', 'lrb']
topic 4: ['lrb', 'rrb', 'species', 's', 'family', 'long', 'war', '000', 'like', 'years']
topic 5: ['france', 'department', 'commune', 'region', 'la', 'saint', 'northern', 'north', 'ã', 'aisne']
topic 6: ['rrb', 'lrb', 'born', '1', '2', '4', '3', 'â', 'january', 'american']
topic 7: ['s', 'series', 'film', 'lrb', 'rrb', 'book', 'television', 'match', 't', 'written']
topic 8: ['lrb', 'rrb', 'district', 'province', 'germany', 'municipality', 'pakistan', 'capital', 'city', 'switzerland']
topic 9: ['states', 'united', 'city', 'county', 'game', 's', 'state', 'video', 'florida', 'u']
topic 

In [213]:
nmf = NMF(n_components = 20, random_state = 0, init = "nndsvd")
W = nmf.fit_transform(tf_documents)
H = nmf.components_

for topic_index in range(0, 20):
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = []
    for term_index in top_indices[0:10]:
        top_terms.append(tf_feature_names[term_index])
    print("topic %d:" % (topic_index), top_terms)



topic 0: ['rrb', 'lrb', 'd', 'km', 'german', 'e', 'english', 'french', 'c', 'ndash']
topic 1: ['s', 'u', 'women', 'state', 'children', 'st', 'death', 'father', 'book', 'king']
topic 2: ['france', 'department', 'commune', 'region', 'la', 'calvados', 'normandie', 'basse', 'aisne', 'northern']
topic 3: ['united', 'states', 'county', 'kingdom', 'president', 'iowa', 'state', 'canada', 'nations', 'government']
topic 4: ['born', 'player', 'football', 'january', 'footballer', 'september', 'july', 'march', 'april', 'plays']
topic 5: ['1', '2', '4', 'î', '3', '5', '0', 'ï', '6', '000']
topic 6: ['city', 'county', 'capital', 'largest', 'state', 'located', 'population', 'area', 'iowa', 'seat']
topic 7: ['pas', 'calais', 'nord', 'region', 'department', 'france', 'commune', 'north', 'l', 'ã']
topic 8: ['called', 'people', 'usually', 'group', 'language', 'family', 'area', 'small', 'like', 'english']
topic 9: ['football', 'national', 'team', 'league', 'player', 'club', 'played', 'plays', 'hockey', 'ja

In [231]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.manifold import MDS
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.pyplot as plt

In [217]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
tfidf_documents = tfidf_vect.fit_transform(df['original_text'])

In [247]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 10, random_state = 0)
kmeans.fit(tfidf_documents)

#tfidf_documents_normalized = MaxAbsScaler().fit(tfidf_documents).transform(tfidf_documents)

In [248]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vect.get_feature_names()

In [249]:
for i in range(10):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 city
 used
 new
 people
 world
 references
 called
 known
 time
 1
Cluster 1:
 football
 player
 born
 rrb
 lrb
 japanese
 team
 national
 club
 plays
Cluster 2:
 district
 municipality
 canton
 switzerland
 province
 located
 belgian
 aargau
 pakistan
 ticino
Cluster 3:
 s
 u
 rrb
 lrb
 world
 state
 city
 new
 county
 time
Cluster 4:
 department
 france
 commune
 region
 aisne
 calvados
 normandie
 basse
 picardie
 gironde
Cluster 5:
 series
 television
 game
 character
 s
 animated
 tv
 american
 rrb
 lrb
Cluster 6:
 rrb
 lrb
 born
 o
 american
 known
 1
 d
 â
 english
Cluster 7:
 pas
 calais
 nord
 department
 region
 france
 commune
 north
 ã
 l
Cluster 8:
 united
 states
 city
 county
 iowa
 kentucky
 state
 kingdom
 illinois
 florida
Cluster 9:
 released
 album
 tropical
 hurricane
 storm
 band
 single
 studio
 atlantic
 song


In [258]:
topic_vecs = lda.transform(tf_documents)

In [279]:
lda_20_feature = np.argmax(topic_vecs, axis = 1)

In [280]:
df['lda_20'] = lda_20_feature

In [281]:
train_X, test_X, train_y, test_y = \
model_selection.train_test_split( \
df[["original_text", 'dc_proportion', 'AoA_proportion', 'lda_20']],df["label"], random_state = 0)

In [282]:
#count
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*')
count_vect.fit(train_X['original_text'])
xtrain_count =  count_vect.transform(train_X['original_text'])
xtest_count =  count_vect.transform(test_X['original_text'])

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*')
tfidf_vect.fit(train_X['original_text'])
xtrain_tfidf =  tfidf_vect.transform(train_X['original_text'])
xtest_tfidf =  tfidf_vect.transform(test_X['original_text'])

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X['original_text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X['original_text'])
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X['original_text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X['original_text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X['original_text']) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X['original_text']) 

In [283]:
xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['dc_proportion']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['dc_proportion']).T]))

In [284]:
xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['AoA_proportion']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['AoA_proportion']).T]))

In [285]:
xtrain_count = scipy.sparse.csr_matrix(hstack([xtrain_count, scipy.sparse.csr_matrix(train_X['lda_20']).T]))
xtest_count = scipy.sparse.csr_matrix(hstack([xtest_count, scipy.sparse.csr_matrix(test_X['lda_20']).T]))

xtrain_tfidf = scipy.sparse.csr_matrix(hstack([xtrain_tfidf, scipy.sparse.csr_matrix(train_X['lda_20']).T]))
xtest_tfidf = scipy.sparse.csr_matrix(hstack([xtest_tfidf, scipy.sparse.csr_matrix(test_X['lda_20']).T]))

xtrain_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram, scipy.sparse.csr_matrix(train_X['lda_20']).T]))
xtest_tfidf_ngram = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram, scipy.sparse.csr_matrix(test_X['lda_20']).T]))

xtrain_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtrain_tfidf_ngram_chars, scipy.sparse.csr_matrix(train_X['lda_20']).T]))
xtest_tfidf_ngram_chars = scipy.sparse.csr_matrix(hstack([xtest_tfidf_ngram_chars, scipy.sparse.csr_matrix(test_X['lda_20']).T]))

In [286]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

0.6207386363636364
0.6468058968058968
0.6030213452088452
0.6040770884520884


In [287]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6980286394348895
0.4988770730958231
0.4982532248157248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7045454545454546
0.4988770730958231
0.4982532248157248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7187020116707616
0.4988770730958231
0.4982532248157248
0.7061002764127764
0.4988770730958231
0.4982532248157248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [288]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

0.6542824785012284
0.6651854269041769
0.6578912008599509
0.6860699477886978


In [299]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
tf_documents = count_vect.fit_transform(df['original_text'])

In [300]:
lda_20 = LatentDirichletAllocation(n_components = 20, random_state = 0)
topic_vecs = lda_20.fit_transform(tf_documents)
lda_20_feature = np.argmax(topic_vecs, axis = 1)
df['lda_20'] = lda_20_feature

In [301]:
lda_10 = LatentDirichletAllocation(n_components = 10, random_state = 0)
topic_vecs = lda_10.fit_transform(tf_documents)
lda_10_feature = np.argmax(topic_vecs, axis = 1)
df['lda_10'] = lda_10_feature

In [310]:
nmf_10 = NMF(n_components = 10, random_state = 0, init = "nndsvd")
W_10 = nmf_10.fit_transform(tf_documents)
H_10 = nmf_10.components_
nmf_10_feature = np.argmax(W_10, axis = 1)
df['nmf_10'] = lda_10_feature


In [311]:
nmf_20 = NMF(n_components = 20, random_state = 0, init = "nndsvd")
W_20 = nmf_20.fit_transform(tf_documents)
H_20 = nmf_20.components_
nmf_20_feature = np.argmax(W_20, axis = 1)
df['nmf_20'] = lda_20_feature



In [314]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words="english")
tfidf_documents = tfidf_vect.fit_transform(df['original_text'])
kmeans_10 = KMeans(n_clusters = 10, random_state = 0)
kmeans_10.fit(tfidf_documents)
kmeans_10_feature = kmeans_10.labels_
df['kmeans_10'] = kmeans_10_feature


In [315]:
kmeans_20 = KMeans(n_clusters = 20, random_state = 0)
kmeans_20.fit(tfidf_documents)
kmeans_20_feature = kmeans_20.labels_
df['kmeans_20'] = kmeans_20_feature

In [317]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\S*')
count_vect.fit(train_X['original_text'])
xtrain_count =  count_vect.transform(train_X['original_text'])
xtest_count =  count_vect.transform(test_X['original_text'])

#tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\S*')
tfidf_vect.fit(train_X['original_text'])
xtrain_tfidf =  tfidf_vect.transform(train_X['original_text'])
xtest_tfidf =  tfidf_vect.transform(test_X['original_text'])

#tfidf n-gram
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\S*', \
                                   ngram_range=(2,3))
tfidf_vect_ngram.fit(train_X['original_text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X['original_text'])
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X['original_text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\S*', \
                                         ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_X['original_text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X['original_text']) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X['original_text']) 



In [348]:
train_X, test_X, train_y, test_y = \
model_selection.train_test_split( \
df[["original_text", 'dc_proportion', 'AoA_proportion', 'lda_10', 'lda_20', 'nmf_10', 'nmf_20', \
    'kmeans_10', 'kmeans_20']],df["label"], random_state = 0)

In [349]:
def add_feature_to_matrix(train_matrix, test_matrix, feature):
    train_matrix = scipy.sparse.csr_matrix(hstack([train_matrix, scipy.sparse.csr_matrix(train_X[feature]).T]))
    test_matrix = scipy.sparse.csr_matrix(hstack([test_matrix, scipy.sparse.csr_matrix(test_X[feature]).T]))

matrix_pairs = [(xtrain_count, xtest_count),(xtrain_tfidf, xtest_tfidf), \
               (xtrain_tfidf_ngram, xtest_tfidf_ngram), (xtrain_tfidf_ngram_chars, xtest_tfidf_ngram_chars)]     

feature_list = ['dc_proportion', 'AoA_proportion', 'lda_10', 'lda_20', 'nmf_10', 'nmf_20', 'kmeans_10', 'kmeans_20']

for matrix_pair in matrix_pairs:
    for feature in feature_list:
        add_feature_to_matrix(matrix_pair[0], matrix_pair[1], feature)

In [350]:
xtrain_count.shape

(312576, 155410)

In [324]:
#multinomial naive bayes

multi_nb = naive_bayes.MultinomialNB()
multi_nb.fit(xtrain_count,train_y)
prediction_1 = multi_nb.predict(xtest_count)
print(metrics.accuracy_score(prediction_1, test_y))

multi_nb2 = naive_bayes.MultinomialNB()
multi_nb2.fit(xtrain_tfidf_ngram,train_y)
prediction_2 = multi_nb2.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(prediction_2, test_y))

multi_nb3 = naive_bayes.MultinomialNB()
multi_nb3.fit(xtrain_tfidf_ngram_chars,train_y)
prediction_3 = multi_nb3.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(prediction_3, test_y))

multi_nb4 = naive_bayes.MultinomialNB()
multi_nb4.fit(xtrain_tfidf,train_y)
prediction_4 = multi_nb4.predict(xtest_tfidf)
print(metrics.accuracy_score(prediction_4, test_y))

0.6096149416461917
0.6354710534398035
0.6310561271498771
0.6089623003685504


In [325]:
#baseline

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_count, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_count, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_count, train_y)
lr_preds = clf.predict(xtest_count)
rand_dev_preds = uniform.predict(xtest_count)
mf_dev_preds = most_frequent.predict(xtest_count)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf, train_y)
lr_preds = clf.predict(xtest_tfidf)
rand_dev_preds = uniform.predict(xtest_tfidf)
mf_dev_preds = most_frequent.predict(xtest_tfidf)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))

clf = LogisticRegression(random_state = 0, max_iter = 500).fit(xtrain_tfidf_ngram_chars, train_y)
uniform = DummyClassifier(strategy = 'uniform', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
most_frequent = DummyClassifier(strategy = 'most_frequent', random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)
lr_preds = clf.predict(xtest_tfidf_ngram_chars)
rand_dev_preds = uniform.predict(xtest_tfidf_ngram_chars)
mf_dev_preds = most_frequent.predict(xtest_tfidf_ngram_chars)
print(metrics.accuracy_score(lr_preds, test_y))
print(metrics.accuracy_score(rand_dev_preds, test_y))
print(metrics.accuracy_score(mf_dev_preds, test_y))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6935465294840295
0.4988770730958231
0.4982532248157248
0.7027986793611793
0.4988770730958231
0.4982532248157248
0.718318105036855
0.4988770730958231
0.4982532248157248
0.706512976044226
0.4988770730958231
0.4982532248157248


In [326]:
#rf

rf_clf_count = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_count, train_y)
rf_clf_tfidf = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf, train_y)
rf_clf_tfidf_ngram = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram, train_y)
rf_clf_tfidf_ngram_chars = RandomForestClassifier(n_estimators = 50, max_depth = 15, random_state = 0).fit(xtrain_tfidf_ngram_chars, train_y)

rf_count_preds = rf_clf_count.predict(xtest_count)
rf_tfidf_preds = rf_clf_tfidf.predict(xtest_tfidf)
rf_tfidf_ngram_preds = rf_clf_tfidf_ngram.predict(xtest_tfidf_ngram)
rf_tfidf_ngram_chars_preds = rf_clf_tfidf_ngram_chars.predict(xtest_tfidf_ngram_chars)

print(metrics.accuracy_score(rf_count_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_preds, test_y))
print(metrics.accuracy_score(rf_tfidf_ngram_chars_preds, test_y))

0.6505681818181818
0.657228961916462
0.6626132524570024
0.6858204084766585


Unnamed: 0,original_text,label,dc_proportion,AoA_proportion,lda_20,lda_10,nmf_10,nmf_20,kmeans_10,kmeans_20
0,There is manuscript evidence that Austen conti...,1,0.465116,0.651163,7,7,7,7,0,7
1,"In a remarkable comparative analysis , Mandaea...",1,0.217391,0.521739,15,2,2,15,3,11
2,"Before Persephone was released to Hermes , who...",1,0.543478,0.782609,13,7,7,13,6,19
3,Cogeneration plants are commonly found in dist...,1,0.256410,0.692308,2,2,2,2,0,7
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,0.388889,0.416667,8,8,8,8,6,19
...,...,...,...,...,...,...,...,...,...,...
416763,A Duke Nukem 3D version has been sold for Xbox...,0,0.352941,0.352941,13,9,9,13,0,7
416764,"However , it is becoming replaced as a method ...",0,0.550000,0.750000,0,0,0,0,8,3
416765,There are hand gestures in both Hindu and Budd...,0,0.454545,0.636364,4,2,2,4,0,7
416766,"If it is necessary to use colors , try to choo...",0,0.595238,0.809524,17,2,2,17,6,19


In [345]:
def AoA_sum(AoA_counter, s):
    split = s.split()
    counter = Counter(split)
    both = counter & AoA_counter
    AoA_sum = 0
    for word in both.keys():
        AoA_sum += np.float(AoA.loc[AoA.Word == word]['AoA_Kup_lem'])
    return AoA_sum

AoA_counter = Counter(AoA_vocab)
result = [AoA_sum(AoA_counter, s) for s in df['original_text']]
#df['dc_proportion'] = result

KeyboardInterrupt: 

[128.29000000000002]

In [338]:
df[:1]

Unnamed: 0,original_text,label,dc_proportion,AoA_proportion,lda_20,lda_10,nmf_10,nmf_20,kmeans_10,kmeans_20
0,There is manuscript evidence that Austen conti...,1,0.465116,0.651163,7,7,7,7,0,7


In [334]:
AoA.loc[AoA.Word == 'a']

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
