In [1]:
import re

import nltk
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
from joblib import dump
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import functions

In [2]:
# Load the pickled training data; later cells read its `text` and `label` columns.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
train_pickle_path = "data/train_post_preprocessed2.pkl"
train_data = pd.read_pickle(train_pickle_path)

In [3]:
# Load the pickled test data; later cells read its `text` and `label` columns.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
test_pickle_path = 'data/test_preprocessed.pkl'
test_data = pd.read_pickle(test_pickle_path)

# Bag of Words
## 1-gram

In [8]:
# Bag-of-words term counts with the default n-gram range (unigrams only).
# min_df is a float, i.e. a document-frequency proportion: terms present in fewer than
# 0.01% of the fitted documents are dropped. English stop words are removed.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    stop_words='english',
    min_df=0.0001,
)

In [15]:
# Each entry of `text` is an iterable of strings, so it is re-joined into a single
# space-separated document before vectorizing.
# Only the first 100000 training rows are used in this section.
train_rows = slice(None, 100000)
train_docs = list(map(' '.join, train_data.text.values[train_rows]))
test_docs = list(map(' '.join, test_data.text.values))

# The vocabulary is learned from the training documents only and reused for the test set.
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

Y_train = train_data.label.values[train_rows]
Y_test = test_data.label.values

In [16]:
# Number of features (vocabulary size) learned by the fitted vectorizer.
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2; the resulting count is the same.
len(vectorizer.vocabulary_)

28268

In [17]:
# Multinomial logistic regression with L2 regularisation (C=5), balanced class weights
# and a fixed random seed, trained on the bag-of-words unigram features.
clf = LogisticRegression(
    penalty='l2',
    C=5,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=40,
    n_jobs=-1,
    verbose=1,
)
# `fit` is the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  5.6min finished


LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l2',
          random_state=40, solver='newton-cg', tol=0.0001, verbose=1,
          warm_start=False)

In [12]:
# Mean accuracy of the fitted classifier on the training features.
# NOTE(review): this cell's execution count (12) is lower than that of the fit cell
# above (17), so the displayed output may come from an earlier model - re-run top to
# bottom to confirm.
clf.score(X_train,Y_train)

0.9937

In [13]:
# Mean accuracy of the fitted classifier on the held-out test features.
# NOTE(review): this cell's execution count (13) is lower than that of the fit cell
# above (17), so the displayed output may come from an earlier model - re-run top to
# bottom to confirm.
clf.score(X_test, Y_test)

0.48276

In [14]:
# Per-class precision / recall / F1 on the test set.
# NOTE(review): this cell's execution count (14) is lower than that of the fit cell
# above (17), so the displayed output may come from an earlier model.
test_predictions = clf.predict(X_test)
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.60      0.59      0.59     16898
           1       0.54      0.40      0.46     22350
           2       0.32      0.49      0.39     10752

   micro avg       0.48      0.48      0.48     50000
   macro avg       0.48      0.49      0.48     50000
weighted avg       0.51      0.48      0.49     50000



## 1-gram and 2-gram

In [19]:
# Bag-of-words term counts over unigrams and bigrams (ngram_range=(1, 2)).
# min_df is a float, i.e. a document-frequency proportion: n-grams present in fewer than
# 0.01% of the fitted documents are dropped. English stop words are removed.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.0001,
)

In [20]:
# Each entry of `text` is an iterable of strings, so it is re-joined into a single
# space-separated document before vectorizing.
# Only the first 100000 training rows are used in this section.
train_rows = slice(None, 100000)
train_docs = list(map(' '.join, train_data.text.values[train_rows]))
test_docs = list(map(' '.join, test_data.text.values))

# The vocabulary is learned from the training documents only and reused for the test set.
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

Y_train = train_data.label.values[train_rows]
Y_test = test_data.label.values

In [21]:
# Number of features (vocabulary size) learned by the fitted vectorizer.
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2; the resulting count is the same.
len(vectorizer.vocabulary_)

110838

In [22]:
# Multinomial logistic regression with L2 regularisation (C=5), balanced class weights
# and a fixed random seed, trained on the bag-of-words unigram + bigram features.
clf = LogisticRegression(
    penalty='l2',
    C=5,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=40,
    n_jobs=-1,
    verbose=1,
)
# `fit` is the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  8.7min finished


LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l2',
          random_state=40, solver='newton-cg', tol=0.0001, verbose=1,
          warm_start=False)

In [23]:
# Mean accuracy of the fitted classifier on the training features.
clf.score(X_train,Y_train)

0.98358

In [24]:
# Mean accuracy of the fitted classifier on the held-out test features.
clf.score(X_test, Y_test)

0.59626

In [25]:
# Per-class precision / recall / F1 on the test set.
test_predictions = clf.predict(X_test)
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.67      0.72      0.69     16898
           1       0.61      0.60      0.60     22350
           2       0.44      0.40      0.42     10752

   micro avg       0.60      0.60      0.60     50000
   macro avg       0.57      0.57      0.57     50000
weighted avg       0.59      0.60      0.59     50000



# TF-IDF
## 1-gram

In [62]:
# TF-IDF weighted unigram features (ngram_range=(1, 1)).
# min_df is a float, i.e. a document-frequency proportion: terms present in fewer than
# 0.01% of the fitted documents are dropped. English stop words are removed.
# The result must be bound to `vectorizer`, the name the following cells fit and
# transform with; otherwise they would keep using the previous section's vectorizer.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=0.0001,
    stop_words='english',
    ngram_range=(1, 1),
)

In [63]:
# Each entry of `text` is an iterable of strings, so it is re-joined into a single
# space-separated document before vectorizing.
# All training rows are used in this section (no row limit is applied).
train_docs = list(map(' '.join, train_data.text.values))
test_docs = list(map(' '.join, test_data.text.values))

# The vocabulary is learned from the training documents only and reused for the test set.
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

Y_train = train_data.label.values
Y_test = test_data.label.values

In [64]:
# Number of features (vocabulary size) learned by the fitted vectorizer.
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2; the resulting count is the same.
len(vectorizer.vocabulary_)

27265

In [65]:
# Multinomial logistic regression with L2 regularisation (C=5), balanced class weights
# and a fixed random seed, trained on the features built in the cells above.
clf = LogisticRegression(
    penalty='l2',
    C=5,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=40,
    n_jobs=-1,
    verbose=1,
)
# `fit` is the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.7min finished


LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l2',
          random_state=40, solver='newton-cg', tol=0.0001, verbose=1,
          warm_start=False)

In [66]:
# Mean accuracy of the fitted classifier on the training features.
clf.score(X_train,Y_train)

0.6956005220149465

In [67]:
# Mean accuracy of the fitted classifier on the held-out test features.
clf.score(X_test, Y_test)

0.64528

In [68]:
# Per-class precision / recall / F1 on the test set.
test_predictions = clf.predict(X_test)
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75     16898
           1       0.70      0.56      0.62     22350
           2       0.47      0.64      0.54     10752

   micro avg       0.65      0.65      0.65     50000
   macro avg       0.64      0.65      0.64     50000
weighted avg       0.66      0.65      0.65     50000



## 1-gram and 2-gram

In [69]:
# TF-IDF weighted features over unigrams and bigrams (ngram_range=(1, 2)).
# min_df is a float, i.e. a document-frequency proportion: n-grams present in fewer than
# 0.01% of the fitted documents are dropped. English stop words are removed.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.0001,
)

In [70]:
# Each entry of `text` is an iterable of strings, so it is re-joined into a single
# space-separated document before vectorizing.
# All training rows are used in this section (no row limit is applied).
train_docs = list(map(' '.join, train_data.text.values))
test_docs = list(map(' '.join, test_data.text.values))

# The vocabulary is learned from the training documents only and reused for the test set.
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

Y_train = train_data.label.values
Y_test = test_data.label.values

In [71]:
# Number of features (vocabulary size) learned by the fitted vectorizer.
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2; the resulting count is the same.
len(vectorizer.vocabulary_)

100450

In [72]:
# Multinomial logistic regression with L2 regularisation (C=5), balanced class weights
# and a fixed random seed, trained on the TF-IDF unigram + bigram features.
clf = LogisticRegression(
    penalty='l2',
    C=5,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=40,
    n_jobs=-1,
    verbose=1,
)
# `fit` is the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  5.2min finished


LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l2',
          random_state=40, solver='newton-cg', tol=0.0001, verbose=1,
          warm_start=False)

In [73]:
# Mean accuracy of the fitted classifier on the training features.
clf.score(X_train,Y_train)

0.7961644776241688

In [74]:
# Mean accuracy of the fitted classifier on the held-out test features.
clf.score(X_test, Y_test)

0.65916

In [75]:
# Per-class precision / recall / F1 on the test set.
test_predictions = clf.predict(X_test)
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77     16898
           1       0.69      0.60      0.64     22350
           2       0.48      0.60      0.54     10752

   micro avg       0.66      0.66      0.66     50000
   macro avg       0.64      0.66      0.65     50000
weighted avg       0.67      0.66      0.66     50000

