# Important!

In [1]:
# Following this tutorial: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
# TODO: This tutorial has enhancements: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import lxml

In [2]:
# read data
df = pd.read_csv("raw_data/fulltrain.csv", names=["labels", "text"])
test_df = pd.read_csv("raw_data/balancedtest.csv", names=["labels", "text"])
df.head()

Unnamed: 0,labels,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [3]:

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text).replace(r'http[\w:/\.]+','') # removing urls
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df['text'] = df['text'].apply(clean_text)
test_df["text"] = test_df['text'].apply(clean_text)

In [4]:
# create x and y
X = df["text"]
y = df["labels"]

# test x and y
X_test = test_df['text']
y_test = test_df['labels']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state = 42)

In [6]:
my_tags = ["trusted", "satire", "hoax", "propaganda"]

In [8]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

from sklearn.metrics import classification_report
print("VALIDATION SET")
y_pred = nb.predict(X_val)

print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred,target_names=my_tags))


print("TEST SET")
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, target_names=my_tags))

VALIDATION SET
accuracy 0.6306884082690865
              precision    recall  f1-score   support

     trusted       0.85      0.65      0.74      4244
      satire       0.99      0.09      0.16      2065
        hoax       0.52      1.00      0.68      5313
  propaganda       1.00      0.34      0.50      3035

    accuracy                           0.63     14657
   macro avg       0.84      0.52      0.52     14657
weighted avg       0.78      0.63      0.59     14657

TEST SET
accuracy 0.362
              precision    recall  f1-score   support

     trusted       0.64      0.29      0.40       750
      satire       0.75      0.00      0.01       750
        hoax       0.29      1.00      0.46       750
  propaganda       1.00      0.16      0.27       750

    accuracy                           0.36      3000
   macro avg       0.67      0.36      0.28      3000
weighted avg       0.67      0.36      0.28      3000



In [9]:
# SVM

from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

print("VALIDATION SET")
y_pred = sgd.predict(X_val)

print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred,target_names=my_tags))


print("TEST SET")
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

VALIDATION SET
accuracy 0.8894043801596507
              precision    recall  f1-score   support

     trusted       0.82      0.95      0.88      4244
      satire       0.95      0.86      0.90      2065
        hoax       0.89      0.95      0.92      5313
  propaganda       0.97      0.71      0.82      3035

    accuracy                           0.89     14657
   macro avg       0.91      0.87      0.88     14657
weighted avg       0.90      0.89      0.89     14657

TEST SET
accuracy 0.6766666666666666
              precision    recall  f1-score   support

     trusted       0.70      0.77      0.73       750
      satire       0.72      0.35      0.47       750
        hoax       0.55      0.95      0.70       750
  propaganda       0.90      0.65      0.75       750

    accuracy                           0.68      3000
   macro avg       0.72      0.68      0.66      3000
weighted avg       0.72      0.68      0.66      3000



In [11]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(random_state=0, max_iter=10000)),
               ])
logreg.fit(X_train, y_train)

print("VALIDATION SET")
y_pred = logreg.predict(X_val)

print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred,target_names=my_tags))


print("TEST SET")
y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

VALIDATION SET
accuracy 0.9502626731254691
              precision    recall  f1-score   support

     trusted       0.93      0.97      0.95      4244
      satire       0.97      0.92      0.95      2065
        hoax       0.95      0.98      0.96      5313
  propaganda       0.97      0.90      0.93      3035

    accuracy                           0.95     14657
   macro avg       0.95      0.94      0.95     14657
weighted avg       0.95      0.95      0.95     14657

TEST SET
accuracy 0.7383333333333333
              precision    recall  f1-score   support

     trusted       0.81      0.80      0.81       750
      satire       0.82      0.41      0.54       750
        hoax       0.60      0.88      0.71       750
  propaganda       0.83      0.87      0.85       750

    accuracy                           0.74      3000
   macro avg       0.76      0.74      0.73      3000
weighted avg       0.76      0.74      0.73      3000



# Exploring Word2Vec with Logistic Regression

In [13]:

# load word vectors 
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("word_vec/GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

  wv.init_sims(replace=True)


In [19]:

def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.key_to_index:
            mean.append(wv.syn0norm[wv.key_to_index[word]])
            all_words.add(wv.key_to_index[word])

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [20]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, val = train_test_split(df, test_size=0.3, random_state = 42)

val_tokenized = val.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_val_word_average = word_averaging_list(wv,val_tokenized)

AttributeError: 'KeyedVectors' object has no attribute 'wv'

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=0, max_iter=10000)
logreg = logreg.fit(X_train_word_average, train['labels'])
y_pred = logreg.predict(X_val_word_average)
print('accuracy %s' % accuracy_score(y_pred, val.labels))
print(classification_report(val.labels, y_pred,target_names=my_tags))

Seems like it was pretty bad... in that case, we shall try...

# DocVec + Logistic Regression

to be continued...