<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/Multinomial_NB_%26_Logistic_Regression_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import random, re, string 
import pandas as pd
import numpy as np 

from pathlib import Path
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [0]:
# Hardcoded Colab Drive location of the project files.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# the mount call is not visible in this section; confirm before Run All.
file_path = '/content/drive/My Drive'

# Directory holding the pre-split SemEval CSV files.
DATA_PATH = Path(file_path + '/datasets/SemEval')

# Seeds only Python's built-in `random` module; NumPy is not seeded here.
random.seed(42)

# Read the train / validation / test splits into DataFrames.
train = pd.read_csv(DATA_PATH/'train.csv')
val = pd.read_csv(DATA_PATH/'val.csv')
test = pd.read_csv(DATA_PATH/'test.csv')

In [94]:
# Preview the first rows of the training frame (ID, Tweet and the emotion label columns).
train.head()

Unnamed: 0.1,Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [0]:
# Pull the raw tweet text out of every split in one pass; the label columns
# stay behind in the original frames.
train_text, val_text, test_text = (
    frame['Tweet'] for frame in (train, val, test)
)

In [96]:
# Preview the raw tweet text before any cleaning is applied.
train_text.head()

0    “Worry is a down payment on a problem you may ...
1    Whatever you decide to do make sure it makes y...
2    @Max_Kellerman  it also helps that the majorit...
3    Accept the challenges so that you can literall...
4    My roommate: it's okay that we can't spell bec...
Name: Tweet, dtype: object

# Tokenize and clean text using Regular Expressions

In [0]:
def preprocessor(text):
    """Strip markup and punctuation from a tweet, keeping emoticons at the end.

    Steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` before the
         punctuation they are made of is stripped.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated, with the ``-`` nose
         removed so that ``:-)`` and ``:)`` end up as the same token.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased text followed by the emoticons found in it.
    """
    # Raw-string patterns: `\)`, `\(` and `\W` are not valid escapes in a
    # normal string literal and trigger warnings on recent Python versions.
    text = re.sub(r'<[^>]*>', '', text)  # remove all html markup
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    cleaned = re.sub(r'[\W]+', ' ', text.lower())

    # When the cleaned text ends in a word character there is no trailing
    # space, so add one; otherwise the first emoticon would be glued onto the
    # last word (e.g. "today:)").
    if emoticons and not cleaned.endswith(' '):
        cleaned += ' '

    return cleaned + ' '.join(emoticons).replace('-', '')

In [98]:
# Smoke test: the closing tag and punctuation are dropped, the text is
# lowercased, and the three emoticons are appended (":-)" loses its nose).
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

## Apply the cleaning preprocessor to the text

In [0]:
# Clean each split straight from the raw `Tweet` column so this cell is
# idempotent: re-running it never feeds already-cleaned text back through
# `preprocessor` (which would re-detect and re-append emoticons).
train_text = train['Tweet'].apply(preprocessor)
val_text = val['Tweet'].apply(preprocessor)
test_text = test['Tweet'].apply(preprocessor)

In [100]:
# Check the cleaned text: lowercased, with punctuation replaced by spaces.
train_text.head()

0     worry is a down payment on a problem you may ...
1    whatever you decide to do make sure it makes y...
2     max_kellerman it also helps that the majority...
3    accept the challenges so that you can literall...
4    my roommate it s okay that we can t spell beca...
Name: Tweet, dtype: object

## Basic text pre-processing pipeline

In [0]:
# English stop words as a set (fast membership tests in `stop_removal`).
stop = set(stopwords.words('english'))
# English Snowball stemmer used by `tokenizer_stemmer`.
stemmer = SnowballStemmer('english')

def tokenizer(text):
  """Split ``text`` on runs of whitespace and return the list of tokens."""
  tokens = text.split()
  return tokens

def tokenizer_stemmer(text):
  """Tokenize ``text`` with ``tokenizer`` and stem every token.

  Uses the module-level ``stemmer``; returns the stems in token order.
  """
  stems = []
  for token in tokenizer(text):
    stems.append(stemmer.stem(token))
  return stems

def stop_removal(text):
  """Return the items of ``text`` that are not in the module-level ``stop`` set.

  ``text`` is iterated item by item, so it should be a sequence of tokens;
  a plain string would be filtered one character at a time.
  """
  kept = []
  for token in text:
    if token in stop:
      continue
    kept.append(token)
  return kept

# Classifier training

In [103]:
# Emotion label columns; one binary classifier is fitted per column.
categories = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
              'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Alternative Multinomial Naive Bayes baseline (currently disabled).
# NB_pipeline = make_pipeline(
#                         CountVectorizer(),
#                         TfidfTransformer(),
#                         OneVsRestClassifier(MultinomialNB(
#                             fit_prior=True, class_prior=None)))

# Bag-of-words counts -> TF-IDF weighting -> logistic regression.
LR_pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    OneVsRestClassifier(LogisticRegression(random_state=1))
)


def _binary_scores(y_true, y_pred):
  """Score hard 0/1 predictions for a single emotion label.

  Returns a dict with keys ``roc_auc``, ``jaccard``, ``micro_f1`` and
  ``macro_f1``.

  NOTE(review): ``roc_auc_score`` is given hard class predictions here, not
  probabilities or decision scores, so the value reflects a single operating
  point rather than the full ROC curve. Consider scoring with
  ``predict_proba(...)[:, 1]`` instead.
  NOTE(review): on a single binary label, ``average="micro"`` F1 is the same
  number as plain accuracy; it is not a multi-label micro-F1 across emotions.
  """
  return {
      'roc_auc': roc_auc_score(y_true, y_pred),
      'jaccard': jaccard_score(y_true, y_pred),
      'micro_f1': f1_score(y_true, y_pred, average="micro"),
      'macro_f1': f1_score(y_true, y_pred, average="macro"),
  }


for cat in categories:
  print(f'\n... Processing {cat}')

  # Refit the whole pipeline (vocabulary included) on this label's column.
  LR_pipeline.fit(train_text, train[cat])

  # Test-set metrics
  test_scores = _binary_scores(test[cat], LR_pipeline.predict(test_text))
  print(f'Test roc auc: {test_scores["roc_auc"]}')
  print(f'Test Jaccard: {test_scores["jaccard"]}')
  print(f'Test Micro F1: {test_scores["micro_f1"]}')
  print(f'Test Macro F1: {test_scores["macro_f1"]}')

  # Validation-set metrics
  val_scores = _binary_scores(val[cat], LR_pipeline.predict(val_text))
  print(f'\nValidation roc auc: {val_scores["roc_auc"]}')
  print(f'Validation Jaccard: {val_scores["jaccard"]}')
  print(f'Validation Micro F1 is {val_scores["micro_f1"]}')
  print(f'Validation Macro F1: {val_scores["macro_f1"]}')


... Processing anger
Test roc auc: 0.7004862880572806
Test Jaccard: 0.4140687450039968
Test Micro F1: 0.7750843817121816
Test Macro F1: 0.7156466607887817

Validation roc auc: 0.7037250159842104
Validation Jaccard: 0.4207492795389049
Validation Micro F1 is 0.7731376975169301
Validation Macro F1: 0.7175690313331123

... Processing anticipation
Test roc auc: 0.5033529826891943
Test Jaccard: 0.007042253521126761
Test Micro F1: 0.8702055845351335
Test Macro F1: 0.4722582393792773

Validation roc auc: 0.4973753280839895
Validation Jaccard: 0.0
Validation Micro F1 is 0.8555304740406321
Validation Macro F1: 0.4610705596107056

... Processing disgust
Test roc auc: 0.6809757607926398
Test Jaccard: 0.384080370942813
Test Micro F1: 0.7554464559680883
Test Macro F1: 0.6931956635775072

Validation roc auc: 0.6688781631310368
Validation Jaccard: 0.3692722371967655
Validation Micro F1 is 0.7358916478555305
Validation Macro F1: 0.6771217482308383

... Processing fear
Test roc auc: 0.6060844810798357
