<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/Baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [0]:
import random, re, string 
import pandas as pd
import numpy as np 
import gc

from pathlib import Path
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, precision_score, recall_score, jaccard_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import multilabel_confusion_matrix, classification_report

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy import sparse

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [0]:
file_path = '/content/drive/My Drive'

DATA_PATH = Path(file_path + '/datasets/SemEval')

random.seed(42)

train = pd.read_csv(DATA_PATH/'train.csv')
val = pd.read_csv(DATA_PATH/'val.csv')
test = pd.read_csv(DATA_PATH/'test.csv')

In [0]:
train.head()

Unnamed: 0.1,Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [0]:
train_text = train['Tweet']
val_text = val['Tweet']
test_text = test['Tweet']

In [0]:
train_text.head()

0    “Worry is a down payment on a problem you may ...
1    Whatever you decide to do make sure it makes y...
2    @Max_Kellerman  it also helps that the majorit...
3    Accept the challenges so that you can literall...
4    My roommate: it's okay that we can't spell bec...
Name: Tweet, dtype: object

# Tokenize and clean text using Regular Expressions

In [0]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove all html markup
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) # findall the emoticons
    
    # remove the non-word chars '[\W]+'
    # append the emoticons to end 
    #convert all to lowercase
    # remove nose char for consistency
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', '')) 
    return text

In [0]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [0]:
for i in range(200, 220):
  print('\n' + train_text[i])


We're in contained depression. Only new economic engine is #Sustainability, say Joel @makower + @markmykleby @ #VERGEcon. #NewGrandStrategy

Solid starts by #Bozzelli and #BenEvans. Hoping for a good #start !

@DepressedStride &gt; huff louder

@theclobra lol I thought maybe, couldn't decide if there was levity or not

There's a certain hilarity in people angry at protests against the national anthem or the flag when these acts are covered by 1st Amendment.

@Cherie_Fitz it's being extremely playful

@BarackObama I love Lizzy Warren's latest rage against Wall Street.

I don't want the pity of my instructors but I'd like some understanding. I'm truly trying despite ALL circumstances that make me discouraged

CommunitySleepCoach: Look at these #narcoleptic #puppies. Make you #smile. If you are human &amp; find yourself in such odd positions, seek #th…

*Sigh* #depression #saddness #afterellen #shitsucks

@Evan_McMullin He's spent his campaign dividing people up and pitting them against 

## Apply the clean data preprocessor to the text

In [0]:
train_text = train_text.apply(preprocessor)
val_text = val_text.apply(preprocessor)
test_text = test_text.apply(preprocessor)

In [0]:
train_text.head()

0     worry is a down payment on a problem you may ...
1    whatever you decide to do make sure it makes y...
2     max_kellerman it also helps that the majority...
3    accept the challenges so that you can literall...
4    my roommate it s okay that we can t spell beca...
Name: Tweet, dtype: object

# TfIdf Vectorization

In [0]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                          stop_words='english', ngram_range=(1,4))

X_dtm = vect_word.fit_transform(train_text)
X_dtm

<6838x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 77449 stored elements in Compressed Sparse Row format>

In [0]:
test_X_dtm = vect_word.transform(test_text)
test_X_dtm

<3259x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 19341 stored elements in Compressed Sparse Row format>

# SVM Classifier

In [0]:
LABEL_COLS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
              'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

classifier = OneVsRestClassifier(LinearSVC())

preds_list = []
labels_list = []

for label in LABEL_COLS:
  print('\n... Processing {}'.format(label))

  # train the model using X_dtm & y
  y = train[label]
  ty = test[label]

  labels_list.append(ty)

  classifier.fit(X_dtm, y)
  y_pred_X = classifier.predict(X_dtm)

  # make predictions on test
  test_y = classifier.predict(test_X_dtm)
  preds_list.append(test_y)

  print(f'F1 Score: {f1_score(ty, test_y)}')


... Processing anger
F1 Score: 0.6395348837209301

... Processing anticipation
F1 Score: 0.14776632302405499

... Processing disgust
F1 Score: 0.5988593155893537

... Processing fear
F1 Score: 0.6481257557436518

... Processing joy
F1 Score: 0.7530910453353317

... Processing love
F1 Score: 0.48964677222898906

... Processing optimism
F1 Score: 0.5736359246740704

... Processing pessimism
F1 Score: 0.22656250000000003

... Processing sadness
F1 Score: 0.5653710247349824

... Processing surprise
F1 Score: 0.12000000000000001

... Processing trust
F1 Score: 0.023809523809523808


In [0]:
np.vstack(labels_list)

array([[1, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [0]:
np.vstack(preds_list)

array([[0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
print(f1_score(labels_list, preds_list, average='micro'))
print(f1_score(labels_list, preds_list, average='macro'))
print(jaccard_score(labels_list, preds_list, average='samples'))

0.5764071157771945
0.5325910664971726
0.30770410904927636


  average, "true nor predicted", 'F-score is', len(true_sum)


# Logistic Regression Classifier

In [0]:
LABEL_COLS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
              'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

classifier = OneVsRestClassifier(LogisticRegression(solver='sag', n_jobs=1))

preds_list = []
labels_list = []

for label in LABEL_COLS:
  print('\n... Processing {}'.format(label))

  # train the model using X_dtm & y
  y = train[label]
  ty = test[label]

  labels_list.append(ty)

  classifier.fit(X_dtm, y)
  y_pred_X = classifier.predict(X_dtm)

  # make predictions on test
  test_y = classifier.predict(test_X_dtm)
  preds_list.append(test_y)

  print(f'F1 Score: {f1_score(ty, test_y)}')


... Processing anger
F1 Score: 0.6

... Processing anticipation
F1 Score: 0.013921113689095127

... Processing disgust
F1 Score: 0.5609756097560976

... Processing fear
F1 Score: 0.4413145539906104

... Processing joy
F1 Score: 0.695578947368421

... Processing love
F1 Score: 0.332824427480916

... Processing optimism
F1 Score: 0.4970828471411902

... Processing pessimism
F1 Score: 0.021052631578947368

... Processing sadness
F1 Score: 0.42533229085222835

... Processing surprise
F1 Score: 0.06818181818181818

... Processing trust
F1 Score: 0.0


In [0]:
print(f1_score(labels_list, preds_list, average='micro'))
print(f1_score(labels_list, preds_list, average='macro'))
print(jaccard_score(labels_list, preds_list, average='samples'))

0.4965747409098894
0.42153815806776834
0.2262010692159819


  average, "true nor predicted", 'F-score is', len(true_sum)
