# HW 1

Name: Karan Patel

## Read and split data into sets

Once the headlines are read, they are split into test, validation and test sets. 

In [45]:
import json
import numpy as np
from sklearn.model_selection import train_test_split

raw_data = [eval(line) for line in open ('./drive/MyDrive/Sarcasm_Headlines_Dataset.json', 'r')]
print('Number of records found = ', len(raw_data))

Number of records found =  28619


In [46]:
headlines = np.array([doc['headline'] for doc in raw_data])
labels = np.array([doc['is_sarcastic'] for doc in raw_data])

train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

x_train, x_test, y_train, y_test = train_test_split(headlines, labels, test_size=1 - train_ratio, stratify=labels)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), stratify=y_test) 

print('Headlines in training set = {}, validation set = {}, test set = {}'.format(len(x_train), len(x_val), len(x_test)))

Headlines in training set = 22895, validation set = 2862, test set = 2862


## Extract Features from Data

Below, the following three transformers are defined which will assist with extracting various features from the headlines in the data:
* `NumOfWordsTransformer`: Return word count for a given headline
* `RepeatingPunctuationsTransformer`: Return counts for repeating punctuations in a given headline. Example: `!!`, `???`
* `EmoFeaturesTransformer`: Following features are computed by this transformer:
  * Return a set of features corresponding to the number of words found in the headline that fall under a given "emotion" bucket. 
  * Optionally return features that correspond to the emotional intensity of words in headline that fall under a given "emotion" bucket. 



In [55]:
import re
import csv
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

class NumOfWordsTransformer(BaseEstimator, TransformerMixin):
  """
  return word count for a given headline
  """
  def __init__(self):
    pass

  def fit(self, X, y = None):
    return self

  def transform(self, X, y = None):
    word_counter = lambda input: len(re.findall(r'\w+', input))
    return [[feature] for feature in list(map(word_counter, X))]

class RepeatingPunctuationsTransformer(BaseEstimator, TransformerMixin):
  """
  return counts for repeating punctuations in a given headline. Example: !!, ???
  """

  def __init__(self):
    pass

  def fit(self, X, y = None):
    return self

  def transform(self, X, y = None):
    repeating_punctuation_counter = lambda input: len(re.findall(r'(([^\w\s])\2+)', input))
    return [[feature] for feature in list(map(repeating_punctuation_counter, X))]  

class EmoFeaturesTransformer(BaseEstimator, TransformerMixin):
  """
  Return a set of features corresponding to the number of words found in the headline that fall under a given "emotion" bucket.
  Optionally return features that correspond to the emotional intensity of words in headline that fall under a given "emotion" bucket.
  """

  EMOTIONS = 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust'

  def __init__(self, add_intensity=True):
    self._emo_lexicon = pd.read_csv('./drive/MyDrive/NRC-Emotion-Intensity-Lexicon-v1.txt', sep='\t')
    self._add_intensity = add_intensity

  def fit(self, X, y = None):
    return self

  def transform(self, X, y = None):
    return [features for features in list(map(self._get_features, X))]  

  def _get_intensity(self, emo_word, emotion):
    """
    get intensity for a given word belonging to a given emotion category
    """

    word_match = self._emo_lexicon['word'] == emo_word
    emotion_match = self._emo_lexicon['emotion'] == emotion

    return self._emo_lexicon[word_match & emotion_match]['emotion-intensity-score'].iloc[0]

  def _get_features(self, input):
    feature_list = []
    input_tokens = input.lower().split()

    for emotion in self.EMOTIONS:
      word_count = 0
      intensity = 0

      emo_words = set(self._emo_lexicon.loc[self._emo_lexicon['emotion'] == emotion]['word'])
      for emo_word in emo_words:
          if emo_word in input_tokens:
            word_count += 1
            intensity += self._get_intensity(emo_word, emotion)

      # add emo word count
      feature_list.append(word_count)
      # optionally add emo intensity
      if self._add_intensity:
        if word_count != 0:
          feature_list.append(intensity / word_count)
        else:
          feature_list.append(0)

    return feature_list

## Naive Bayes Classification Model

### N-Gram features

The model below uses a combination of unigrams, bigrams and trigrams as features. Note that the best parameters based on parameter selection testing are already hard coded into the model below.

In [48]:
from sklearn import metrics
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

text_clf_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(alpha=0.01)),
])

text_clf_pipeline.fit(x_train, y_train)

predicted = text_clf_pipeline.predict(x_test)
print('Test accuracy = ', np.mean(predicted == y_test))

print('\nF-score:\n', metrics.classification_report(y_test, predicted))

scores = cross_val_score(text_clf_pipeline, x_val, y_val, cv=10, scoring='f1_macro')
print('10-fold cross-validation scores = ', scores)
print('Average 10-fold cross-validation score = ', sum(scores) / len(scores))

Test accuracy =  0.8399720475192173

F-score:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1498
           1       0.83      0.83      0.83      1364

    accuracy                           0.84      2862
   macro avg       0.84      0.84      0.84      2862
weighted avg       0.84      0.84      0.84      2862

10-fold cross-validation scores =  [0.75840867 0.71044987 0.7714953  0.72183151 0.69878697 0.74309375
 0.72533859 0.76393047 0.72255173 0.71341622]
Average 10-fold cross-validation score =  0.7329303067323467


For the model above, best parameters were selected using `GridSearchCV` as shown below.

__Note__ : For the other three models, I would not be adding code that I used to select best parameters like I did below for the first model. I'm doing so in the interest of time it takes for the code to run and output the result. I'll be talking more about the parameter selection in the written report.

In [49]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf_pipeline, parameters, cv=10, n_jobs=-1, scoring='f1_macro')

gs_clf = gs_clf.fit(x_train, y_train)
print('Best score: ', gs_clf.best_score_)

print('\nBest parameters:\n')
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best score:  0.8401587748236894

Best parameters:

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 2)


### N-Grams + at-least 3 other features model

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

union = FeatureUnion([('vect', CountVectorizer(ngram_range=(1, 2))),
                      ("num-words", NumOfWordsTransformer()),
                      ("repeating-punctuations", RepeatingPunctuationsTransformer()),
                      ("emo-features-transformer", EmoFeaturesTransformer(add_intensity=True))
                    ])

text_clf_pipeline = Pipeline([
    ('union', union),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(alpha=0.01)),
])

text_clf_pipeline.fit(x_train, y_train)

predicted = text_clf_pipeline.predict(x_test)
print('Test accuracy = ', np.mean(predicted == y_test))

print('\nF-score:\n', metrics.classification_report(y_test, predicted))

scores = cross_val_score(text_clf_pipeline, x_val, y_val, cv=10, scoring='f1_macro')
print('10-fold cross-validation scores = ', scores)
print('Average 10-fold cross-validation score = ', sum(scores) / len(scores))

Test accuracy =  0.8483577917540182

F-score:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1498
           1       0.87      0.80      0.83      1364

    accuracy                           0.85      2862
   macro avg       0.85      0.85      0.85      2862
weighted avg       0.85      0.85      0.85      2862





10-fold cross-validation scores =  [0.74949905 0.71912312 0.76174668 0.72444219 0.70173521 0.75084628
 0.73377518 0.76973784 0.71341622 0.72003012]
Average 10-fold cross-validation score =  0.7344351869126358


## SVM Classification Model

### N-Gram features

In [53]:
from sklearn.svm import LinearSVC

text_clf_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(loss='squared_hinge')),
])

text_clf_pipeline.fit(x_train, y_train)

predicted = text_clf_pipeline.predict(x_test)
print('Test accuracy = ', np.mean(predicted == y_test))

print('\nF-score:\n', metrics.classification_report(y_test, predicted))

scores = cross_val_score(text_clf_pipeline, x_val, y_val, cv=10, scoring='f1_macro')
print('10-fold cross-validation scores = ', scores)
print('Average 10-fold cross-validation score = ', sum(scores) / len(scores))

Test accuracy =  0.8626834381551363

F-score:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1498
           1       0.85      0.87      0.86      1364

    accuracy                           0.86      2862
   macro avg       0.86      0.86      0.86      2862
weighted avg       0.86      0.86      0.86      2862

10-fold cross-validation scores =  [0.77192425 0.7839695  0.78215146 0.79916473 0.76850625 0.81810176
 0.76129222 0.8064405  0.79719288 0.76469577]
Average 10-fold cross-validation score =  0.7853439320589615


### N-Grams + at-least 3 other features model

In [56]:
from sklearn.svm import LinearSVC

union = FeatureUnion([("vect", CountVectorizer(analyzer='word', ngram_range=(1, 2))),
                      ("repeating-punctuations", RepeatingPunctuationsTransformer()),
                      ("emo-features-transformer", EmoFeaturesTransformer(add_intensity=True))
                    ])

text_clf_pipeline = Pipeline([
    ('union', union),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(loss='squared_hinge')),
])

text_clf_pipeline.fit(x_train, y_train)

predicted = text_clf_pipeline.predict(x_test)
print('Test accuracy = ', np.mean(predicted == y_test))

print('\nF-score:\n', metrics.classification_report(y_test, predicted))

scores = cross_val_score(text_clf_pipeline, x_val, y_val, cv=10, scoring='f1_macro')
print('10-fold cross-validation scores = ', scores)
print('Average 10-fold cross-validation score = ', sum(scores) / len(scores))

Test accuracy =  0.8654786862334032

F-score:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1498
           1       0.85      0.87      0.86      1364

    accuracy                           0.87      2862
   macro avg       0.87      0.87      0.87      2862
weighted avg       0.87      0.87      0.87      2862





10-fold cross-validation scores =  [0.82573108 0.78745645 0.77225673 0.79684515 0.75378788 0.77969604
 0.78555713 0.8216587  0.80069686 0.82517483]
Average 10-fold cross-validation score =  0.7948860857806316


## References

Libraries used:
* pandas
* numpy
* scikit-learn