<a href="https://colab.research.google.com/github/miguel-kjh/Analysis-of-tweets/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import string
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

import nltk
# Fetch the NLTK data packages this notebook asks for. The logged cell
# output shows the calls are no-ops when the packages are already up to date.
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords

# Seed NumPy's global random generator so repeated runs start from the same
# state.
np.random.seed(777)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive holding the train/val/test CSV splits that the
# data-loading cells read.
data_dir = '/content/drive/My Drive/SIANI Master/CI/Práctica/spanish-arilines-tweets-sentiment-analysis'
!ls '/content/drive/My Drive/SIANI Master/CI/Práctica/spanish-arilines-tweets-sentiment-analysis'

baseline.ipynb	      test_data_un.csv	 tweets_public.csv	val_data_ba.csv
sampleSubmission.csv  train_data_ba.csv  tweets_public.xlsx	val_data_un.csv
test_data_ba.csv      train_data_un.csv  tweets_submission.csv


## Modeling

As a baseline, a classical approach is tested: a bag-of-words representation with TF-IDF weighting. The models used are:

- Random Forest
- GaussianNB
- XGBoost


In [None]:
from typing import List, Optional

class Classifier:
  """Pairs an estimator with a display name and its hold-out evaluation.

  ``train`` fits the wrapped estimator and stores the test predictions
  (``predict``), the reference labels (``Ytest``) and the accuracy as a
  percentage rounded to four decimals (``acc``). Instances order and compare
  by ``acc``, so ``<`` and ``==`` need ``train`` to have run first.
  """

  def __init__(self, clf, name: str):
    """Store the estimator ``clf`` and the label ``name`` used in reports."""
    self.name = name
    self.classifier = clf

  def train(self, Xtrain, Ytrain, Xtest, Ytest):
    """Fit on the training split, then predict and score the test split."""
    model = self.classifier
    model.fit(Xtrain, Ytrain)

    self.predict = model.predict(Xtest)
    self.Ytest = Ytest
    fraction_correct = accuracy_score(Ytest, self.predict)
    self.acc = round(fraction_correct * 100, 4)

  def __lt__(self, other):
    """Order by accuracy; defer to the other operand for foreign types."""
    if isinstance(other, type(self)):
      return self.acc < other.acc
    return NotImplemented

  def __eq__(self, other):
    """Equal when the accuracies match; defer for foreign types."""
    if isinstance(other, type(self)):
      return self.acc == other.acc
    return NotImplemented

  def __str__(self):
    """Return name, accuracy and the classification report as text.

    Raises:
      RuntimeError: if an attribute is missing, i.e. ``train`` has not run.
    """
    template = "%s:\nAccurracy: %s\nRecall and Precision:\n%s"
    try:
      summary = (self.name, self.acc)
      report = classification_report(self.Ytest, self.predict)
      return template % (summary + (report,))
    except AttributeError:
      raise RuntimeError("Model not trained")

class ClassifierCollection:
  """An ordered group of classifiers that are trained and compared together.

  Members are expected to expose ``train(Xtrain, Ytrain, Xtest, Ytest)``,
  ``name`` and ``acc``, and to be comparable with ``<`` (used by ``get_max``).
  """

  def __init__(self, collection: Optional[List["Classifier"]] = None) -> None:
    """Wrap ``collection``, or start empty when none is given.

    A list passed in is used as-is (not copied), so later additions and
    sorting are visible to the caller through that same list.
    """
    # Build the empty list per instance: a `[]` default is created once at
    # definition time and would be shared by every argument-less collection.
    self._collection = [] if collection is None else collection

  def add_classifier(self, classifier: "Classifier"):
    """Append ``classifier`` to the end of the collection."""
    self._collection.append(classifier)

  def train(self, Xtrain, Ytrain, Xtest, Ytest):
    """Train every member, in insertion order, on the same data split."""
    for clf in self._collection:
      clf.train(Xtrain, Ytrain, Xtest, Ytest)

  def sort_by_accuracy(self):
    """Sort the members in place by ascending ``acc``."""
    self._collection.sort(key=lambda clf: clf.acc)

  def get_max(self):
    """Return the greatest member according to the members' own ordering.

    Raises:
      ValueError: if the collection is empty (raised by ``max``).
    """
    return max(self._collection)

  def get_list_accuracy(self):
    """Return a ``{name: acc}`` dict; a repeated name keeps the last value."""
    return {clf.name: clf.acc for clf in self._collection}

  def __str__(self):
    """Join the members' string forms with a line of ten ``#`` characters."""
    return ('#'*10 + '\n').join([str(clf) for clf in self._collection])

In [None]:
def get_train_and_test_samples(df_train: pd.DataFrame, df_test: pd.DataFrame, vectorizer,
                               max_features: int = 1000, ngram_range: tuple = (1, 5)) -> tuple:
  """Turn the train/test frames into dense feature matrices and label codes.

  The text column ``'quote'`` is vectorized and the ``'score'`` column is
  integer-encoded. Both the vectorizer and the label encoder are fitted on the
  training frame only and then applied to the test frame, so feature columns
  and label codes mean the same thing in both splits.

  Args:
    df_train: frame with ``'quote'`` and ``'score'`` columns used for fitting.
    df_test: frame with the same two columns, only transformed.
    vectorizer: vectorizer class (not an instance) accepting ``max_features``
      and ``ngram_range``; its instances must offer ``fit_transform`` and
      ``transform`` whose results have ``toarray()``.
    max_features: vocabulary size passed to the vectorizer.
    ngram_range: ``(min_n, max_n)`` n-gram bounds passed to the vectorizer.

  Returns:
    ``(Xtrain, Ytrain, Xtest, Ytest)``.

  Note:
    A test label that never occurs in ``df_train`` can no longer be encoded
    silently; the encoder's ``transform`` is expected to reject it.
  """
  vectorize = vectorizer(max_features=max_features, ngram_range=ngram_range)
  Xtrain    = vectorize.fit_transform(df_train['quote']).toarray()
  # Reuse the vocabulary and weights learned on the training text. Refitting
  # here would give the test matrix unrelated columns.
  Xtest     = vectorize.transform(df_test['quote']).toarray()

  encoder   = LabelEncoder()
  Ytrain    = encoder.fit_transform(df_train['score'])
  # Same reasoning for labels: keep the train-time label -> code mapping.
  Ytest     = encoder.transform(df_test['score'])
  return (Xtrain, Ytrain, Xtest, Ytest)

def get_collection_of_RFs(clf_collection: ClassifierCollection):
  """Register random forests with 25, 50, ..., 275 trees in ``clf_collection``."""
  tree_counts = range(25, 300, 25)
  for n_trees in tree_counts:
    forest = RandomForestClassifier(n_estimators=n_trees)
    label  = "Random Forest n_estimators = %i" % n_trees
    clf_collection.add_classifier(Classifier(forest, label))

def get_collection_of_Bayes(clf_collection: ClassifierCollection):
  """Register one default-configured GaussianNB model in ``clf_collection``."""
  naive_bayes = Classifier(GaussianNB(), "GaussianNB")
  clf_collection.add_classifier(naive_bayes)

def get_collection_of_XGB(clf_collection: ClassifierCollection):
  """Register default XGBClassifier and XGBRFClassifier models, in that order."""
  boosted = Classifier(xgb.XGBClassifier(), "XGB")
  clf_collection.add_classifier(boosted)

  boosted_forest = Classifier(xgb.XGBRFClassifier(), "XGBRF")
  clf_collection.add_classifier(boosted_forest)

The experiment is carried out with both balanced and unbalanced data.

### Balanced

In [None]:
# Balanced files. The validation rows are appended to the training rows, so
# the test file is the only data kept out of fitting.
df_train = pd.read_csv("%s/train_data_ba.csv" % data_dir)
df_val   = pd.read_csv("%s/val_data_ba.csv" % data_dir)
df_test  = pd.read_csv("%s/test_data_ba.csv" % data_dir)
df_train = pd.concat([df_train, df_val])

# These four globals are what the experiment cells train and evaluate on.
Xtrain, Ytrain, Xtest, Ytest = get_train_and_test_samples(
    df_train, df_test, TfidfVectorizer
)

### Unbalanced

In [None]:
# Unbalanced files. The validation rows are appended to the training rows, so
# the test file is the only data kept out of fitting.
df_train = pd.read_csv("%s/train_data_un.csv" % data_dir)
df_val   = pd.read_csv("%s/val_data_un.csv" % data_dir)
df_test  = pd.read_csv("%s/test_data_un.csv" % data_dir)
df_train = pd.concat([df_train, df_val])

# Rebinds Xtrain/Ytrain/Xtest/Ytest: the experiment cells use whichever
# data-loading cell ran last.
Xtrain, Ytrain, Xtest, Ytest = get_train_and_test_samples(
    df_train, df_test, TfidfVectorizer
)

### Experiment


#### Random Forest

In [None]:
# Random-forest sweep: start from an explicit empty list, register the
# forests, then fit and score each one on the current Xtrain/Xtest globals.
clf_collection_tf = ClassifierCollection([])
get_collection_of_RFs(clf_collection_tf)
clf_collection_tf.train(Xtrain, Ytrain, Xtest, Ytest)

In [None]:
# Print the report of the forest that compares greatest, i.e. by accuracy.
print(clf_collection_tf.get_max())

Random Forest n_estimators = 50:
Accurracy: 45.2351
Recall and Precision:
              precision    recall  f1-score   support

           0       0.54      0.60      0.57       375
           1       0.38      0.47      0.42       262
           2       0.14      0.05      0.07       150

    accuracy                           0.45       787
   macro avg       0.36      0.37      0.35       787
weighted avg       0.41      0.45      0.43       787



#### Bayes

In [None]:
# Naive Bayes run: a fresh collection (rebinding clf_collection_tf) holding
# the single GaussianNB model, fitted and scored on the current globals.
clf_collection_tf = ClassifierCollection([])
get_collection_of_Bayes(clf_collection_tf)
clf_collection_tf.train(Xtrain, Ytrain, Xtest, Ytest)

In [None]:
# The collection holds one model here, so this prints the GaussianNB report.
print(clf_collection_tf.get_max())

GaussianNB:
Accurracy: 34.5616
Recall and Precision:
              precision    recall  f1-score   support

           0       0.47      0.41      0.44       375
           1       0.30      0.27      0.28       262
           2       0.22      0.33      0.26       150

    accuracy                           0.35       787
   macro avg       0.33      0.34      0.33       787
weighted avg       0.37      0.35      0.35       787



#### XGBoost

In [None]:
# XGBoost run: a fresh collection (rebinding clf_collection_tf) holding the
# XGB and XGBRF models, fitted and scored on the current globals.
clf_collection_tf = ClassifierCollection([])
get_collection_of_XGB(clf_collection_tf)
clf_collection_tf.train(Xtrain, Ytrain, Xtest, Ytest)

In [None]:
# Print the report of whichever of the two models compares greatest, i.e. by accuracy.
print(clf_collection_tf.get_max())

XGB:
Accurracy: 50.3177
Recall and Precision:
              precision    recall  f1-score   support

           0       0.51      0.94      0.66       375
           1       0.49      0.15      0.23       262
           2       0.27      0.02      0.04       150

    accuracy                           0.50       787
   macro avg       0.42      0.37      0.31       787
weighted avg       0.46      0.50      0.40       787

