In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import joblib
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

In [3]:
df = joblib.load('/home/hamza/Documents/sentiments/data/01_processed_data.joblib')
df.shape

(11109, 2)

In [3]:
## Upcasting
def augmentMyData(df, augmenter, label, repetitions=1, samples=200):
    from sklearn.utils import shuffle
    augmented_texts = []
    # select only the minority class samples
    spam_df = df[df['Sentiment'] == label].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(spam_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(spam_df['processed_text'].iloc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'Sentiment': label,
        'processed_text': augmented_texts
    }
    
    aug_df = pd.DataFrame(data)
    dff = shuffle(df.append(aug_df).reset_index(drop=True))
    return dff


In [4]:
import nlpaug.augmenter.word.context_word_embs as aug

augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
aug = augmentMyData(df, augmenter, samples=1000, label=0)
aug_df = augmentMyData(aug, augmenter, samples=1500, label=1)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1500 [00:00<?, ?it/s]

In [5]:
df1 = aug_df[aug_df['Sentiment']==0]
df2 = aug_df[aug_df['Sentiment']==1]
df4 = aug_df[aug_df['Sentiment']==2][:4000]

aug_df = pd.concat([df1, df2,df4])# df3, df4])

aug_df['Sentiment'].value_counts()

2    4000
0    3753
1    3169
Name: Sentiment, dtype: int64

In [7]:
def tfidf(X_train, X_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

    vectorizer = TfidfVectorizer(max_features=2000)
    X_train = vectorizer.fit_transform(X_train)
    X_train = X_train.toarray()
    print(X_train.shape)

    X_test = vectorizer.transform(X_test)
    X_test = X_test.toarray()
    print(X_test.shape)
    
    return X_train, X_test, vectorizer

In [8]:
def split_data(dfff, ratio):
    Y = dfff['Sentiment'].values
    X = dfff['processed_text']
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    from sklearn.model_selection import train_test_split

    # Random sampling
    return train_test_split(X, Y, test_size=ratio) 

In [9]:
def BestMLAlgo(nlp_model, x_train, x_test, y_train, y_test):
    """Identify best Algo on given dataset"""
    from prettytable import PrettyTable
    
    table = PrettyTable()
    table.field_names = ["Vect", "Model", "Accuracy", "Precision", "Recall", "F1Score", "Log loss", "Roc Auc"]
                        
    import warnings
    warnings.filterwarnings("ignore")
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import SGDClassifier
    from sklearn.calibration import CalibratedClassifierCV
    import xgboost
    from sklearn import svm
    from sklearn.metrics import precision_score, accuracy_score, roc_auc_score,\
                                roc_curve, auc, log_loss, recall_score
    
    base_models = {
        'kNN': KNeighborsClassifier(),
        'Naive Bayes': MultinomialNB(),
        'Log. Reg.': LogisticRegression(),
        'SVM Linear': SGDClassifier(class_weight='balanced', penalty='l2', loss='hinge', random_state=42),
        'SVM Non-linear': svm.SVC(kernel='rbf'),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boost': GradientBoostingClassifier(),
        'Ada Boost': AdaBoostClassifier(),
        'xgboost': xgboost.XGBClassifier(),
    }          
        
    for model_name, model in base_models.items():
        model.fit(x_train, y_train)
        model = CalibratedClassifierCV(model, method="sigmoid")
        model.fit(x_train, y_train)
        y_pred_proba = model.predict_proba(x_test)
        y_pred = model.predict(x_test)
            
        # Performance metrics
        accuracy        = round(accuracy_score(y_test, y_pred), 2)
        precision       = round(precision_score(y_test, y_pred, average='micro'), 2)
        recall          = round(recall_score(y_test, y_pred, average='micro'), 2)
        f1_score        = round((2*recall*precision)/(recall+precision), 2)
        loss            = round(log_loss(y_test, y_pred_proba, eps=1e-15), 2) # , labels=model.classes
        roc_auc         = round(roc_auc_score(y_test, y_pred_proba, multi_class="ovr"), 2)        
        
        table.add_row([nlp_model, model_name, accuracy, precision, recall, f1_score, loss, roc_auc])
    print(table)

In [10]:
X_train, X_test, y_train, y_test = split_data(aug_df, ratio=20)
X_train, X_test, VEC = tfidf(X_train, X_test)


print(X_train.shape, " ", X_test.shape, " ", y_train.shape, " ",y_test.shape)

(10902, 2000)
(20, 2000)
(10902, 2000)   (20, 2000)   (10902,)   (20,)


In [11]:
BestMLAlgo("TFIDF", X_train, X_test, y_train, y_test)

+-------+----------------+----------+-----------+--------+---------+----------+---------+
|  Vect |     Model      | Accuracy | Precision | Recall | F1Score | Log loss | Roc Auc |
+-------+----------------+----------+-----------+--------+---------+----------+---------+
| TFIDF |      kNN       |   0.75   |    0.75   |  0.75  |   0.75  |   0.64   |   0.9   |
| TFIDF |  Naive Bayes   |   0.7    |    0.7    |  0.7   |   0.7   |   0.55   |   0.93  |
| TFIDF |   Log. Reg.    |   0.65   |    0.65   |  0.65  |   0.65  |   0.6    |   0.9   |
| TFIDF |   SVM Linear   |   0.65   |    0.65   |  0.65  |   0.65  |   0.6    |   0.91  |
| TFIDF | SVM Non-linear |   0.75   |    0.75   |  0.75  |   0.75  |   0.62   |   0.91  |
| TFIDF | Decision Tree  |   0.8    |    0.8    |  0.8   |   0.8   |   0.61   |   0.97  |
| TFIDF | Random Forest  |   0.85   |    0.85   |  0.85  |   0.85  |   0.47   |   0.98  |
| TFIDF | Gradient Boost |   0.7    |    0.7    |  0.7   |   0.7   |   0.73   |   0.91  |
| TFIDF | 

In [12]:
joblib.dump(aug_df, "data/oversampled.df")

['data/oversampled.df']

In [13]:
aug_df['Sentiment'].value_counts()

2    4000
0    3753
1    3169
Name: Sentiment, dtype: int64