In [None]:
"""
Prediction models for balanced and imbalanced datasets using token count features

Step1: Creation of the features (1-gram, 2-gram, 3-gram, tf-idf, all)
Step2: Division of data in train and test set
Step3: Perform 10-fold Cross Validation for Imbalanced train set
Step4: Classification for Imbalanced train set (Logistic Regression, CART, Naive Bayes, Linear SVM)
Step5: Balance train set
Step6: Perform 10-fold Cross Validation for Balanced train set
Step7: Classification for Balanced train set (Logistic Regression, CART, Naive Bayes, Linear SVM)

"""

In [None]:
import pandas as pd
import re
import pickle as pkl
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyClassifier


#https://pypi.org/project/scikit-plot/
# https://scikit-plot.readthedocs.io/en/stable/metrics.html
import scikitplot as skplt 
import matplotlib.pyplot as plt
from sklearn.utils import resample


# Open files

In [None]:
# Restore the object stored in Preprocessed.pkl (path is relative to the working directory).
# NOTE(review): unpickling can run arbitrary code - only load files from a trusted source.
with open('Preprocessed.pkl', mode='rb') as preprocessed_file:
    Preprocessed = pkl.load(preprocessed_file)

In [None]:
# Restore the object stored in Numerical_Features.pkl (path is relative to the working directory).
# NOTE(review): unpickling can run arbitrary code - only load files from a trusted source.
with open('Numerical_Features.pkl', mode='rb') as numerical_file:
    numerical_features = pkl.load(numerical_file)

In [None]:
# Place the two loaded tables side by side (column-wise concatenation) to get
# a single frame holding every candidate feature plus the label.
Features = pd.concat((Preprocessed, numerical_features), axis="columns")

# Create features

In [None]:
# Name of the column the classifiers have to predict.
target = 'Label'

# Columns passed to the models: every column of Features except the target
# itself and the 'Emojie' and 'POS' columns.
features_pred = [
    column
    for column in Features.columns.values
    if column not in ('Label', 'Emojie', 'POS')
]

# The numeric descriptor columns that are present in Features, in the
# frame's own column order.
numeric_features = [
    column
    for column in Features.columns.values
    if column in ('Length', 'Words', 'Avg_word_length', 'Punctuation')
]

In [None]:
#https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column from the input by its key.

    The column is returned through single-bracket indexing (``X[key]``),
    which is the form used in this file in front of the text vectorizers.
    """
    def __init__(self, key):
        # Column label looked up in transform().
        self.key = key

    def fit(self, X, y=None):
        """Stateless step: nothing is learned, the transformer is returned as is."""
        return self

    def transform(self, X):
        """Return the entry of X stored under ``self.key``."""
        column = self.key
        return X[column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column from the input by its key.

    Unlike TextSelector, the column is requested with a one-element list
    (``X[[key]]``), i.e. list-based selection rather than a single label.
    """
    def __init__(self, key):
        # Column label looked up in transform().
        self.key = key

    def fit(self, X, y=None):
        """Stateless step: nothing is learned, the transformer is returned as is."""
        return self

    def transform(self, X):
        """Return X indexed with the one-element list ``[self.key]``."""
        columns = [self.key]
        return X[columns]

In [None]:
def vectorizer(feature_type,key):
    """
    Build the (unfitted) text feature-extraction pipeline for one column.

    Parameters
    ----------
    feature_type : str
        Which representation to build:
        "1gram", "2gram", "3gram" -> CountVectorizer restricted to that n-gram size,
        "tfidf"                   -> TfidfVectorizer with default settings,
        "all_text"                -> FeatureUnion of the four extractors above.
    key : str
        Column handed to TextSelector; the extractor runs on that column only.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A 'selector' + extractor pipeline, or for "all_text" a pipeline with
        a single 'all_text' step wrapping the FeatureUnion.

    Raises
    ------
    ValueError
        If feature_type is not one of the supported names. Previously this
        case printed a success message and then failed with an
        UnboundLocalError on the return statement.
    """
    # feature_type -> (pipeline step name, factory producing a fresh extractor).
    # Factories keep construction lazy, so only the requested extractor(s) are built.
    extractors = {
        "1gram": ('unigram', lambda: CountVectorizer(ngram_range=(1, 1))),
        "2gram": ('bigram', lambda: CountVectorizer(ngram_range=(2, 2))),
        "3gram": ('trigram', lambda: CountVectorizer(ngram_range=(3, 3))),
        "tfidf": ('tfidf', lambda: TfidfVectorizer()),
    }

    def _column_pipeline(step_name, make_extractor):
        # Select the text column first, then apply the extractor to it.
        return Pipeline([
            ('selector', TextSelector(key=key)),
            (step_name, make_extractor()),
        ])

    if feature_type == "all_text":
        # One sub-pipeline per extractor, concatenated feature-wise.
        union = FeatureUnion([
            (step_name, _column_pipeline(step_name, make_extractor))
            for step_name, make_extractor in extractors.values()
        ])
        pipeline = Pipeline([('all_text', union)])
    elif feature_type in extractors:
        pipeline = _column_pipeline(*extractors[feature_type])
    else:
        supported = list(extractors) + ["all_text"]
        raise ValueError(
            "Unknown feature_type %r; expected one of %s" % (feature_type, supported)
        )

    print("Vectorizer ",feature_type, " has been created. \n\n")

    return pipeline
    

# PREDICTIONS

In [None]:
def balanced(X_train, Y_train, pipe, balance):
    """
    Prepare the training data and the candidate models for one experiment.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    Y_train : pandas.Series
        Training labels; for balance == 1 the series must be named 'Label'
        and contain the classes "Explicit" and "No hate".
    pipe : sklearn pipeline
        Feature-extraction pipeline placed in front of every classifier.
        The same object is shared by all returned models.
    balance : int
        0 -> keep the training set as it is and return LR, CART, NB and SVM.
        1 -> upsample the smaller of the two classes to the size of the
             larger one and return LR and SVM only.

    Returns
    -------
    list
        [Y_train, X_train, models], where models is a list of
        (name, Pipeline) tuples.

    Raises
    ------
    ValueError
        If balance is neither 0 nor 1, or if one of the two classes has no
        rows in the training set when balance == 1.
    """
    if balance not in (0, 1):
        # Previously this fell through and failed on the unbound `models`.
        raise ValueError("balance must be 0 (imbalanced) or 1 (balanced), got %r" % (balance,))

    def _model(name, estimator):
        # Every classifier sits behind the shared feature-extraction pipeline.
        return (name, Pipeline([("pipe", pipe), (name, estimator)]))

    models = [_model('LR', LogisticRegression(solver='liblinear', multi_class='ovr'))]
    if balance == 0:
        models.append(_model('CART', DecisionTreeClassifier()))
        models.append(_model('NB', MultinomialNB()))
    models.append(_model('SVM', LinearSVC()))

    if balance == 1:
        train = pd.concat([X_train, Y_train], axis=1)

        # Only these two classes are kept; rows with any other label are dropped.
        hate = train[train.Label == "Explicit"]
        no_hate = train[train.Label == "No hate"]
        if len(hate) == 0 or len(no_hate) == 0:
            raise ValueError(
                'Cannot balance: need rows for both "Explicit" and "No hate" '
                "(got %d and %d)" % (len(hate), len(no_hate))
            )

        # Decide by size which class is the minority instead of assuming it.
        # The old code resampled "No hate" to its own length, which left the
        # class ratio untouched.
        minority, majority = sorted([hate, no_hate], key=len)
        if len(minority) < len(majority):
            minority = resample(minority,
                                replace=True,
                                n_samples=len(majority),
                                random_state=27)

        # Shuffle so the rows are not grouped by class; an unshuffled K-fold
        # would otherwise produce folds containing a single class.
        upsampled = pd.concat([majority, minority]).sample(frac=1, random_state=27)

        # NOTE(review): upsampling duplicates rows, so cross-validation on this
        # set can see copies of the same row in both train and validation folds.
        Y_train = upsampled.Label
        X_train = upsampled.drop('Label', axis=1)

    return([Y_train, X_train, models])

In [None]:
def model_accuracy(pipe, balance):
    """
    Run 10-fold cross-validation (accuracy) for every model of an experiment.

    Parameters
    ----------
    pipe : object
        Unused; kept so existing calls keep working.
    balance : sequence
        [Y_train, X_train, models] as returned by ``balanced``.

    Returns
    -------
    dict
        Model name -> array with the ten per-fold accuracy scores. The
        mean and standard deviation of each model are also printed.
    """
    Y_train, X_train, models = balance[0], balance[1], balance[2]

    seed = 7
    scoring = 'accuracy'

    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise a ValueError when random_state is set on an
    # unshuffled KFold. Shuffling also protects against class-ordered input.
    # The splitter is loop-invariant: a fixed integer seed yields the same
    # folds for every model, so the scores are comparable.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    scores = {}
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
        scores[name] = cv_results
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg,"\n\n")

    return scores

In [None]:
def prediction(X_test, Y_test, pipe, balance):
    """
    Fit every model of an experiment and report its performance on the test set.

    For each (name, model) pair the model is fitted on the training data
    carried in ``balance``, used to predict ``X_test``, and the accuracy,
    confusion matrix and classification report are printed, followed by a
    confusion-matrix plot.

    ``balance`` is the [Y_train, X_train, models] list; ``pipe`` is accepted
    but not used inside this function.
    """
    train_labels, train_features, candidates = balance[:3]

    for name, estimator in candidates:

        print("Prediction for ",name)

        # Fit on the training data, then score on the held-out test data.
        estimator.fit(train_features, train_labels)
        predicted = estimator.predict(X_test)

        accuracy = accuracy_score(Y_test, predicted)
        print("Accuracy Score \n",accuracy,"\n")

        matrix = confusion_matrix(Y_test, predicted)
        print("Confusion Matrix \n",matrix,"\n")

        report = classification_report(Y_test, predicted)
        print("Classification Report \n",report,"\n\n")

        print("Confusion matrix plot")
        skplt.metrics.plot_confusion_matrix(Y_test, predicted, normalize=False)
        plt.show()

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(Features[features_pred], Features[target], test_size=0.33, random_state=42)

# Experiment grid: balance mode -> (banner, feature types, text columns).
# Mode 0 trains on the training set as split, mode 1 on the balanced one.
# The loop used to be `for i in range(1)`, which made the balanced branch
# unreachable; remove an entry here to skip a mode deliberately.
EXPERIMENTS = {
    0: ("IMBALANCED DATASET ",
        ["1gram", "2gram", "3gram", "tfidf", "all_text"],
        ['NoLaugh','Stemming','NoPunctuation']),
    1: ("BALANCED DATASET ",
        ["1gram", "tfidf", "all_text"],
        ['Stemming']),
}

for i, (banner, features, keys) in EXPERIMENTS.items():
    print(banner, i)

    for key in keys:
        for name in features:
            print ("Column name :", key)

            pipe = vectorizer(name,key)
            balance = balanced(X_train, Y_train, pipe, i)

            if (name=="1gram"):
                # Majority-class baseline, reported once per text column.
                # It is fitted on the training set as split, not on the
                # data returned by balanced().
                print("Dummy Here!")

                pipeline_dummy = make_pipeline(pipe,  DummyClassifier(strategy='most_frequent', random_state=0))
                pipeline_dummy.fit(X_train, Y_train)
                predictions = pipeline_dummy.predict(X_test)

                print("Accuracy Score \n",accuracy_score(Y_test, predictions),"\n")
                print("Confusion Matrix \n",confusion_matrix(Y_test, predictions),"\n")
                print("Classification Report \n",classification_report(Y_test, predictions),"\n\n")

            # Cross-validated accuracy on the training data, then the
            # held-out test evaluation for every model.
            model_accuracy( pipe,balance)

            prediction(X_test, Y_test, pipe, balance)