In [12]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from multiprocessing.dummy import Pool as ThreadPool
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier, AdaBoostClassifier
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from nltk.corpus import stopwords # Import the stop word list
import sys 
from textblob import TextBlob, Word
from nltk.corpus import wordnet as wn
from sklearn.utils import resample
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

# Raise pandas' display limits so that up to 100 rows and 100 columns
# are rendered in full when frames are inspected later.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

In [2]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../dataset/positiveset.csv')

In [3]:
# Columns that are used as the multi-label targets.
col = ['severe_toxicity','obscene','threat','insult','identity_attack','sexual_explicit']
# Binarise every target column in one vectorised step: values >= 0.5 become 1,
# everything else (including missing values, which compare False) becomes 0.
# This replaces a per-row comprehension that reused `i` for both the column
# name and the cell value.
data[col] = (data[col] >= 0.5).astype(int)


In [4]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
comments = data['comment']
labels = data[col]
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.25, random_state=42
)

In [13]:
# Registry of pipeline steps (vectorisers and classifiers), keyed by the same
# short names that model_full and param_dict use.
model_dict = {
    'cvec': CountVectorizer(),
    'tvec': TfidfVectorizer(),
    'lr': LogisticRegression(solver='lbfgs'),
    'knn': KNeighborsClassifier(),
    'nb': MultinomialNB(),
    # 'dt' has a display name in model_full and a grid in param_dict, so it
    # needs an estimator here as well; without it prepare_pipeline raises KeyError.
    'dt': DecisionTreeClassifier(random_state=42),
    # Randomised estimators are seeded (matching 'ada') so reruns are reproducible.
    'rf': RandomForestClassifier(random_state=42),
    'et': ExtraTreesClassifier(random_state=42),
    'ada': AdaBoostClassifier(random_state=42),
    'ovr_lr': OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=300)),
    'clf_lr': ClassifierChain(LogisticRegression(solver='lbfgs',max_iter=300),random_state=42,order='random'),
    'clf_ada': ClassifierChain(AdaBoostClassifier(random_state=42))
}

# Human-readable names for each short model/vectoriser key; used when
# printing grid-search results.
model_full = dict(
    cvec='CountVectorizer',
    tvec='TfidfVectorizer',
    lr='Logistic Regression',
    knn='KNearestNeighbor',
    nb='Multinomial NB',
    dt='Decision Tree',
    rf='Random Forest',
    et='Extra Tree',
    ada='AdaBoost',
    ovr_lr='OneVsRest (Logistic Regression)',
    clf_lr='Classifier Chain (Logistic Regression)',
    clf_ada='Classifier Chain (Adaboost)',
)

# Hyper-parameter grids for GridSearchCV, keyed by the short step name.
# Inner keys follow the Pipeline convention '<step>__<parameter>'.
# An empty grid means the step is used with the settings it was created with.
param_dict = {}

# --- Vectorisers: both search the same vocabulary-size / df / n-gram space.
param_dict['cvec'] = {
    'cvec__max_features': [5000, 6000, 7000],
    'cvec__min_df': [3, 4],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1, 1), (1, 2)],
}
param_dict['tvec'] = {
    'tvec__max_features': [5000, 6000, 7000],
    'tvec__min_df': [3, 4],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1, 1), (1, 2)],
}

# --- Simple classifiers.
param_dict['knn'] = {'knn__n_neighbors': [5, 6, 7, 8, 9]}
param_dict['lr'] = {}
param_dict['nb'] = {}

# --- Tree-based classifiers.
param_dict['dt'] = {
    'dt__max_depth': [5, 7],
    'dt__min_samples_split': [10, 15],
    'dt__min_samples_leaf': [3, 4],
}
param_dict['rf'] = {
    'rf__n_estimators': [100],
    'rf__max_depth': [5, 7],
    'rf__min_samples_split': [10, 15],
    'rf__min_samples_leaf': [3, 4],
}
param_dict['et'] = {
    'et__n_estimators': [100],
    'et__max_depth': [5, 7],
    'et__min_samples_split': [10, 15],
    'et__min_samples_leaf': [3, 4],
}
param_dict['ada'] = {
    'ada__n_estimators': [50, 100, 200],
    'ada__learning_rate': [0.9, 1],
}

# --- Multi-label wrappers: nothing is tuned on the wrapper itself.
param_dict['ovr_lr'] = {}
param_dict['clf_lr'] = {}
param_dict['clf_ada'] = {}

def prepare_pipeline(list_of_models):
    """
    Build a scikit-learn Pipeline from short step names.

    Parameters
    ----------
    list_of_models: list[str]
        Step names, in execution order. Each name must be a key of model_dict.

    Returns
    -------
    Pipeline
        Pipeline whose steps are (name, model_dict[name]) pairs, in the given order.

    Raises
    ------
    KeyError
        If a name is not present in model_dict.
    """
    steps = []
    for step_name in list_of_models:
        # The step name doubles as the prefix for grid keys ('<name>__<param>').
        steps.append((step_name, model_dict[step_name]))
    return Pipeline(steps)

def add_params(name,pipe_dict):
    """
    Merge the grid registered for one pipeline step into a parameter dictionary.

    Parameters
    ----------
    name: str
        Key of the model/vectorization method in param_dict.
    pipe_dict: Dictionary
        Parameter grid collected so far. It is updated in place.

    Returns
    -------
    Dictionary
        The same pipe_dict object, now also containing the entries of param_dict[name].

    Raises
    ------
    KeyError
        If name is not present in param_dict.
    """
    # Entries from param_dict[name] overwrite existing keys of the same name.
    pipe_dict.update(param_dict[name])
    return pipe_dict

def grid_search(vec_method,model,filename,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test):
    """
    Grid-search a two-step (vectorizer + classifier) pipeline, report its
    scores and pickle the best fitted pipeline.

    Parameters
    ----------
    vec_method: str
        Vectorization method to use. Must be a key of model_dict and param_dict.

    model: str
        Classification model to use. Must be a key of model_dict, model_full
        and param_dict.

    filename: str
        Path of the pickle file the best fitted pipeline is written to.

    X_train: pandas.Series
        Training comments. Defaults to the notebook-level X_train as it was
        when this function was defined.

    y_train: pandas.DataFrame
        Binary label columns for the training comments.

    X_test: pandas.Series
        Held-out comments.

    y_test: pandas.DataFrame
        Binary label columns for the held-out comments.

    Returns
    -------
    array-like
        Predictions of the best pipeline for X_test, as returned by its
        predict method.

    Raises
    ------
    KeyError
        If vec_method or model is missing from one of the lookup dictionaries.
    """
    # Collect the grids of both steps into a single param_grid.
    pipe_params = {}
    pipe_params = add_params(vec_method,pipe_params)
    pipe_params = add_params(model,pipe_params)
    pipe = prepare_pipeline([vec_method,model])
    gs = GridSearchCV(pipe,param_grid=pipe_params,cv=3,n_jobs=3)
    gs.fit(X_train,y_train)
    print(f'Using {model_full[model]} with {model_full[vec_method]}:')
    # NOTE: best_score_ is the mean cross-validated score of the best
    # parameter combination, not a score on the full training set.
    print(f'Train Score: {round(gs.best_score_,4)}')
    print(f'Test Score: {round(gs.score(X_test,y_test),4)}')
    print(f'Using the following parameters: {gs.best_params_}')
    # Persist the refitted best pipeline. The previous code pickled `model`,
    # which is only the string key (e.g. 'ovr_lr'), and left the file open.
    with open(filename, 'wb') as model_file:
        pickle.dump(gs.best_estimator_, model_file)
    return gs.best_estimator_.predict(X_test)

In [48]:
# CountVectorizer + OneVsRest logistic regression; keep the test-set predictions.
ovrlr_cvec_predictions = grid_search(vec_method='cvec', model='ovr_lr', filename='cvec_ovrlr.sav')

Using OneVsRest (Logistic Regression) with CountVectorizer:
Train Score: 0.7048
Test Score: 0.7109
Using the following parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 5000, 'cvec__min_df': 4, 'cvec__ngram_range': (1, 2)}


In [49]:
# TfidfVectorizer + OneVsRest logistic regression; keep the test-set predictions.
ovrlr_tvec_predictions = grid_search(vec_method='tvec', model='ovr_lr', filename='tvec_ovrlr.sav')

Using OneVsRest (Logistic Regression) with TfidfVectorizer:
Train Score: 0.722
Test Score: 0.7276
Using the following parameters: {'tvec__max_df': 0.9, 'tvec__max_features': 6000, 'tvec__min_df': 3, 'tvec__ngram_range': (1, 1)}


In [55]:
# CountVectorizer + classifier chain of logistic regressions; keep the test-set predictions.
clflr_cvec_predictions = grid_search(vec_method='cvec', model='clf_lr', filename='cvec_clflr.sav')

Using Classifier Chain (Logistic Regression) with CountVectorizer:
Train Score: 0.7061
Test Score: 0.7128
Using the following parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 5000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}


In [56]:
# TfidfVectorizer + classifier chain of logistic regressions; keep the test-set predictions.
clflr_tvec_predictions = grid_search(vec_method='tvec', model='clf_lr', filename='tvec_clflr.sav')

Using Classifier Chain (Logistic Regression) with TfidfVectorizer:
Train Score: 0.7191
Test Score: 0.7254
Using the following parameters: {'tvec__max_df': 0.9, 'tvec__max_features': 5000, 'tvec__min_df': 4, 'tvec__ngram_range': (1, 1)}


In [14]:
# TfidfVectorizer + classifier chain of AdaBoost models; keep the test-set predictions.
clfada_tvec_predictions = grid_search(vec_method='tvec', model='clf_ada', filename='tvec_clfada.sav')

Using Classifier Chain (Adaboost) with TfidfVectorizer:
Train Score: 0.7034
Test Score: 0.7068
Using the following parameters: {'tvec__max_df': 0.9, 'tvec__max_features': 6000, 'tvec__min_df': 3, 'tvec__ngram_range': (1, 2)}


In [15]:
# CountVectorizer + classifier chain of AdaBoost models; keep the test-set predictions.
clfada_cvec_predictions = grid_search(vec_method='cvec', model='clf_ada', filename='cvec_clfada.sav')

Using Classifier Chain (Adaboost) with CountVectorizer:
Train Score: 0.6931
Test Score: 0.6977
Using the following parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 5000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}
