In [None]:
# Multi-Class Classification Models

In [None]:
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so warnings raised by
# any library in the cells below will not be displayed.
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [None]:
# Restore the variable `dataset_pred_bc` from IPython's %store database.
# NOTE(review): presumably it was saved by another notebook with
# `%store dataset_pred_bc`; if nothing was stored, the name stays undefined
# and the cells below cannot run -- confirm the producing notebook.
%store -r dataset_pred_bc

In [None]:
# `dataset` is the working name used by every later cell.
dataset = dataset_pred_bc
# Show the size and a short preview instead of rendering the whole frame.
print("dataset shape:", dataset.shape)
dataset.head()

In [None]:
# Horizontal bar chart of the number of non-null `clean_content` entries for
# each `cat1` value, sorted by ascending count.
fig, ax = plt.subplots(figsize=(10, 6))

counts_by_concern = dataset.groupby('cat1')['clean_content'].count()
counts_by_concern.sort_values(ascending=True).plot.barh(ylim=0, color='#1f77b4', ax=ax)
#ax.set_title('No. reviews of ethical concerns')
ax.set_xlabel('Number of reviews')
ax.set_ylabel('Ethical concern')

plt.show()

In [None]:
# Non-null `clean_content` count for every `cat1` value.
dataset.groupby('cat1')['clean_content'].count()

In [None]:
def clean_no_concern(dataset, min_count=50, excluded=('Other', 'none', 'Noise')):
    """Keep only well-populated concern categories with usable text.

    Rows are dropped when `clean_content` is null or when `cat1` is one of
    `excluded`. Of the remaining categories, only those with more than
    `min_count` rows are kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns `cat1` and `clean_content`.
    min_count : int, default 50
        A category is kept only if it has strictly more than this many rows
        with non-null `clean_content`.
    excluded : iterable of str, default ('Other', 'none', 'Noise')
        `cat1` values that are always removed.

    Returns
    -------
    pandas.DataFrame
        An independent copy (safe to add columns to) containing only the
        retained rows. The input frame is not modified.
    """
    has_text = dataset['clean_content'].notna()
    is_excluded = dataset['cat1'].isin(list(excluded))
    df = dataset[has_text & ~is_excluded]

    counts = df.groupby('cat1')['clean_content'].count()
    top_list = counts[counts > min_count].index.to_list()

    # Filter the already-cleaned frame (not the raw input) so rows with null
    # text cannot slip back in; copy so callers can assign columns freely.
    result = df[df['cat1'].isin(top_list)].copy()
    print(result.groupby('cat1').clean_content.count())
    return result

In [None]:
# Replace `dataset` with the filtered frame returned by clean_no_concern; the
# function also prints the per-category counts of what it returns.
dataset = clean_no_concern(dataset)

In [None]:
# Same per-category count chart as before, drawn from the current `dataset`.
fig, ax = plt.subplots(figsize=(8, 6))

counts_by_concern = dataset.groupby('cat1')['clean_content'].count()
counts_by_concern.sort_values(ascending=True).plot.barh(
    ylim=0, color='gray', edgecolor='black', ax=ax
)
ax.set_title('No. reviews of ethical concerns')
ax.set_xlabel('no. reviews')
ax.set_ylabel('ethical concerns')

plt.show()

In [None]:
def factorize_concern(dataset):
    """Attach an integer id for every `cat1` value and build lookup tables.

    Ids are assigned by `Series.factorize`, i.e. in order of first appearance
    of each `cat1` value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the column `cat1`. It is not modified.

    Returns
    -------
    dataset : pandas.DataFrame
        A new frame equal to the input plus a `cat1_id` column.
    concern_id_df : pandas.DataFrame
        One row per distinct (`cat1`, `cat1_id`) pair, sorted by `cat1_id`.
    concern_to_id : dict
        Maps each `cat1` value to its id.
    id_to_concern : dict
        Maps each id back to its `cat1` value.
    """
    # `assign` returns a new frame, so the caller's object (often a filtered
    # slice of a larger frame) is never written to in place.
    dataset = dataset.assign(cat1_id=dataset['cat1'].factorize()[0])
    concern_id_df = dataset[['cat1', 'cat1_id']].drop_duplicates().sort_values('cat1_id')
    concern_to_id = dict(concern_id_df.values)
    id_to_concern = dict(concern_id_df[['cat1_id', 'cat1']].values)
    return dataset, concern_id_df, concern_to_id, id_to_concern

In [None]:
# Unpack the frame carrying `cat1_id`, the distinct (cat1, cat1_id) table, and
# the two lookup dictionaries returned by factorize_concern.
dataset, concern_id_df, concern_to_id, id_to_concern = factorize_concern(dataset)

In [None]:
# Display the cat1 -> cat1_id mapping as the cell output.
concern_to_id

In [None]:
def vectorizer(dataset):
    """Build a dense TF-IDF feature matrix from `clean_content`.

    A `TfidfVectorizer` (sublinear TF, unigrams and bigrams, `min_df=5`,
    L2 normalisation) is fitted on `dataset.clean_content`, and the resulting
    sparse matrix is converted to a dense array. The shape of that array is
    printed.

    NOTE(review): the vectorizer is fitted on every row passed in; if the
    output is later split for cross-validation, the vocabulary and IDF weights
    will already have seen the held-out rows -- confirm this is acceptable.

    Returns
    -------
    features : numpy.ndarray
        Dense TF-IDF matrix, one row per row of `dataset`.
    labels : pandas.Series
        The `cat1_id` column of `dataset`.
    """
    tfidf_options = dict(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        encoding='latin-1',
        ngram_range=(1, 2),
    )
    tfidf = TfidfVectorizer(**tfidf_options)
    sparse_features = tfidf.fit_transform(dataset.clean_content)
    features = sparse_features.toarray()
    labels = dataset.cat1_id
    print("features: ", features.shape)
    return features, labels

In [None]:
# Build the TF-IDF feature matrix and the matching `cat1_id` labels.
features, labels = vectorizer(dataset)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_val_score, cross_validate

def init_models():
    """Create the five unfitted classifiers compared in this notebook.

    Returns
    -------
    list
        `[random forest, linear-kernel SVC, multinomial naive Bayes,
        logistic regression, MLP]`, always in this order, each configured
        with the fixed hyper-parameters below.
    """
    forest_params = {
        'random_state': 1,
        'bootstrap': False,
        'max_depth': 150,
        'max_features': 'log2',
        'min_samples_leaf': 1,
        'min_samples_split': 0.0015,
        'n_estimators': 200,
    }
    logreg_params = {
        'solver': 'lbfgs',
        'C': 23,
        'random_state': 2,
        'max_iter': 2500,
    }
    mlp_params = {
        'hidden_layer_sizes': (15,),
        'random_state': 5,
        'alpha': 0.0008,
    }

    return [
        RandomForestClassifier(**forest_params),
        SVC(C=12, kernel='linear', random_state=5),
        MultinomialNB(alpha=0.4, fit_prior=False),
        LogisticRegression(**logreg_params),
        MLPClassifier(**mlp_params),
    ]

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_X_y
import time
from statistics import mean


def _fold_scores(y_true, y_pred, labels):
    """Return accuracy and macro-averaged F1/recall/precision for one prediction set."""
    macro = dict(labels=labels, zero_division=0, average='macro')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, **macro),
        "recall": recall_score(y_true, y_pred, **macro),
        "precision": precision_score(y_true, y_pred, **macro),
    }


def run_models(X, y, n_splits=10, random_state=42, output_path="multi_final.csv"):
    """Cross-validate every classifier from `init_models` and tabulate mean scores.

    Each model is fitted and scored on the same stratified, shuffled folds.
    For every fold, accuracy and macro-averaged F1, recall and precision are
    computed on both the held-out part and the training part; the per-fold
    values are then averaged per classifier.

    Macro averages are taken over every class present in `y`. A class that a
    model never predicts therefore contributes a score of 0 (via
    `zero_division=0`) instead of being left out of the average.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; validated (together with `y`) by `check_X_y`.
    y : array-like of shape (n_samples,)
        Class labels.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int or None, default 42
        Seed for the fold shuffling. With an integer the folds are
        reproducible and identical for all classifiers.
    output_path : str, default "multi_final.csv"
        Where the result table is written as CSV.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with float columns `test_mean_*` and
        `train_mean_*` for accuracy, f1, recall and precision, plus
        `running time (sec)`, which holds the total wall-clock time of the
        whole run repeated on every row.

    Raises
    ------
    ValueError
        If `init_models` does not return one model per expected classifier name.
    """
    X, y = check_X_y(X, y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    all_labels = np.unique(y)

    start_time = time.time()
    classifier_names = ["RF", "SVM", "MNB", "LR", "MLP"]
    models = init_models()
    # Fail before any training if the model list and the name list drift apart.
    if len(models) != len(classifier_names):
        raise ValueError(
            "init_models() returned %d models but %d classifier names are defined"
            % (len(models), len(classifier_names))
        )

    metric_names = ("accuracy", "f1", "recall", "precision")
    results = {"classifier": classifier_names}
    for split in ("test", "train"):
        for metric in metric_names:
            results["%s_mean_%s" % (split, metric)] = []

    for m in models:
        per_fold = {"test": [], "train": []}
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            m.fit(X_train, y_train)

            y_pred = m.predict(X_test)
            y_pred_train = m.predict(X_train)

            print("calculating scores fold: " + str(i) + "...")
            per_fold["test"].append(_fold_scores(y_test, y_pred, all_labels))
            per_fold["train"].append(_fold_scores(y_train, y_pred_train, all_labels))

            print("preparing for next fold...\n")

        # Store plain floats (not one-element lists) so the table and the CSV
        # hold numeric values.
        for split in ("test", "train"):
            for metric in metric_names:
                column = "%s_mean_%s" % (split, metric)
                results[column].append(float(mean(s[metric] for s in per_fold[split])))

        print("\npreparing for next classifier...")
        print("--- %s seconds ---" % (time.time() - start_time))

    results["running time (sec)"] = [time.time() - start_time] * len(models)
    df = pd.DataFrame(results)
    df.to_csv(output_path)
    print("Results saved to " + str(output_path))
    print(df)
    return df


In [None]:
# Run the cross-validated comparison of all classifiers; the returned result
# table is also shown as the cell output.
run_models(features, labels)