In [None]:
# Binary Classification of Ethical Concerns

In [None]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given). NOTE(review): this also hides library warnings raised by
# later cells, e.g. during model fitting — confirm that is wanted.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Restore `df` from IPython's %store persistence. It must have been saved with
# `%store df` beforehand (presumably by an earlier notebook — verify).
%store -r df

In [None]:
# `dataset` is a second name for the same object as `df`, not a copy: columns
# added to `dataset` further down are therefore visible through `df` as well.
dataset = df
dataset.head()

In [None]:
# Reviews whose primary category is anything other than 'Other', 'none' or 'Noise'.
is_concern = ~dataset["cat1"].isin(['Other', 'none', 'Noise'])
df_concern = dataset[is_concern]

print(f"number of all reviews: {len(dataset)}")
print(f"number of reviews with concerns: {len(df_concern)}")

In [None]:
# Binary label: 0 when cat1 is 'Other', 'none' or 'Noise', 1 for every other category.
is_non_concern = dataset["cat1"].isin(['Other', 'none', 'Noise'])
dataset["target"] = np.where(is_non_concern, 0, 1)

# Model input is the review text column; the label is the binary target.
X, y = dataset['content'], dataset['target']

In [None]:
def factorize_concern(dataset):
    """Encode the ``cat1`` column as integer ids and build lookup tables.

    A ``cat1_id`` column is added to ``dataset`` in place; ids are assigned in
    order of first appearance of each ``cat1`` value.

    Returns
    -------
    tuple
        ``(dataset, concern_id_df, concern_to_id, id_to_concern)`` where
        ``dataset`` is the same (mutated) frame, ``concern_id_df`` holds one
        row per distinct ``(cat1, cat1_id)`` pair sorted by id, and the two
        dicts map category -> id and id -> category.
    """
    codes = dataset['cat1'].factorize()[0]
    dataset['cat1_id'] = codes

    # One row per distinct (category, id) pair, ordered by id.
    pairs = dataset[['cat1', 'cat1_id']].drop_duplicates()
    concern_id_df = pairs.sort_values('cat1_id')

    names = concern_id_df['cat1']
    ids = concern_id_df['cat1_id']
    concern_to_id = dict(zip(names, ids))
    id_to_concern = dict(zip(ids, names))
    return dataset, concern_id_df, concern_to_id, id_to_concern

In [None]:
from sklearn.model_selection import train_test_split

# Split the text, the labels and the original row labels together, so that
# predictions can later be written back to the matching rows of `dataset`.
# NOTE(review): test_size=0.75 leaves only 25% of the rows for training —
# confirm this ratio is intentional.
(
    X_train, X_test,
    y_train, y_test,
    indices_train, indices_test,
) = train_test_split(
    X,
    y,
    dataset.index,
    test_size=0.75,
    random_state=42,
)


In [None]:
# Sanity check: number of rows in the held-out (test) split.
len(y_test)

In [None]:
# Display the working frame, which by now carries the binary `target` column.
dataset

In [None]:
# Start from a TF-IDF vectorizer with its default settings
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

# The vocabulary and IDF weights are learned on the training text only;
# the test text is transformed with that already-fitted vectorizer.
train_sparse = tfidf.fit_transform(X_train)
test_sparse = tfidf.transform(X_test)

# Dense copies of the TF-IDF matrices, used by the cells below
X_train_tfidf = train_sparse.toarray()
X_test_tfidf = test_sparse.toarray()

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
models = {}

# Random Forest
# Seeded: the forest is built with random sampling, so without a fixed seed
# the reported metrics change from run to run.
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier(random_state=42)

# Support Vector Machines
# Seeded for the same reason (the solver may shuffle the data).
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC(random_state=42)

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
models['Naive Bayes'] = MultinomialNB(alpha=0.4, fit_prior=False)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# MLP Classifier
from sklearn.neural_network import MLPClassifier
models['MLP'] = MLPClassifier(
    hidden_layer_sizes=(15,),
    random_state=5,
    # max_iter stays at the library default; max_iter=3000 was tried and is
    # currently disabled.
    alpha=0.0008,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set metrics per model, keyed by the model's display name.
accuracy, f1score, precision, recall = {}, {}, {}, {}

for key, model in models.items():

    # Fit the classifier on the TF-IDF training features
    model.fit(X_train_tfidf, y_train)

    # Predict labels for the held-out reviews
    predictions = model.predict(X_test_tfidf)

    # scikit-learn metrics take (y_true, y_pred) in that order. Passing the
    # predictions first leaves accuracy and binary F1 unchanged but swaps
    # precision and recall, so the ground truth must come first.
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)
    f1score[key] = f1_score(y_test, predictions)

In [None]:
# Collect the per-model metrics into one table (one row per model), save it
# next to the notebook and display it.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
        'F1-score': list(f1score.values()),
    },
    index=list(models.keys()),
)
df_model.to_csv('binary_classification_results.csv')
df_model

In [None]:
# NOTE: despite its name, `rf` holds the Naive Bayes classifier, not the
# random forest.
rf = models['Naive Bayes']
# Predicted labels for the test split, in the same row order as X_test and
# indices_test.
preds = rf.predict(X_test_tfidf)
preds

In [None]:
def add_predict(df, preds, indices_test):
    """Write binary-classifier predictions into ``df`` as column ``predicted_bc``.

    The frame is modified in place and also returned. Rows whose label is not
    listed in ``indices_test`` keep the value they already had (NaN when the
    column is created by this call).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; its index must contain every label in
        ``indices_test``.
    preds : iterable
        Predicted labels, each convertible with ``int()``, in the same order
        as ``indices_test``.
    indices_test : iterable
        Index labels of ``df`` that the predictions belong to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``preds`` and ``indices_test`` differ in length; pairing them
        anyway would silently drop or misalign predictions.
    KeyError
        If a label in ``indices_test`` is not present in ``df.index``.
    """
    labels = list(indices_test)
    values = [int(pred) for pred in preds]
    if len(labels) != len(values):
        raise ValueError(
            f"got {len(values)} predictions for {len(labels)} index labels"
        )

    # Create the column up front so it exists (all NaN) even for empty input.
    if "predicted_bc" not in df.columns:
        df["predicted_bc"] = float("nan")

    # One label-based assignment instead of a Python-level loop of scalar writes.
    if labels:
        df.loc[labels, "predicted_bc"] = values
    return df

In [None]:
# add_predict writes into `dataset` and returns that same object, so at this
# point `dataset_pred_bc` and `dataset` are two names for one frame.
dataset_pred_bc = add_predict(dataset, preds, indices_test)

In [None]:
# Keep only the rows that received a prediction, then list the predicted labels.
has_prediction = dataset_pred_bc['predicted_bc'].notna()
dataset_pred_bc = dataset_pred_bc[has_prediction]
dataset_pred_bc['predicted_bc'].unique()

In [None]:
# Narrow down to the rows predicted as a concern (label 1) and show which
# categories they actually carry.
predicted_concern = dataset_pred_bc['predicted_bc'].eq(1)
dataset_pred_bc = dataset_pred_bc[predicted_concern]
dataset_pred_bc['cat1'].unique()

In [None]:
# Persist the filtered frame with %store so that another notebook or session
# can restore it via `%store -r dataset_pred_bc`.
%store dataset_pred_bc

In [None]:
# Non-null `clean_content` entries per (category, true label) pair among the
# rows predicted as a concern.
df_count = (
    dataset_pred_bc
    .groupby(['cat1', 'target'])['clean_content']
    .count()
    .reset_index(name='counts')
)
df_count

In [None]:
# Number of rows predicted as a concern, followed by the size of the test split.
n_predicted_concern = len(dataset_pred_bc[dataset_pred_bc['predicted_bc'] == 1])
n_test_rows = len(X_test_tfidf)
print(n_predicted_concern)
print(n_test_rows)