# Import libraries

In [None]:
#Importing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, f1_score

# Train/test split of data

In [None]:
# Read the working dataset (CSV) into a DataFrame.
working_df = pd.read_csv('/kaggle/input/project-3/working_df.csv')

In [None]:
# Features come from the `text_translated` column; the target is `hateful`.
X = working_df['text_translated']
y = working_df['hateful']

# Split into train and test sets, stratifying on the target so both splits
# keep the same class proportions; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Functions to be used (or should use a Python script to import functions)

# Approach 1A: TF-IDF vectorization + Multinomial Naive Bayes 

## Preprocessing & Vectorizing

Rationale: 
- TF-IDF vectorization is selected for the first approach as it assigns heavier weight to less frequent tokens and less weight to more frequent tokens which might provide a better basis for training a more effective model.  

The data will undergo preprocessing with TF-IDF vectorizing with:
- the use of the standard English stopwords filter
- maximum features of up to 10,000
- maximum appearance of a word across 95% of all documents
- minimum appearance of a word across 1% of all documents

In [None]:
# Instantiate the TF-IDF vectorizer.
# NOTE(review): the accompanying notes mention a cap of 10,000 features, but
# max_features here is 1000 — confirm which value is intended.
tvec1A = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.95,
    min_df=0.01,
)

In [None]:
# Fit the vectorizer on the training text only, then apply the already-fitted
# vectorizer to the test text.
X_train_tvec1A = tvec1A.fit_transform(X_train)
X_test_tvec1A = tvec1A.transform(X_test)

## Modeling

Rationale:
- Multinomial Naive Bayes is selected due to its relative speed in training, and because it can work with the fractional counts of tokens produced by the TF-IDF vectorizer. Despite the unsatisfied assumption of independent tokens, it usually still provides decent results.

In [None]:
# Create the Multinomial Naive Bayes classifier with its default settings.
nb1A = MultinomialNB()

In [None]:
# Train the classifier on the vectorized training data.
model1A = nb1A.fit(X_train_tvec1A, y_train)

In [None]:
# Predict labels for both splits so train and test scores can be compared.
train_pred_model1A = model1A.predict(X_train_tvec1A)
test_pred_model1A = model1A.predict(X_test_tvec1A)

## Evaluation

Given that the dataset is imbalanced, we will explore metrics other than the accuracy score, specifically:

- Recall **(Higher consideration)**
    - Penalises false negatives (type II errors), since we want to ensure that a hateful comment is successfully flagged whenever it is present.
- Precision **(Lesser consideration)**
    - How accurate the detected hateful comment predictions are.
- F1 score (Balance between Precision and Recall for overall performance)

In [None]:
# Build the confusion matrix from the test-set predictions.
cm1A = confusion_matrix(y_test, test_pred_model1A)

# Plot the matrix with readable class names on the axes.
disp1A = ConfusionMatrixDisplay(
    confusion_matrix=cm1A,
    display_labels=['Non-hateful', 'Hateful'],
)
disp1A.plot();

In [None]:
def score_summary(y_true, y_pred):
    """Print the recall, precision and F1 scores of a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to be scored against ``y_true``.

    Returns:
        None. The three scores are written to standard output, one per line.
    """
    # (label, metric) pairs, reported in this order; each metric is computed
    # right before it is printed.
    metrics = (
        ('Recall score', recall_score),
        ('Precision score', precision_score),
        ('F1_score', f1_score),
    )
    for label, metric in metrics:
        print(f'{label}:{metric(y_true, y_pred)}')

In [None]:
# Report recall / precision / F1 on the training split.
score_summary(y_true=y_train, y_pred=train_pred_model1A)

In [None]:
# Report recall / precision / F1 on the held-out test split.
score_summary(y_true=y_test, y_pred=test_pred_model1A)

Conclusion:
- Very poor recall score

# PIPELINE CONSTRUCTION STRUCTURE

## Approach XX: vectorizer+model

## Preprocessing & Vectorizing

Rationale:

Parameters of preprocess and vectorizing:

In [None]:
# Template pipeline: a vectorizer step ('cvec') followed by a model step ('nb').
# The step names are the prefixes used when naming grid-search parameters.
pipeXX = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [None]:
import numpy as np

In [None]:
# Reference hyperparameter grids, grouped by pipeline step. Keys follow the
# '<step name>__<parameter>' convention, so the entries can be copied into the
# parameter dict of whichever pipeline uses the matching step names.

#Countvectorize/TFIDF params
cvec_param_grid = {
    'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1, 1), (1, 2)],
}

#Multinomial NB params
nb_param_grid = {
    # np.linspace is a function and must be called, not subscripted.
    'nb__alpha': np.linspace(1, 10, 3),
}

#Logistic Regression
lr_param_grid = {
    'lr__C': np.linspace(1, 10, 3),
    # `penalty` takes the name of a regularisation type, not a number.
    # NOTE(review): the original listed np.logspace(-3, 0, 3) here; that was
    # probably meant as an alternative grid for `lr__C` — confirm.
    'lr__penalty': ['l1', 'l2'],
    # 'l1' is not supported by the default solver; liblinear handles both.
    'lr__solver': ['liblinear'],
}

In [None]:
pipeXX_params={
    'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]