Importing libraries

In [None]:
# Importing libraries
import pandas as pd
import sklearn
import numpy  as np
import string
import re

# model libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

#NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import spacy
from spacy.symbols import nsubj, VERB
nlp = spacy.load("nl_core_news_sm")
#from spacy.lang.nl.stop_words import STOP_WORDS

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
lemmatizer = nltk.stem.WordNetLemmatizer()
import seaborn as sns

In [None]:
# Full dataset that is split into train / validation / test further down
df = pd.read_csv("classification_cleaned_data.csv")

# Manually labelled datasets, one file per label (question, concern, doubt)
df_test_q, df_test_c, df_test_d = map(
    pd.read_csv,
    (
        "question_training_dataset.csv",
        "concern_training_dataset.csv",
        "doubt_training_dataset.csv",
    ),
)

In [None]:
# Two-stage split with a fixed seed: first hold out 40% of the rows, then cut
# that hold-out in half -> 60% train / 20% validation / 20% test overall.
train_data, remaining_data = train_test_split(
    df,
    test_size=0.4,
    random_state=42,
)
val_data, test_data = train_test_split(
    remaining_data,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Make sure the text column is of type str on all three splits.
# `.assign` builds a new frame instead of writing a column into the frames
# returned by train_test_split; that in-place write can trigger pandas'
# SettingWithCopyWarning because those frames are slices of `df`.
# NOTE(review): astype(str) turns missing values into the literal string "nan";
# confirm that `clean_text` contains no NaN in the cleaned dataset.
train_data = train_data.assign(clean_text=train_data['clean_text'].astype(str))
test_data = test_data.assign(clean_text=test_data['clean_text'].astype(str))
val_data = val_data.assign(clean_text=val_data['clean_text'].astype(str))

In [None]:
# pandas, train_test_split and TfidfVectorizer are already imported in the
# import cell at the top of the notebook, so they are not re-imported here.

# TF-IDF vectorizer: terms must occur in at least 2 documents and in at most
# 40% of the documents; rows are L2-normalised.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.4, norm='l2')

# --- Training split -----------------------------------------------------------
# The vectorizer is fitted on the training texts only; every other split below
# is merely transformed with that fitted vocabulary.
X_train = vectorizer.fit_transform(train_data['clean_text'])
y_train_q = train_data['is_question'].astype(int)
y_train_c = train_data['is_concern'].astype(int)
y_train_d = train_data['is_doubt'].astype(int)

# --- Validation split ---------------------------------------------------------
X_val = vectorizer.transform(val_data['clean_text'])
y_val_q = val_data['is_question'].astype(int)
y_val_c = val_data['is_concern'].astype(int)
y_val_d = val_data['is_doubt'].astype(int)

# --- Held-out split from the full dataset -------------------------------------
# Will not be used for evaluation, since the manually labelled datasets serve
# as the test sets.
X_test = vectorizer.transform(test_data['clean_text'])
y_test_q = test_data['is_question'].astype(int)
y_test_c = test_data['is_concern'].astype(int)
y_test_d = test_data['is_doubt'].astype(int)

# --- Manually labelled test sets ----------------------------------------------
# Raw texts are kept next to the features so false positives / negatives can
# be printed in readable form.
X_test_original_q = df_test_q['clean_text']
X_test_original_c = df_test_c['clean_text']
X_test_original_d = df_test_d['clean_text']

X_test_manual_q = vectorizer.transform(df_test_q['clean_text'])
X_test_manual_c = vectorizer.transform(df_test_c['clean_text'])
X_test_manual_d = vectorizer.transform(df_test_d['clean_text'])

y_test_manual_q = df_test_q['is_question'].astype(int)
y_test_manual_c = df_test_c['is_concern'].astype(int)
y_test_manual_d = df_test_d['is_doubt'].astype(int)

# Checking the size of the train and validation set
print("Training Set Size:", X_train.shape)
print("Validation Set Size:", X_val.shape)

Oversampling

In [None]:
# One seeded SMOTE sampler, applied separately to each label's training data
smote = SMOTE(random_state=42)
X_train_q_res, y_train_q_res = smote.fit_resample(X_train, y_train_q)
X_train_c_res, y_train_c_res = smote.fit_resample(X_train, y_train_c)
X_train_d_res, y_train_d_res = smote.fit_resample(X_train, y_train_d)

# Wrap the resampled labels in pandas Series so value_counts() is available
y_train_q_res_series, y_train_c_res_series, y_train_d_res_series = map(
    pd.Series,
    (y_train_q_res, y_train_c_res, y_train_d_res),
)

# Occurrences of each class after resampling, for question / concern / doubt
class_counts_q, class_counts_c, class_counts_d = (
    resampled_labels.value_counts()
    for resampled_labels in (
        y_train_q_res_series,
        y_train_c_res_series,
        y_train_d_res_series,
    )
)

# Printed in the order question, concern, doubt (one block per label)
print(class_counts_q, class_counts_c, class_counts_d, sep="\n")


In [None]:
def _resolve_original_texts(target_name):
    """Return the manually labelled raw-text column that matches ``target_name``.

    The match is a case-insensitive substring test on 'question', 'concern'
    and 'doubt'. A name that matches none of them falls back to the question
    texts, which is what the evaluation used unconditionally before.
    """
    texts_by_keyword = {
        'question': X_test_original_q,
        'concern': X_test_original_c,
        'doubt': X_test_original_d,
    }
    lowered_name = str(target_name).lower()
    for keyword, texts in texts_by_keyword.items():
        if keyword in lowered_name:
            return texts
    return X_test_original_q


def grid_search_and_evaluate_2(X_train, y_train, X_test, y_test, target_name,
                               X_test_original=None):
    """Tune a MultinomialNB model, evaluate it on a test set and plot the results.

    Steps: grid search over ``alpha`` (5-fold CV, macro F1), 10-fold
    cross-validation of the best model on the training data, classification
    report and confusion matrix on the test set, a printout of up to 10 false
    positives and 10 false negatives, and a plot of the per-fold CV scores.

    Parameters
    ----------
    X_train, y_train
        Training features and binary (0/1) labels.
    X_test, y_test
        Test features and binary (0/1) labels.
    target_name : str
        Label name used in printed messages and plot titles.
    X_test_original : sequence of str, optional
        Raw texts of the test rows, in the same order as ``X_test``. When
        omitted, the column is chosen from the notebook-level
        ``X_test_original_q`` / ``_c`` / ``_d`` according to ``target_name``.

    Raises
    ------
    ValueError
        If the number of raw texts differs from the number of test labels.

    Returns
    -------
    None
    """
    # Defining the parameter grid for grid search
    param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]}

    # Setting up the grid search with cross-validation
    grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    print(f"Best parameters for {target_name}: ", grid_search.best_params_)

    # GridSearchCV refits the best estimator on the full training data by
    # default (refit=True), so no additional .fit() call is needed here.
    best_nb_model = grid_search.best_estimator_

    # NOTE(review): the callers in this notebook pass training data that was
    # already SMOTE-resampled, so synthetic samples also land in the CV
    # validation folds and these scores are likely optimistic. Resampling
    # inside each fold (e.g. an imblearn Pipeline) would avoid that.
    cv_scores = cross_val_score(best_nb_model, X_train, y_train, cv=10, scoring='f1_macro')
    print(f"Cross-validation scores for {target_name}: {cv_scores}")
    print(f"Mean cross-validation score for {target_name}: {np.mean(cv_scores)}")

    # Predict and evaluate on the test set
    y_preds = best_nb_model.predict(X_test)
    print(f"Classification report for {target_name}:")
    print(classification_report(y_test, y_preds, digits=4))

    # Plain array of the true labels: keeps the boolean masks and the frame
    # below positional, independent of whatever index y_test carries.
    y_true = np.asarray(y_test)

    # Confusion matrix; labels=[0, 1] guarantees a 2x2 matrix that matches the
    # two tick labels even if one class is absent from y_test / y_preds.
    cm = confusion_matrix(y_true, y_preds, labels=[0, 1])
    _, cm_ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['False', 'True'], yticklabels=['False', 'True'], ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title(f'Confusion Matrix for {target_name}')
    plt.show()

    # Positions of false positives and false negatives
    false_positives = np.flatnonzero((y_preds == 1) & (y_true == 0))
    false_negatives = np.flatnonzero((y_preds == 0) & (y_true == 1))

    # Raw texts that belong to THIS test set (previously always the question
    # texts, which mislabelled the concern and doubt runs).
    if X_test_original is None:
        X_test_original = _resolve_original_texts(target_name)
    X_test_list = list(X_test_original)
    if len(X_test_list) != len(y_true):
        raise ValueError(
            f"Got {len(X_test_list)} original texts but {len(y_true)} test labels "
            f"for {target_name}; pass the matching texts via X_test_original."
        )

    # Making a df for identifying False positives and negatives
    df_test = pd.DataFrame({'clean_text': X_test_list, 'actual': y_true, 'predicted': y_preds})
    false_positive_samples = df_test.iloc[false_positives]
    false_negative_samples = df_test.iloc[false_negatives]

    # Display 10 of the false positives
    print("False Positives:")
    print(false_positive_samples.head(10))

    # Display 10 of the false negatives
    print("\nFalse Negatives:")
    print(false_negative_samples.head(10))

    # Plot cross-validation scores to detect overfitting; the x axis follows
    # the actual number of folds instead of a hard-coded 10.
    _, cv_ax = plt.subplots(figsize=(10, 5))
    cv_ax.plot(range(1, len(cv_scores) + 1), cv_scores, marker='o', linestyle='--')
    cv_ax.set_title(f'Cross-Validation F1 Scores for {target_name}')
    cv_ax.set_xlabel('Fold')
    cv_ax.set_ylabel('F1 Score')
    cv_ax.set_ylim(0, 1)
    cv_ax.grid(True)
    plt.show()

In [None]:
# Run the grid search + evaluation once per label, each on its own resampled
# training data and its own manually labelled test set.
grid_search_and_evaluate_2(
    X_train=X_train_q_res,
    y_train=y_train_q_res,
    X_test=X_test_manual_q,
    y_test=y_test_manual_q,
    target_name="is_question",
)
grid_search_and_evaluate_2(
    X_train=X_train_c_res,
    y_train=y_train_c_res,
    X_test=X_test_manual_c,
    y_test=y_test_manual_c,
    target_name='Concern',
)
grid_search_and_evaluate_2(
    X_train=X_train_d_res,
    y_train=y_train_d_res,
    X_test=X_test_manual_d,
    y_test=y_test_manual_d,
    target_name='Doubt',
)