Importing libraries

In [None]:
# Importing libraries
import os
import re
import string

import numpy as np
import pandas as pd
import sklearn

#NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import spacy
from spacy.symbols import nsubj, VERB
nlp = spacy.load("nl_core_news_sm")
#from spacy.lang.nl.stop_words import STOP_WORDS

# Model implementation
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
lemmatizer = nltk.stem.WordNetLemmatizer()

Loading datasets

In [None]:
# Main labelled dataset; it is split into train / validation / test further down.
df = pd.read_csv("classification_cleaned_data.csv")

# One extra dataset per target (question / concern / doubt), read in that order.
# NOTE(review): these are bound to df_test_* names although the files are called
# *_training_dataset.csv -- confirm which role they are meant to play.
df_test_q, df_test_c, df_test_d = map(
    pd.read_csv,
    (
        "question_training_dataset.csv",
        "concern_training_dataset.csv",
        "doubt_training_dataset.csv",
    ),
)

In [None]:
# Splitting the dataset into 60% training, 20% validation and 20% test data.
train_data, remaining_data = train_test_split(df, test_size=0.4, random_state=42)  # 60% training, 40% remaining
val_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)  # 20% validation, 20% test

# Later cells add columns to these frames. Taking explicit copies makes them
# independent DataFrames, so those assignments do not raise pandas'
# SettingWithCopyWarning about writing to a slice of `df`.
train_data, val_data, test_data = train_data.copy(), val_data.copy(), test_data.copy()

Loading pre-trained embedding model

In [None]:
# Pretrained embedding model.
# The location can be overridden with the DUTCH_W2V_MODEL_PATH environment variable so the
# notebook is not tied to one machine; the original absolute path remains the default.
model_path = os.environ.get(
    'DUTCH_W2V_MODEL_PATH',
    '/home/bekkalim/Documents/dutch-word-embeddings/model.bin',
)

# Fail early with an actionable message instead of a bare loader error.
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        f"Word2Vec model not found at {model_path!r}; "
        "set the DUTCH_W2V_MODEL_PATH environment variable to the model file."
    )

# Load the pre-trained Word2Vec model (binary word2vec format)
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
# Turn a text into a single unit-normalised document vector
def text_to_vector_norm(text):
    """Embed ``text`` as the normalised mean of its in-vocabulary word vectors.

    The text is split on whitespace. Every token found in the vocabulary of the
    module-level ``model`` contributes its vector; tokens that are not in the
    vocabulary are ignored. When no token is known, a zero vector of length
    ``model.vector_size`` is used instead of the mean.

    Parameters
    ----------
    text : str
        Whitespace-separated text to embed.

    Returns
    -------
    The first (only) row returned by ``normalize`` for the document vector.
    """
    vocabulary = model.key_to_index
    known_vectors = [model[token] for token in text.split() if token in vocabulary]

    if known_vectors:
        mean_vector = np.mean(known_vectors, axis=0)
    else:
        # Nothing recognised: fall back to an all-zero vector of the embedding size.
        mean_vector = np.zeros(model.vector_size)

    # normalize() works on a 2-D batch, so wrap the vector in a list and unwrap the row.
    return normalize([mean_vector])[0]

In [None]:
# Make sure clean_text holds strings in every split.
# Missing values are replaced with "" first: astype(str) alone would turn NaN into the
# literal text "nan", which would then be looked up in the embedding vocabulary like a word.
train_data['clean_text'] = train_data['clean_text'].fillna('').astype(str)
test_data['clean_text'] = test_data['clean_text'].fillna('').astype(str)
val_data['clean_text'] = val_data['clean_text'].fillna('').astype(str)

In [None]:
# Applying the embedding function to every dataset
train_data['text_vector'] = train_data['clean_text'].apply(text_to_vector_norm)
test_data['text_vector'] = test_data['clean_text'].apply(text_to_vector_norm)
val_data['text_vector'] = val_data['clean_text'].apply(text_to_vector_norm)

# The per-target frames are not converted to strings in any visible cell, so a missing
# clean_text would reach text.split() as a float and raise AttributeError. Coerce here;
# an empty string produces the zero-vector fallback of text_to_vector_norm.
df_test_q['text_vector'] = df_test_q['clean_text'].fillna('').astype(str).apply(text_to_vector_norm)
df_test_c['text_vector'] = df_test_c['clean_text'].fillna('').astype(str).apply(text_to_vector_norm)
df_test_d['text_vector'] = df_test_d['clean_text'].fillna('').astype(str).apply(text_to_vector_norm)

Oversampling

In [None]:
# Oversample each target separately with SMOTE (same seed for all three targets).
# NOTE(review): X_train, y_train_q, y_train_c and y_train_d are not defined in the
# cells visible here -- confirm they are created earlier, otherwise this cell fails
# on a fresh kernel.
smote = SMOTE(random_state=42)
X_train_q_res, y_train_q_res = smote.fit_resample(X_train, y_train_q)
X_train_c_res, y_train_c_res = smote.fit_resample(X_train, y_train_c)
X_train_d_res, y_train_d_res = smote.fit_resample(X_train, y_train_d)

# Wrap the resampled labels in pandas Series so value_counts() is available
y_train_q_res_series = pd.Series(y_train_q_res)
y_train_c_res_series = pd.Series(y_train_c_res)
y_train_d_res_series = pd.Series(y_train_d_res)

# Class distribution after resampling, per target
class_counts_q = y_train_q_res_series.value_counts()
class_counts_c = y_train_c_res_series.value_counts()
class_counts_d = y_train_d_res_series.value_counts()

# Printed in the order question, concern, doubt
print(class_counts_q, class_counts_c, class_counts_d, sep="\n")


In [None]:
# Function for grid search on parameter tuning, followed by evaluation on a test set
def grid_search_and_evaluate_logreg_2(X_train, y_train, X_test, y_test, target_name, test_texts=None):
    """Tune a logistic-regression classifier with grid search and evaluate it.

    Steps performed:
      1. 5-fold grid search over ``C`` (L2 penalty, lbfgs solver), scored with macro F1.
      2. 10-fold cross-validation of the best estimator on the training data.
      3. Classification report and confusion-matrix plot on the test data.
      4. Listing of up to 10 false positives and 10 false negatives.
      5. Line plot of the cross-validation scores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``GridSearchCV.fit``.
    X_test, y_test
        Test features and labels. False positives / negatives are found by comparing
        against the values 1 and 0, so labels are expected to be encoded that way.
    target_name : str
        Name of the target; only used in printed messages and plot titles.
    test_texts : sequence of str, optional
        Texts aligned row-by-row with ``X_test``, shown next to the misclassified rows.
        When omitted, a module-level ``X_test_original`` is used if one exists (the
        previous behaviour); if neither is available the text column is left out.

    Returns
    -------
    The best estimator found by the grid search, fitted on all of ``X_train``.

    Raises
    ------
    ValueError
        If texts are available but their number differs from the number of test labels.
    """
    # Local import: ConfusionMatrixDisplay is not among the notebook's top-level imports.
    from sklearn.metrics import ConfusionMatrixDisplay

    # Defining the parameter grid for grid search
    param_grid = {
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Inverse of regularization strength
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }

    # Set up and run the grid search with cross-validation
    grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {target_name}: ", grid_search.best_params_)

    # GridSearchCV refits the best configuration on the full training data by default
    # (refit=True), so best_estimator_ is already trained -- no second fit() is needed.
    best_logreg_model = grid_search.best_estimator_

    # Cross-validation on the training data (cross_val_score works on clones of the model).
    # NOTE(review): the calls in this notebook pass SMOTE-resampled data; if synthetic
    # samples end up in validation folds these scores may be optimistic -- consider
    # resampling inside each fold instead.
    cv_scores = cross_val_score(best_logreg_model, X_train, y_train, cv=10, scoring='f1_macro')
    mean_cv_score = np.mean(cv_scores)
    print(f"Cross-validation scores for {target_name}: {cv_scores}")
    print(f"Mean cross-validation score for {target_name}: {mean_cv_score}")

    # Work with a plain array so comparisons and DataFrame construction are positional,
    # regardless of the index a pandas Series y_test may carry.
    y_true = np.asarray(y_test)

    # Predict and evaluate on the test set
    y_preds = best_logreg_model.predict(X_test)
    print(f"Classification report for {target_name}:")
    print(classification_report(y_true, y_preds, digits=4))

    # Confusion matrix. Drawn with scikit-learn's own display, because seaborn (`sns`)
    # is not imported by this notebook and the former heatmap call raised NameError.
    cm = confusion_matrix(y_true, y_preds)
    # The False/True tick labels only make sense for a 2x2 matrix.
    tick_labels = ['False', 'True'] if cm.shape == (2, 2) else None
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tick_labels).plot(
        cmap='Blues', values_format='d', ax=ax
    )
    ax.set(xlabel='Predicted', ylabel='Actual', title=f'Confusion Matrix for {target_name}')
    plt.show()

    # Row positions of false positives and false negatives
    false_positives = np.flatnonzero((y_preds == 1) & (y_true == 0))
    false_negatives = np.flatnonzero((y_preds == 0) & (y_true == 1))

    # Resolve the texts to display: explicit argument first, legacy global second.
    if test_texts is None:
        test_texts = globals().get("X_test_original")

    columns = {'actual': y_true, 'predicted': y_preds}
    if test_texts is not None:
        texts = list(test_texts)
        if len(texts) != len(y_true):
            raise ValueError(
                f"Got {len(texts)} test texts but {len(y_true)} test labels for {target_name}; "
                "pass test_texts aligned with X_test."
            )
        columns = {'clean_text': texts, **columns}

    # Create DataFrame for test data
    df_test = pd.DataFrame(columns)

    # Extract false positives and false negatives
    false_positive_samples = df_test.iloc[false_positives]
    false_negative_samples = df_test.iloc[false_negatives]

    # Display 10 of the false positives
    print("False Positives:")
    print(false_positive_samples.head(10))

    # Display 10 of the false negatives
    print("\nFalse Negatives:")
    print(false_negative_samples.head(10))

    # Plotting cross-validation scores
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, len(cv_scores) + 1), cv_scores, marker='o', linestyle='--', color='b', label='Cross-Validation F1 Score')
    ax.axhline(mean_cv_score, color='r', linestyle='-', label=f'Mean F1 Score: {mean_cv_score:.4f}')
    ax.set(title=f'Cross-Validation F1 Scores for {target_name}', xlabel='Fold', ylabel='F1 Score')
    ax.legend()
    plt.show()

    return best_logreg_model


In [None]:
# Tune and evaluate one classifier per target: question, concern, doubt.
# NOTE(review): X_test_manual_* / y_test_manual_* are not defined in the cells visible
# here -- confirm they are created before this cell runs.
grid_search_and_evaluate_logreg_2(
    X_train=X_train_q_res,
    y_train=y_train_q_res,
    X_test=X_test_manual_q,
    y_test=y_test_manual_q,
    target_name="is_question",
)
grid_search_and_evaluate_logreg_2(
    X_train=X_train_c_res,
    y_train=y_train_c_res,
    X_test=X_test_manual_c,
    y_test=y_test_manual_c,
    target_name="is_concern",
)
grid_search_and_evaluate_logreg_2(
    X_train=X_train_d_res,
    y_train=y_train_d_res,
    X_test=X_test_manual_d,
    y_test=y_test_manual_d,
    target_name="is_doubt",
)