## Nick van der Linde Classification Hackathon

This is the notebook for the 2023 Classification hackathon Challenge


## Import Libraries


In [25]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle


## Loading the Data

In [26]:
# Load the training and test datasets
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')

## Preprocessing 

In [27]:
# Turn all charactars into lowercase
train_df['text'] = train_df['text'].str.lower()
test_df['text'] = test_df['text'].str.lower()

# Remove all punctuation
train_df['text'] = train_df['text'].str.replace('[^\w\s]', '')
test_df['text'] = test_df['text'].str.replace('[^\w\s]', '')

# Remove all numbers
train_df['text'] = train_df['text'].str.replace('\d+', '')
test_df['text'] = test_df['text'].str.replace('\d+', '')



## Training the Model

I used a Naive Bayes Model

In [28]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'], train_df['lang_id'], test_size=0.2, random_state=42
)

In [29]:
# Start the Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [30]:
# Fit the Vectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Train Naive Bayes model
nb_model = MultinomialNB(alpha=0.5, class_prior=None, fit_prior=False)
nb_model.fit(X_train_tfidf, y_train)

# Create predictions
y_pred_nb = nb_model.predict(X_val_tfidf)

# Look at F1-Score for model
print("Mean F1-Score (Naive Bayes):", f1_score(y_val, y_pred_nb, average='micro'))


Mean F1-Score (Naive Bayes): 0.9984848484848485


## Hyperparameter Tuning for Model

In [31]:
# Extract unique classes
unique_classes = y_train.unique()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False],
    'class_prior': [None] + [1/len(unique_classes)] * len(unique_classes), 
}

# Initialize GridSearchCV for Naive Bayes
grid_search = GridSearchCV(
    estimator=nb_model,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

# Fit more hyperparameters
grid_search.fit(X_train_tfidf, y_train)
 
# Make predictions
y_pred_nb = grid_search.best_estimator_.predict(X_val_tfidf)



550 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
550 fits failed with the following error:
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Applications/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Applications/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_param

## Creating Submission

In [32]:
# Transform the test set with the Vectorizer
X_test_tfidf_nb = tfidf_vectorizer.transform(test_df['text'])

# Make predictions
test_predictions_nb = nb_model.predict(X_test_tfidf_nb)

# Create and save a submission
submission_df_nb = pd.DataFrame({'index': test_df['index'], 'lang_id': test_predictions_nb})
submission_df_nb.to_csv('Nick_VDL_Hackathon_Submission.csv', index=False)
