In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import tensorflow as tf

import json
import os
import math
import librosa
import numpy as np
import pandas as pd
import pathlib

DATASET_PATH = "Dataset/"
BATCH_SIZE = 32

In [2]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
    
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [3]:
train_df = pd.read_csv(DATASET_PATH + "train.csv")
test_df = pd.read_csv(DATASET_PATH + "test.csv")

In [9]:
train_df.shape

(5250, 1201)

In [33]:
# Split the dataset into features and labels
X = train_df.iloc[:, 1:]  # Features (all columns except the first)
y = train_df.iloc[:, 0]  # Labels (first column)

# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = X_train.values, X_val.values, y_train.values, y_val.values

## RandomForest

In [34]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred)
print("F1 score on the validation set:", f1)

F1 score on the validation set: 0.6205128205128205


## Simple CNN

In [35]:
# Normalize pixel values between 0 and 1
X_train = X_train.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0

# Reshape the data into the required format (assuming images are 20x20x3)
X_train = X_train.reshape(-1, 20, 20, 3)
X_val = X_val.reshape(-1, 20, 20, 3)

In [52]:
import tensorflow as tf

# Define the CNN model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(20, 20, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_m])

# Train the model on the training set
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1, validation_data = (X_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x166465a10>

In [53]:
# Evaluate the model on the validation set
y_pred_proba = model.predict(X_val)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
f1 = f1_score(y_val, y_pred)

print("F1 score on the validation set:", f1)

F1 score on the validation set: 0.6220472440944882


## Complex CNN

In [55]:
import tensorflow as tf

# Define the CNN model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(20, 20, 3)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_m])

# Train the model on the training set
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1, validation_data = (X_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1664206d0>

In [56]:
# Evaluate the model on the validation set
y_pred_proba = model.predict(X_val)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
f1 = f1_score(y_val, y_pred)

print("F1 score on the validation set:", f1)

F1 score on the validation set: 0.5542168674698795


## MesoNet

In [51]:
# Split the dataset into features and labels
X = train_df.iloc[:, 1:]  # Features (all columns except the first)
y = train_df.iloc[:, 0]  # Labels (first column)

# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = X_train.values, X_val.values, y_train.values, y_val.values

# Normalize pixel values between 0 and 1
X_train = X_train.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0

# Reshape the data into the required format (assuming images are 20x20x3)
X_train = X_train.reshape(-1, 20, 20, 3)
X_val = X_val.reshape(-1, 20, 20, 3)

In [63]:
# Define the model architecture
def create_model():
    x = Input(shape=(20, 20, 3))
    x1 = Conv2D(16, (3, 3), padding='same', activation='relu')(x)
    x1 = BatchNormalization()(x1)
    x1 = MaxPooling2D(pool_size=(2, 2), padding='same')(x1)
    x2 = Conv2D(32, (5, 5), padding='same', activation='relu')(x1)
    x2 = BatchNormalization()(x2)
    x2 = MaxPooling2D(pool_size=(2, 2), padding='same')(x2)
    x3 = Conv2D(32, (5, 5), padding='same', activation='relu')(x2)
    x3 = BatchNormalization()(x3)
    x3 = MaxPooling2D(pool_size=(2, 2), padding='same')(x3)
    x4 = Conv2D(64, (5, 5), padding='same', activation='relu')(x3)
    x4 = BatchNormalization()(x4)
    x4 = MaxPooling2D(pool_size=(4, 4), padding='same')(x4)
    y = Flatten()(x4)
    y = Dropout(0.5)(y)
    y = Dense(32)(y)
    y = LeakyReLU(alpha=0.1)(y)
    y = Dense(16)(y)
    y = Dropout(0.5)(y)
    y = Dense(1, activation='sigmoid')(y)
    return tf.keras.Model(inputs=x, outputs=y)

# Create the model
model = create_model()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_m])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x2f35887d0>

In [64]:
# Evaluate the model on the validation set
y_pred_proba = model.predict(X_val)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
f1 = f1_score(y_val, y_pred)

print("F1 score on the validation set:", f1)

F1 score on the validation set: 0.609375


In [65]:
# Define the model architecture
def create_model():
    x = Input(shape=(20, 20, 3))
    x1 = Conv2D(16, (3, 3), padding='same', activation='relu')(x)
    x1 = BatchNormalization()(x1)
    x1 = MaxPooling2D(pool_size=(2, 2), padding='same')(x1)
    x2 = Conv2D(32, (5, 5), padding='same', activation='relu')(x1)
    x2 = BatchNormalization()(x2)
    x2 = MaxPooling2D(pool_size=(2, 2), padding='same')(x2)
    x3 = Conv2D(32, (5, 5), padding='same', activation='relu')(x2)
    x3 = BatchNormalization()(x3)
    x3 = MaxPooling2D(pool_size=(2, 2), padding='same')(x3)
    x4 = Conv2D(64, (5, 5), padding='same', activation='relu')(x3)
    x4 = BatchNormalization()(x4)
    x4 = MaxPooling2D(pool_size=(4, 4), padding='same')(x4)
    y = Flatten()(x4)
    y = Dropout(0.5)(y)
    y = Dense(32)(y)
    y = LeakyReLU(alpha=0.1)(y)
    y = Dense(16)(y)
    y = Dropout(0.5)(y)
    y = Dense(1, activation='sigmoid')(y)
    return tf.keras.Model(inputs=x, outputs=y)

# Create the model
model = create_model()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_m])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=1000, validation_data=(X_val, y_val))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

KeyboardInterrupt: 

In [None]:
# Evaluate the model on the validation set
y_pred_proba = model.predict(X_val)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
f1 = f1_score(y_val, y_pred)

print("F1 score on the validation set:", f1)

In [None]:
y_pred_proba

## Supervised series

In [68]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Define the models
models = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Naive Bayes", GaussianNB())
]

# Train and evaluate each model
for model_name, model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring='f1')
    print(f"{model_name}: F1 Score = {np.mean(scores)}")

Decision Tree: F1 Score = 0.5690915409772377
Random Forest: F1 Score = 0.45226782589262376
XGBoost: F1 Score = 0.5698222598094936
KNN: F1 Score = 0.6642723106907817


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression: F1 Score = 0.6219035677561024
SVM: F1 Score = 0.4683916624918131
Naive Bayes: F1 Score = 0.6527170284542594


In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Define the parameter grid for KNN
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 13, 21, 50, 100],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Perform GridSearchCV for KNN
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_params, cv=10, scoring='f1')
knn_grid.fit(X, y)

# Print the best hyperparameters and F1 score for KNN
print("Best Hyperparameters for KNN:", knn_grid.best_params_)
print("Best F1 Score for KNN:", knn_grid.best_score_)

# Define the parameter grid for Naive Bayes
nb_params = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5, 1e-3, 1e-1]
}

# Perform GridSearchCV for Naive Bayes
nb = GaussianNB()
nb_grid = GridSearchCV(nb, nb_params, cv=10, scoring='f1')
nb_grid.fit(X, y)

# Print the best hyperparameters and F1 score for Naive Bayes
print("Best Hyperparameters for Naive Bayes:", nb_grid.best_params_)
print("Best F1 Score for Naive Bayes:", nb_grid.best_score_)


Best Hyperparameters for KNN: {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
Best F1 Score for KNN: 0.7557257130860456
Best Hyperparameters for Naive Bayes: {'var_smoothing': 0.1}
Best F1 Score for Naive Bayes: 0.6763954037011807


In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Define the parameter grid for Naive Bayes
nb_params = {
    'var_smoothing': [1e-9, 1e-5, 1e-3, 1e-1, 0]
}

# Perform GridSearchCV for Naive Bayes
nb = GaussianNB()
nb_grid = GridSearchCV(nb, nb_params, cv=10, scoring='f1')
nb_grid.fit(X, y)

# Print the best hyperparameters and F1 score for Naive Bayes
print("Best Hyperparameters for Naive Bayes:", nb_grid.best_params_)
print("Best F1 Score for Naive Bayes:", nb_grid.best_score_)



from sklearn.naive_bayes import BernoulliNB

bnb_params = {
    'alpha': [0.1, 0.5, 1.0],
    'fit_prior': [True, False],
    'binarize': [0.0, 0.5, 1.0]
}

bnb = BernoulliNB()
bnb_grid = GridSearchCV(bnb, bnb_params, cv=10, scoring='f1')
bnb_grid.fit(X, y)

print("Best Hyperparameters for Bernoulli Naive Bayes:", bnb_grid.best_params_)
print("Best F1 Score for Bernoulli Naive Bayes:", bnb_grid.best_score_)

Best Hyperparameters for Naive Bayes: {'var_smoothing': 0.1}
Best F1 Score for Naive Bayes: 0.6763954037011807
Best Hyperparameters for Bernoulli Naive Bayes: {'alpha': 0.5, 'binarize': 1.0, 'fit_prior': True}
Best F1 Score for Bernoulli Naive Bayes: 0.6442755278325175
