In [75]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

label_encoder = LabelEncoder()

train_data = pd.read_csv('new_train_data.csv', index_col=0)
test_data = pd.read_csv('new_test_data.csv', index_col=0)

X_train = train_data.drop(columns=['Label'])
y_train = label_encoder.fit_transform(train_data['Label'])

X_test = test_data.drop(columns=['Label'])
y_test = label_encoder.fit_transform(test_data['Label'])


In [7]:
print(X_train.shape)
print(X_test.shape)

(195516, 11)
(58469, 11)


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x


In [65]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        if x.dim() == 2:
            x = x.unsqueeze(1)
            h0 = h0.squeeze(1)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out


In [53]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)



In [70]:
model = BinaryClassifier(11, 64, 5)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

num_epochs = 100
for epoch in range(num_epochs):
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.5259
Epoch [2/100], Loss: 0.5787
Epoch [3/100], Loss: 0.3615
Epoch [4/100], Loss: 0.4332
Epoch [5/100], Loss: 0.5633
Epoch [6/100], Loss: 0.4691
Epoch [7/100], Loss: 0.5043
Epoch [8/100], Loss: 0.4207
Epoch [9/100], Loss: 0.5097
Epoch [10/100], Loss: 0.4858
Epoch [11/100], Loss: 0.5513
Epoch [12/100], Loss: 0.3902
Epoch [13/100], Loss: 0.5192


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()

with torch.no_grad():
    outputs = model(X_test_tensor)
    predicted = (outputs > 0.5).float()
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()

print(f'Accuracy: {correct / total:.4f}')

# Convert the tensors to NumPy arrays for the metrics calculation
y_true = y_test_tensor.numpy()
y_pred = predicted.numpy()

# Calculate precision, recall, and F1 score for the positive class (label 1)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred, zero_division=1))

In [73]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set the parameters for the XGBoost model
params = {
    'max_depth': 5,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}

# Train the XGBoost model
num_round = 200
bst = xgb.train(params, dtrain, num_round)

# Make predictions
y_pred = bst.predict(dtest)
y_pred = (y_pred > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8011
Confusion Matrix:
[[41953  4651]
 [ 6976  4889]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88     46604
           1       0.51      0.41      0.46     11865

    accuracy                           0.80     58469
   macro avg       0.68      0.66      0.67     58469
weighted avg       0.79      0.80      0.79     58469



In [50]:
svm_clf = SVC(kernel='rbf')
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)


print("Support Vector Machine (SVM):")
print("Accuracy:", acc_svm)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm, zero_division=1))

Support Vector Machine (SVM):
Accuracy: 0.7940618105320768
Confusion Matrix:
 [[45856   748]
 [11293   572]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.98      0.88     46604
           1       0.43      0.05      0.09     11865

    accuracy                           0.79     58469
   macro avg       0.62      0.52      0.49     58469
weighted avg       0.73      0.79      0.72     58469



In [51]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, zero_division=1))

Logistic Regression:
Accuracy: 0.7731105372077511
Confusion Matrix:
 [[43200  3404]
 [ 9862  2003]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.93      0.87     46604
           1       0.37      0.17      0.23     11865

    accuracy                           0.77     58469
   macro avg       0.59      0.55      0.55     58469
weighted avg       0.72      0.77      0.74     58469



In [48]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("k-Nearest Neighbors (k-NN):")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn, zero_division=1))

k-Nearest Neighbors (k-NN):
Accuracy: 0.7437445483931656
Confusion Matrix:
 [[38278  8326]
 [ 6657  5208]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.84     46604
           1       0.38      0.44      0.41     11865

    accuracy                           0.74     58469
   macro avg       0.62      0.63      0.62     58469
weighted avg       0.76      0.74      0.75     58469



In [76]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'random_state': [42],
}

# Create the Random Forest Classifier
rf = RandomForestClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Make predictions using the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 288 candidates, totalling 864 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, random_state=42; total time=   4.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, random_state=42; total time=   4.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, random_state=42; total time=   4.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42; total time=  19.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42; total time=  19.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42; total time=  19.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, random_state=42; total time=