# Classification

## 1. Data Exploration

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# File paths
data_dir = "../data"

# File names for training and test datasets
train_files = ["TrainData1.txt", "TrainData2.txt", "TrainData3.txt", "TrainData4.txt", "TrainData5.txt"]
train_label_files = ["TrainLabel1.txt", "TrainLabel2.txt", "TrainLabel3.txt", "TrainLabel4.txt", "TrainLabel5.txt"]
test_files = ["TestData1.txt", "TestData2.txt", "TestData3.txt", "TestData4.txt", "TestData5.txt"]

# Load each dataset into separate variables
train_datasets = {}
validation_datasets = {}
train_labels = {}
validation_labels = {}
test_datasets = {}

for i in range(len(train_files)):
    # Load training data and label using read_table for whitespace-separated data
    full_train_data = pd.read_table(os.path.join(data_dir, train_files[i]), header=None, delim_whitespace=True)
    full_train_label = pd.read_table(os.path.join(data_dir, train_label_files[i]), header=None, delim_whitespace=True)
    test_data = pd.read_table(os.path.join(data_dir, test_files[i]), header=None, delim_whitespace=True)
    
    # Split the training data into train and validation sets
    train_data, val_data, train_label, val_label = train_test_split(
        full_train_data, full_train_label, test_size=0.2, random_state=42
    )
    
    # Store datasets in dictionaries
    train_datasets[f"train_data_{i+1}"] = train_data
    validation_datasets[f"val_data_{i+1}"] = val_data
    train_labels[f"train_label_{i+1}"] = train_label
    validation_labels[f"val_label_{i+1}"] = val_label
    test_datasets[f"test_data_{i+1}"] = test_data
    
    # Display basic information for each dataset
    print(f"Train Data {i+1} - Dimensions: {train_data.shape}")
    print(f"Validation Data {i+1} - Dimensions: {val_data.shape}")
    print(f"Test Data {i+1} - Dimensions: {test_data.shape}")
    
    # Check for missing values in the training and validation data
    missing_count_train = (train_data == 1.00000000000000e+99).sum().sum()
    missing_count_val = (val_data == 1.00000000000000e+99).sum().sum()
    missing_count_test = (test_data == 1.00000000000000e+99).sum().sum()
    
    print(f"Train Data {i+1} - Missing Values: {missing_count_train}")
    print(f"Validation Data {i+1} - Missing Values: {missing_count_val}")
    print(f"Test Data {i+1} - Missing Values: {missing_count_test}")
    print("-" * 40)


Train Data 1 - Dimensions: (120, 3312)
Validation Data 1 - Dimensions: (30, 3312)
Test Data 1 - Dimensions: (53, 3312)
Train Data 1 - Missing Values: 8013
Validation Data 1 - Missing Values: 1923
Test Data 1 - Missing Values: 7021
----------------------------------------
Train Data 2 - Dimensions: (80, 9182)
Validation Data 2 - Dimensions: (20, 9182)
Test Data 2 - Dimensions: (74, 9182)
Train Data 2 - Missing Values: 0
Validation Data 2 - Missing Values: 0
Test Data 2 - Missing Values: 0
----------------------------------------
Train Data 3 - Dimensions: (5040, 13)
Validation Data 3 - Dimensions: (1260, 13)
Test Data 3 - Dimensions: (2693, 13)
Train Data 3 - Missing Values: 1551
Validation Data 3 - Missing Values: 335
Test Data 3 - Missing Values: 0
----------------------------------------
Train Data 4 - Dimensions: (2037, 112)
Validation Data 4 - Dimensions: (510, 112)
Test Data 4 - Dimensions: (1092, 112)
Train Data 4 - Missing Values: 0
Validation Data 4 - Missing Values: 0
Test Dat

## 2. Data Preprocessing

In [2]:
from sklearn.preprocessing import StandardScaler

# Preprocess each dataset's training and validation data
for i in range(len(train_files)):
    # Access the train and validation sets
    train_data = train_datasets[f"train_data_{i+1}"]
    val_data = validation_datasets[f"val_data_{i+1}"]
    
    # Step 1: Handle missing values (replace 1.00000000000000e+99 with NaN)
    train_data.replace(1.00000000000000e+99, np.nan, inplace=True)
    val_data.replace(1.00000000000000e+99, np.nan, inplace=True)
    
    # For numerical features, we use the mean of the training data to fill missing values
    train_data.fillna(train_data.mean(), inplace=True)
    val_data.fillna(val_data.mean(), inplace=True)  # Use means from val_data
    
    # Step 2: Standardize datasets with high dimensionality
    if train_data.shape[1] > 100:  # If the dataset has many features
        scaler = StandardScaler()
        train_data = scaler.fit_transform(train_data)
        val_data = scaler.transform(val_data)  # Use the same scaler on validation data
    
    # Save the processed data back into the datasets dictionary
    train_datasets[f"train_data_{i+1}"] = train_data
    validation_datasets[f"val_data_{i+1}"] = val_data
    
    # Step 3: Print the updated info
    print(f"Processed Train Data {i+1} - Dimensions: {train_data.shape}")
    print(f"Processed Validation Data {i+1} - Dimensions: {val_data.shape}")
    print("-" * 40)


Processed Train Data 1 - Dimensions: (120, 3312)
Processed Validation Data 1 - Dimensions: (30, 3312)
----------------------------------------
Processed Train Data 2 - Dimensions: (80, 9182)
Processed Validation Data 2 - Dimensions: (20, 9182)
----------------------------------------
Processed Train Data 3 - Dimensions: (5040, 13)
Processed Validation Data 3 - Dimensions: (1260, 13)
----------------------------------------
Processed Train Data 4 - Dimensions: (2037, 112)
Processed Validation Data 4 - Dimensions: (510, 112)
----------------------------------------
Processed Train Data 5 - Dimensions: (895, 11)
Processed Validation Data 5 - Dimensions: (224, 11)
----------------------------------------


## 3. Model Training

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # Support Vector Machine
import joblib  # To save the trained models

# Define the model mapping for each dataset
model_mapping = {
    1: KNeighborsClassifier(n_neighbors=5),  # Dataset 1: KNN
    2: RandomForestClassifier(n_estimators=100, random_state=42),  # Dataset 2: Random Forest
    3: SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),  # Dataset 3: SVM with RBF Kernel
    4: RandomForestClassifier(n_estimators=100, random_state=42),  # Dataset 4: Random Forest
    5: RandomForestClassifier(n_estimators=100, random_state=42),  # Dataset 5: Random Forest
}

# Loop through each dataset and train the respective model
for dataset_index, model in model_mapping.items():
    print(f"Training model for Dataset {dataset_index}...")
    
    # Load the corresponding training data and labels
    train_data = train_datasets[f"train_data_{dataset_index}"]
    train_label = train_labels[f"train_label_{dataset_index}"].values.flatten()
    
    # Train the model
    model.fit(train_data, train_label)
    
    # Save the trained model to a file
    joblib.dump(model, f"../models/model_{dataset_index}.pkl")
    print(f"Model for Dataset {dataset_index} saved.")
    print("-" * 40)


Training model for Dataset 1...
Model for Dataset 1 saved.
----------------------------------------
Training model for Dataset 2...
Model for Dataset 2 saved.
----------------------------------------
Training model for Dataset 3...
Model for Dataset 3 saved.
----------------------------------------
Training model for Dataset 4...
Model for Dataset 4 saved.
----------------------------------------
Training model for Dataset 5...
Model for Dataset 5 saved.
----------------------------------------


## 4. Store Predictions

In [33]:
import pandas as pd
import numpy as np
import os
import joblib

# File paths
model_dir = "../models"
results_dir = "../results"

# Loop through each dataset, load model, make predictions and save the results
for i in range(1, 6):  # Loop through datasets 1 to 5
    print(f"Making predictions for Validation Data {i}...")

    # Load the trained model
    model = joblib.load(os.path.join(model_dir, f"model_{i}.pkl"))
    
    # Get the corresponding validation data (not test data)
    validation_data = validation_datasets[f"val_data_{i}"]
    
    # Make predictions using the model
    predictions = model.predict(validation_data)
    
    # Save the predictions to a file
    predictions_file = os.path.join(results_dir, f"predictions_val_{i}.txt")
    np.savetxt(predictions_file, predictions, fmt='%d', delimiter="\n")
    
    print(f"Predictions for Validation Data {i} saved to {predictions_file}")
    print("-" * 40)


Making predictions for Validation Data 1...
Predictions for Validation Data 1 saved to ../results/predictions_val_1.txt
----------------------------------------
Making predictions for Validation Data 2...
Predictions for Validation Data 2 saved to ../results/predictions_val_2.txt
----------------------------------------
Making predictions for Validation Data 3...
Predictions for Validation Data 3 saved to ../results/predictions_val_3.txt
----------------------------------------
Making predictions for Validation Data 4...
Predictions for Validation Data 4 saved to ../results/predictions_val_4.txt
----------------------------------------
Making predictions for Validation Data 5...
Predictions for Validation Data 5 saved to ../results/predictions_val_5.txt
----------------------------------------


## 5. Model Evaluation

In [31]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# File paths
results_dir = "../results"

# Dictionary to store evaluation metrics
evaluation_metrics = {}

for i in range(1, 6):  # Loop through datasets 1 to 5
    print(f"Evaluating predictions for Validation Data {i}...")

    # Load the ground truth validation labels
    true_labels = validation_labels[f"val_label_{i}"].values.flatten()  # Flatten to ensure it's a 1D array
    
    # Load saved predictions for the validation set
    predictions_file = os.path.join(results_dir, f"predictions_val_{i}.txt")
    predictions = np.loadtxt(predictions_file, dtype=int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted', zero_division=1)
    recall = recall_score(true_labels, predictions, average='weighted', zero_division=1)
    f1 = f1_score(true_labels, predictions, average='weighted', zero_division=1)

    # Store metrics
    evaluation_metrics[f"Dataset {i}"] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

    # Print metrics
    print(f"Dataset {i} Evaluation Metrics:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print("-" * 40)

# Save the evaluation metrics
evaluation_metrics_df = pd.DataFrame(evaluation_metrics).T  # Transpose for readability
evaluation_metrics_df.to_csv(os.path.join(results_dir, "evaluation_metrics.csv"))
print("Evaluation metrics saved to evaluation_metrics.csv")


Evaluating predictions for Validation Data 1...
Dataset 1 Evaluation Metrics:
  Accuracy: 0.9000
  Precision: 0.9120
  Recall: 0.9000
  F1 Score: 0.8799
----------------------------------------
Evaluating predictions for Validation Data 2...
Dataset 2 Evaluation Metrics:
  Accuracy: 0.9500
  Precision: 1.0000
  Recall: 0.9500
  F1 Score: 0.9667
----------------------------------------
Evaluating predictions for Validation Data 3...
Dataset 3 Evaluation Metrics:
  Accuracy: 0.3532
  Precision: 0.3189
  Recall: 0.3532
  F1 Score: 0.3064
----------------------------------------
Evaluating predictions for Validation Data 4...
Dataset 4 Evaluation Metrics:
  Accuracy: 0.9451
  Precision: 0.9469
  Recall: 0.9451
  F1 Score: 0.9454
----------------------------------------
Evaluating predictions for Validation Data 5...
Dataset 5 Evaluation Metrics:
  Accuracy: 0.6696
  Precision: 0.6703
  Recall: 0.6696
  F1 Score: 0.6951
----------------------------------------
Evaluation metrics saved to ev