# Classification

## 1. Data Exploration

In [12]:
import pandas as pd
import numpy as np
import os

# File paths
data_dir = "../data"

# Sentinel value that the data files use in place of a missing measurement
MISSING_SENTINEL = 1.00000000000000e+99

# File names for training and test datasets
train_files = ["TrainData1.txt", "TrainData2.txt", "TrainData3.txt", "TrainData4.txt", "TrainData5.txt"]
train_label_files = ["TrainLabel1.txt", "TrainLabel2.txt", "TrainLabel3.txt", "TrainLabel4.txt", "TrainLabel5.txt"]
test_files = ["TestData1.txt", "TestData2.txt", "TestData3.txt", "TestData4.txt", "TestData5.txt"]

# Load each dataset into separate dictionaries, keyed by a 1-based dataset number
train_datasets = {}
train_labels = {}
test_datasets = {}

for i in range(len(train_files)):
    train_key = f"train_data_{i+1}"
    label_key = f"train_label_{i+1}"
    test_key = f"test_data_{i+1}"

    # Load training data, labels and test data as whitespace-separated tables.
    # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
    train_datasets[train_key] = pd.read_table(os.path.join(data_dir, train_files[i]), header=None, sep=r"\s+")
    train_labels[label_key] = pd.read_table(os.path.join(data_dir, train_label_files[i]), header=None, sep=r"\s+")
    test_datasets[test_key] = pd.read_table(os.path.join(data_dir, test_files[i]), header=None, sep=r"\s+")

    # Display basic information for each dataset
    print(f"Train Data {i+1} - Dimensions: {train_datasets[train_key].shape}")
    print(f"Train Label {i+1} - Dimensions: {train_labels[label_key].shape}")
    print(f"Test Data {i+1} - Dimensions: {test_datasets[test_key].shape}")

    # Count missing values: cells equal to the sentinel, summed over columns and then over the frame
    missing_count = (train_datasets[train_key] == MISSING_SENTINEL).sum().sum()
    print(f"Train Data {i+1} - Missing Values: {missing_count}")

    missing_count_test = (test_datasets[test_key] == MISSING_SENTINEL).sum().sum()
    print(f"Test Data {i+1} - Missing Values: {missing_count_test}")
    print("-" * 40)


Train Data 1 - Dimensions: (150, 3312)
Train Label 1 - Dimensions: (150, 1)
Test Data 1 - Dimensions: (53, 3312)
Train Data 1 - Missing Values: 9936
Test Data 1 - Missing Values: 7021
----------------------------------------
Train Data 2 - Dimensions: (100, 9182)
Train Label 2 - Dimensions: (100, 1)
Test Data 2 - Dimensions: (74, 9182)
Train Data 2 - Missing Values: 0
Test Data 2 - Missing Values: 0
----------------------------------------
Train Data 3 - Dimensions: (6300, 13)
Train Label 3 - Dimensions: (6300, 1)
Test Data 3 - Dimensions: (2693, 13)
Train Data 3 - Missing Values: 1886
Test Data 3 - Missing Values: 0
----------------------------------------
Train Data 4 - Dimensions: (2547, 112)
Train Label 4 - Dimensions: (2547, 1)
Test Data 4 - Dimensions: (1092, 112)
Train Data 4 - Missing Values: 0
Test Data 4 - Missing Values: 0
----------------------------------------
Train Data 5 - Dimensions: (1119, 11)
Train Label 5 - Dimensions: (1119, 1)
Test Data 5 - Dimensions: (480, 11)
T

## 2. Data Preprocessing

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Sentinel value that the data files use in place of a missing measurement
missing_sentinel = 1.00000000000000e+99

# Loop through each dataset to preprocess
for i in range(len(train_files)):
    # Fetch the datasets stored by the loading cell. Wrapping them in pd.DataFrame keeps
    # this cell re-runnable: after a first run the standardized datasets are stored as
    # NumPy arrays, which have no .replace/.fillna methods.
    train_data = pd.DataFrame(train_datasets[f"train_data_{i+1}"])
    test_data = pd.DataFrame(test_datasets[f"test_data_{i+1}"])

    # Step 1: Handle missing values (replace the sentinel with NaN).
    # replace() returns a new frame, so the previously stored frames are not mutated.
    train_data = train_data.replace(missing_sentinel, np.nan)
    test_data = test_data.replace(missing_sentinel, np.nan)

    # Fill missing values with per-column means computed on the TRAINING data only.
    # The same statistics are applied to the test data so both splits go through an
    # identical transformation (the scaler below is likewise fit on train only).
    train_means = train_data.mean()
    train_data = train_data.fillna(train_means)
    test_data = test_data.fillna(train_means)

    # Step 2: Standardize datasets with more than 100 feature columns.
    # With the shapes printed by the exploration cell this covers datasets 1, 2 and 4.
    if train_data.shape[1] > 100:
        scaler = StandardScaler()
        train_data = scaler.fit_transform(train_data)  # returns a NumPy array
        test_data = scaler.transform(test_data)

    # Save the processed data back into the datasets dictionaries
    train_datasets[f"train_data_{i+1}"] = train_data
    test_datasets[f"test_data_{i+1}"] = test_data

    # Step 3: Print the updated info
    print(f"Processed Train Data {i+1} - Dimensions: {train_data.shape}")
    print(f"Processed Test Data {i+1} - Dimensions: {test_data.shape}")
    print("-" * 40)

Processed Train Data 1 - Dimensions: (150, 3312)
Processed Test Data 1 - Dimensions: (53, 3312)
----------------------------------------
Processed Train Data 2 - Dimensions: (100, 9182)
Processed Test Data 2 - Dimensions: (74, 9182)
----------------------------------------
Processed Train Data 3 - Dimensions: (6300, 13)
Processed Test Data 3 - Dimensions: (2693, 13)
----------------------------------------
Processed Train Data 4 - Dimensions: (2547, 112)
Processed Test Data 4 - Dimensions: (1092, 112)
----------------------------------------
Processed Train Data 5 - Dimensions: (1119, 11)
Processed Test Data 5 - Dimensions: (480, 11)
----------------------------------------


## 3. Model Training

In [14]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib  # To save the trained models

# Function to select and train models
def model_selection_and_training(train_data, train_label, test_data, dataset_index, model_dir="../models"):
    """Choose a classifier from the feature count, fit it, and save it with joblib.

    Datasets with more than 100 feature columns are fitted with a linear-kernel SVC;
    all others are fitted with a 5-neighbour KNeighborsClassifier.

    Parameters
    ----------
    train_data : object with a 2-D ``.shape``
        Training features; ``shape[1]`` is the feature count used to pick the model.
    train_label : array-like
        Training labels, passed straight to ``model.fit``.
    test_data : object
        Not used by this function; kept so existing callers keep working.
    dataset_index : int
        Dataset number, used in the log messages and in the saved file name.
    model_dir : str, optional
        Directory the fitted model is written to. It is created when missing.

    Returns
    -------
    The fitted model instance.
    """
    if train_data.shape[1] > 100:  # Many features, fewer samples: Use SVM
        model = SVC(kernel='linear')  # Linear kernel for simplicity
        print(f"Training SVM for TrainData {dataset_index}...")
    else:  # Few features, many samples: Use KNN
        model = KNeighborsClassifier(n_neighbors=5)  # Using 5 neighbors for KNN
        print(f"Training KNN for TrainData {dataset_index}...")

    # Train the model
    model.fit(train_data, train_label)

    # joblib.dump does not create missing parent directories, so make sure the
    # target folder exists before writing the model file.
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, f"model_{dataset_index}.pkl"))
    print(f"Model for TrainData {dataset_index} saved.")
    print("-" * 40)

    return model

# Fit and persist one model per dataset; dataset numbers run from 1 through 5
for i in range(1, 6):
    train_data = train_datasets[f"train_data_{i}"]

    # Labels are stored as a frame; collapse its values into a flat 1-D array for fitting
    label_frame = train_labels[f"train_label_{i}"]
    train_label = label_frame.values.flatten()

    # Select, fit and save the model for this dataset
    model_selection_and_training(
        train_data,
        train_label,
        test_datasets[f"test_data_{i}"],
        i,
    )


Training SVM for TrainData 1...
Model for TrainData 1 saved.
----------------------------------------
Training SVM for TrainData 2...
Model for TrainData 2 saved.
----------------------------------------
Training KNN for TrainData 3...
Model for TrainData 3 saved.
----------------------------------------
Training SVM for TrainData 4...
Model for TrainData 4 saved.
----------------------------------------
Training KNN for TrainData 5...
Model for TrainData 5 saved.
----------------------------------------


## 4. Store Predictions

In [18]:
import pandas as pd
import numpy as np
import os
import joblib

# File paths
model_dir = "../models"
results_dir = "../results"

# np.savetxt does not create missing parent directories, so make sure the
# output folder exists before any predictions are written.
os.makedirs(results_dir, exist_ok=True)

# Loop through each dataset, load model, make predictions and save the results
for i in range(1, 6):  # Loop through datasets 1 to 5
    print(f"Making predictions for Test Data {i}...")

    # Load the trained model.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that were written by this notebook.
    model = joblib.load(os.path.join(model_dir, f"model_{i}.pkl"))

    # Get the corresponding test data
    test_data = test_datasets[f"test_data_{i}"]

    # Make predictions using the model
    predictions = model.predict(test_data)

    # Save the predictions to a file.
    # fmt='%d' writes each value as an integer — assumes class labels are integers; TODO confirm.
    results_file = os.path.join(results_dir, f"predictions_test_{i}.txt")
    np.savetxt(results_file, predictions, fmt='%d', delimiter="\n")

    print(f"Predictions for Test Data {i} saved to {results_file}")
    print("-" * 40)


Making predictions for Test Data 1...
Predictions for Test Data 1 saved to ../results/predictions_test_1.txt
----------------------------------------
Making predictions for Test Data 2...
Predictions for Test Data 2 saved to ../results/predictions_test_2.txt
----------------------------------------
Making predictions for Test Data 3...
Predictions for Test Data 3 saved to ../results/predictions_test_3.txt
----------------------------------------
Making predictions for Test Data 4...
Predictions for Test Data 4 saved to ../results/predictions_test_4.txt
----------------------------------------
Making predictions for Test Data 5...
Predictions for Test Data 5 saved to ../results/predictions_test_5.txt
----------------------------------------
