RandomForestClassifier

In [39]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the data
train_data = np.genfromtxt('traindata.txt', delimiter=',')
train_labels = np.loadtxt('trainlabels.txt')

# Split the data into training, validation, and unseen data
train_data, unseen_data, train_labels, unseen_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Create a Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(train_data, train_labels)

# Predict the labels for the validation and unseen data
val_predictions = classifier.predict(val_data)
unseen_predictions = classifier.predict(unseen_data)

# Calculate and print the accuracy on the validation and unseen data
val_accuracy = accuracy_score(val_labels, val_predictions)
unseen_accuracy = accuracy_score(unseen_labels, unseen_predictions)
print("Validation Accuracy:", val_accuracy)
print("Unseen Data Accuracy:", unseen_accuracy)


Validation Accuracy: 0.334375
Unseen Data Accuracy: 0.3795


In [45]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Load the data
train_data = np.genfromtxt('traindata.txt', delimiter=',')
train_labels = np.loadtxt('trainlabels.txt')

# Split the data into training, validation, and unseen data
train_data, unseen_data, train_labels, unseen_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
unseen_data = scaler.transform(unseen_data)

# Remove correlated features
corr_matrix = pd.DataFrame(train_data).corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
train_data = np.delete(train_data, to_drop, axis=1)
val_data = np.delete(val_data, to_drop, axis=1)
unseen_data = np.delete(unseen_data, to_drop, axis=1)

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
train_data = selector.fit_transform(train_data)
val_data = selector.transform(val_data)
unseen_data = selector.transform(unseen_data)

# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 20, None]
# }

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [200],
    'max_depth': [20]
}

# Create a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)

# Create a grid search object
grid_search = GridSearchCV(classifier, param_grid, cv=5)

# Train the grid search object
grid_search.fit(train_data, train_labels)

# Print the best hyperparameters and their validation accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Validation Accuracy:", grid_search.best_score_)

# Create a Random Forest classifier with the best hyperparameters
best_classifier = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], 
                                          max_depth=grid_search.best_params_['max_depth'], 
                                          random_state=42)

# Train the classifier with the best hyperparameters
best_classifier.fit(train_data, train_labels)

# Predict the labels for the validation and unseen data using the best classifier
val_predictions = best_classifier.predict(val_data)
unseen_predictions = best_classifier.predict(unseen_data)

# Calculate and print the accuracy on the validation and unseen data
val_accuracy = accuracy_score(val_labels, val_predictions)
unseen_accuracy = accuracy_score(unseen_labels, unseen_predictions)
print("Validation Accuracy:", val_accuracy)
print("Unseen Data Accuracy:", unseen_accuracy)

Best Hyperparameters: {'max_depth': 50, 'n_estimators': 500}
Validation Accuracy: 0.40265625
Validation Accuracy: 0.42375
Unseen Data Accuracy: 0.4485


In [50]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Load the data
train_data = np.genfromtxt('traindata.txt', delimiter=',')
train_labels = np.loadtxt('trainlabels.txt')

# Split the data into training, validation, and unseen data
train_data, unseen_data, train_labels, unseen_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
unseen_data = scaler.transform(unseen_data)

# Remove correlated features
corr_matrix = pd.DataFrame(train_data).corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
train_data = np.delete(train_data, to_drop, axis=1)
val_data = np.delete(val_data, to_drop, axis=1)
unseen_data = np.delete(unseen_data, to_drop, axis=1)

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
train_data = selector.fit_transform(train_data)
val_data = selector.transform(val_data)
unseen_data = selector.transform(unseen_data)

# Scale the features to a specific range
# Here, we are scaling the features to the range [0, 1]
min_val = np.min(train_data, axis=0)
max_val = np.max(train_data, axis=0)
train_data = (train_data - min_val) / (max_val - min_val)
val_data = (val_data - min_val) / (max_val - min_val)
unseen_data = (unseen_data - min_val) / (max_val - min_val)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [200],
    'max_depth': [20]
}

# Create a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)

# Create a grid search object
grid_search = GridSearchCV(classifier, param_grid, cv=5)

# Train the grid search object
grid_search.fit(train_data, train_labels)

# Print the best hyperparameters and their validation accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Validation Accuracy:", grid_search.best_score_)

# Create a Random Forest classifier with the best hyperparameters
best_classifier = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], 
                                          max_depth=grid_search.best_params_['max_depth'], 
                                          random_state=42)

# Train the classifier with the best hyperparameters
best_classifier.fit(train_data, train_labels)

# Predict the labels for the validation and unseen data using the best classifier
val_predictions = best_classifier.predict(val_data)
unseen_predictions = best_classifier.predict(unseen_data)

# Calculate and print the accuracy on the validation and unseen data
val_accuracy = accuracy_score(val_labels, val_predictions)
unseen_accuracy = accuracy_score(unseen_labels, unseen_predictions)
print("Validation Accuracy:", val_accuracy)
print("Unseen Data Accuracy:", unseen_accuracy)

Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}
Validation Accuracy: 0.36265625
Validation Accuracy: 0.42875
Unseen Data Accuracy: 0.443


In [48]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Load the data
train_data = np.genfromtxt('traindata.txt', delimiter=',')
train_labels = np.loadtxt('trainlabels.txt')

# Split the data into training, validation, and unseen data
train_data, unseen_data, train_labels, unseen_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
unseen_data = scaler.transform(unseen_data)

# Remove correlated features
corr_matrix = pd.DataFrame(train_data).corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
train_data = np.delete(train_data, to_drop, axis=1)
val_data = np.delete(val_data, to_drop, axis=1)
unseen_data = np.delete(unseen_data, to_drop, axis=1)

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
train_data = selector.fit_transform(train_data)
val_data = selector.transform(val_data)
unseen_data = selector.transform(unseen_data)

# Generate polynomial features for the selected features
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data[:, [0, 2]])
val_data_poly = poly.transform(val_data[:, [0, 2]])
unseen_data_poly = poly.transform(unseen_data[:, [0, 2]])

# Add the generated polynomial features to the dataset
train_data = np.concatenate((train_data, train_data_poly), axis=1)
val_data = np.concatenate((val_data, val_data_poly), axis=1)
unseen_data = np.concatenate((unseen_data, unseen_data_poly), axis=1)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [200],
    'max_depth': [20]
}

# Create a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)

# Create a grid search object
grid_search = GridSearchCV(classifier, param_grid, cv=5)

# Train the grid search object
grid_search.fit(train_data, train_labels)

# Print the best hyperparameters and their validation accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Validation Accuracy:", grid_search.best_score_)

# Create a Random Forest classifier with the best hyperparameters
best_classifier = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], 
                                          max_depth=grid_search.best_params_['max_depth'], 
                                          random_state=42)

# Train the classifier with the best hyperparameters
best_classifier.fit(train_data, train_labels)

# Predict the labels for the validation and unseen data using the best classifier
val_predictions = best_classifier.predict(val_data)
unseen_predictions = best_classifier.predict(unseen_data)

# Calculate and print the accuracy on the validation and unseen data
val_accuracy = accuracy_score(val_labels, val_predictions)
unseen_accuracy = accuracy_score(unseen_labels, unseen_predictions)
print("Validation Accuracy:", val_accuracy)
print("Unseen Data Accuracy:", unseen_accuracy)



Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}
Validation Accuracy: 0.35484375
Validation Accuracy: 0.3825
Unseen Data Accuracy: 0.3945


Logistic Regression

In [36]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# Load the data into a pandas DataFrame
df = pd.read_csv('traindata.txt', header=None, delimiter=',')
X = df.values

# Load the labels
y = pd.read_csv('trainlabels.txt', header=None).values.ravel()

# Normalize the input data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training, validation, and unseen data
X_train, X_unseen, y_train, y_unseen = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define your model using PyTorch's nn.Module
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(in_features=X.shape[1], out_features=64)
        self.dropout1 = nn.Dropout(0.2)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.dropout2 = nn.Dropout(0.2)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc3 = nn.Linear(in_features=32, out_features=10)  

    def forward(self, x):
        x = self.bn1(torch.relu(self.fc1(x)))
        x = self.dropout1(x)
        x = self.bn2(torch.relu(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Convert the data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)
X_unseen = torch.tensor(X_unseen, dtype=torch.float32)
y_unseen = torch.tensor(y_unseen, dtype=torch.long)

# Convert the tensors into datasets
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
unseen_dataset = TensorDataset(X_unseen, y_unseen)

# Convert the datasets into dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
unseen_loader = DataLoader(unseen_dataset, batch_size=32, shuffle=False)

# Create an instance of your model
model = Classifier()

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Training loop
num_epochs = 50
no_improve = 0
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    
    val_loss /= len(val_loader.dataset)
    print(f"Validation Loss: {val_loss}")
    
    if val_loss < best_loss:
        best_loss = val_loss
        no_improve = 0
    else:
        no_improve += 1
        if no_improve == 5:
            print('Early stopping.')
            break
    scheduler.step()

# Set the model to evaluation mode
model.eval()

# Predict the labels for the validation data
with torch.no_grad():
    val_predictions = model(X_val)
    _, val_predicted_labels = torch.max(val_predictions, 1)

# Calculate and print the accuracy on the validation data
val_accuracy = accuracy_score(y_val, val_predicted_labels)
print("Validation Accuracy:", val_accuracy)

# Evaluation on unseen data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in unseen_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the unseen data: %d %%' % (100 * correct / total))

Epoch [1/50], Loss: 2.3649472630023958
Validation Loss: 2.247571759223938
Epoch [2/50], Loss: 2.2175538611412047
Validation Loss: 2.190909938812256
Epoch [3/50], Loss: 2.1573812329769133
Validation Loss: 2.1537196016311646
Epoch [4/50], Loss: 2.1097389388084413
Validation Loss: 2.116653664112091
Epoch [5/50], Loss: 2.0499336910247803
Validation Loss: 2.0933871030807496
Epoch [6/50], Loss: 2.0078112477064134
Validation Loss: 2.051489760875702
Epoch [7/50], Loss: 1.977400752902031
Validation Loss: 2.0149634504318237
Epoch [8/50], Loss: 1.9455343747138978
Validation Loss: 1.9862100958824158
Epoch [9/50], Loss: 1.910859557390213
Validation Loss: 1.954639995098114
Epoch [10/50], Loss: 1.8744046711921691
Validation Loss: 1.9266823244094848
Epoch [11/50], Loss: 1.8166216152906418
Validation Loss: 1.9193699669837951
Epoch [12/50], Loss: 1.8208249682188034
Validation Loss: 1.9141188549995423
Epoch [13/50], Loss: 1.8179168301820754
Validation Loss: 1.910422031879425
Epoch [14/50], Loss: 1.806462

NN

In [55]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

train_data = pd.read_csv('traindata.txt', header=None)
train_labels = pd.read_csv('trainlabels.txt', header=None)
print(train_data.describe())

scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', MLPClassifier())
])

# Define the hyperparameters to tune
parameters = {
    'nn__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'nn__activation': ['relu', 'tanh'],
    'nn__learning_rate_init': [0.001, 0.01, 0.1],
    'nn__alpha': [0.0001, 0.001, 0.01]
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(train_data, train_labels.values.ravel())

# Print the best hyperparameters and the accuracy
print("Best parameters: ", grid_search.best_params_)
print("Accuracy: ", grid_search.best_score_)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Load the training data and labels
train_data = pd.read_csv('traindata.txt', header=None)
train_labels = pd.read_csv('trainlabels.txt', header=None)

# Split the data into training, validation, and unseen data
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(val_data, val_labels, test_size=0.5, random_state=42)

# Scale the data
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
val_data_scaled = scaler.transform(val_data)
test_data_scaled = scaler.transform(test_data)

# Train a neural network classifier
clf = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', learning_rate_init=0.01, alpha=0.001, random_state=42)
clf.fit(train_data_scaled, train_labels.values.ravel())

# Evaluate the performance on the training, validation, and unseen data
train_acc = clf.score(train_data_scaled, train_labels)
val_acc = clf.score(val_data_scaled, val_labels)
test_acc = clf.score(test_data_scaled, test_labels)

print("Training accuracy: ", train_acc)
print("Validation accuracy: ", val_acc)
print("Test accuracy: ", test_acc)

Training accuracy:  0.9698333333333333
Validation accuracy:  0.2215
Test accuracy:  0.2475


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load the training data and labels
train_data = pd.read_csv('traindata.txt', header=None)
train_labels = pd.read_csv('trainlabels.txt', header=None)

# Split the data into training, validation, and unseen data
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(val_data, val_labels, test_size=0.5, random_state=42)

# Scale the data
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
val_data_scaled = scaler.transform(val_data)
test_data_scaled = scaler.transform(test_data)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', MLPClassifier(max_iter=10000))
])

# Define the hyperparameters to tune
parameters = {
    'nn__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (100, 50, 25)],
    'nn__activation': ['relu', 'tanh'],
    'nn__learning_rate_init': [0.001, 0.01, 0.1],
    'nn__alpha': [0.0001, 0.001, 0.01]
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(train_data, train_labels.values.ravel())

# Print the best hyperparameters and the accuracy
print("Best parameters: ", grid_search.best_params_)
print("Accuracy: ", grid_search.best_score_)

# Train a deep neural network classifier with the best hyperparameters
clf = MLPClassifier(**grid_search.best_params_, max_iter=10000, random_state=42)
clf.fit(train_data_scaled, train_labels.values.ravel())

# Evaluate the performance on the training, validation, and unseen data
train_acc = clf.score(train_data_scaled, train_labels)
val_acc = clf.score(val_data_scaled, val_labels)
test_acc = clf.score(test_data_scaled, test_labels)

print("Training accuracy: ", train_acc)
print("Validation accuracy: ", val_acc)
print("Test accuracy: ", test_acc)

Best parameters:  {'nn__activation': 'relu', 'nn__alpha': 0.01, 'nn__hidden_layer_sizes': (100, 100), 'nn__learning_rate_init': 0.01}
Accuracy:  0.30583333333333335


TypeError: MLPClassifier.__init__() got an unexpected keyword argument 'nn__activation'