In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# on the Drive can be read through the local filesystem from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Directory on the mounted Google Drive that holds every CSV input.
# Keeping it in one place means a relocated dataset needs a single edit.
DATA_DIR = "/content/drive/My Drive/data"

# Full paths to the individual input files (plain strings).
file1_path = f"{DATA_DIR}/train_data.csv"    # training data
file2_path = f"{DATA_DIR}/test_data.csv"     # test data
file3_path = f"{DATA_DIR}/train_labels.csv"  # labels for the training data


In [4]:
# Load the three CSV inputs from the paths defined in the file-path cell
# (file1_path = train data, file2_path = test data, file3_path = train labels)
# instead of repeating the literal paths here.
# header=None makes pandas treat the first line of each file as data rather
# than as column names.
train_data = pd.read_csv(file1_path, header=None)
train_labels = pd.read_csv(file3_path, header=None)
test_data = pd.read_csv(file2_path, header=None)

# Fail early if the inputs do not line up: every training row needs a label,
# and the test frame must have the same number of columns as the training frame.
assert len(train_data) == len(train_labels), "train_data and train_labels row counts differ"
assert train_data.shape[1] == test_data.shape[1], "train_data and test_data column counts differ"

print('train_data.shape: {}'.format(train_data.shape))
print('train_labels.shape: {}'.format(train_labels.shape))
print('test_data.shape: {}'.format(test_data.shape))


train_data.shape: (160, 22036)
train_labels.shape: (160, 1)
test_data.shape: (40, 22036)


In [30]:
# imports:

from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [6]:
# Alias the loaded frames under the conventional feature / target names
# used by the modelling cells below.
X, y = train_data, train_labels

print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (160, 22036)
y shape:  (160, 1)


In [7]:
# Hold out 30% of the labelled rows as a validation split. The random seed is
# fixed so the same rows land in each part on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)


In [8]:
# Report the shapes produced by the train/validation split. The held-out part
# is the validation split (X_val / y_val); the separate test set lives in
# test_data, so the labels say "val" rather than "test".
print("X_train shape: ", X_train.shape)
print("X_val shape: ", X_val.shape)
print("y_train shape: ", y_train.shape)
print("y_val shape: ", y_val.shape)

X_train shape:  (112, 22036)
X_test shape:  (48, 22036)
y_train shape:  (112, 1)
y_test shape:  (48, 1)


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Project the features onto 90 principal components. The projection is learned
# from the training split only and then applied unchanged to the validation
# split.
# NOTE(review): PCA is fitted once on all of X_train, i.e. outside the
# cross-validation folds used below, so best_score_ may be slightly optimistic.
# Moving PCA into a Pipeline inside GridSearchCV would remove that, but
# n_components=90 would then probably exceed the number of rows in some
# training folds (PCA needs n_components <= n_samples) and would have to be
# lowered - confirm before changing.
pca = PCA(n_components=90)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_val)

logreg = LogisticRegression(max_iter=5000)

# Candidate values for C (inverse regularisation strength), one per decade.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
grid_search = GridSearchCV(logreg, param_grid, scoring='accuracy', cv=5)
# y_train is a single-column frame; column 0 is passed as a flat list of labels.
grid_search.fit(X_train_pca, y_train[0].tolist())

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), so best_estimator_ is already trained and needs no
# second fit() call.
best_logreg = grid_search.best_estimator_

# This score is measured on the validation split (X_val / y_val), not on
# test_data, and is labelled accordingly.
test_acc = accuracy_score(y_val, best_logreg.predict(X_test_pca))
print(f"Validation Accuracy: {test_acc:.3f}")

Best Parameters:  {'C': 0.01}
Best Score:  0.7596837944664031
Test Accuracy: 0.750


In [13]:
# Inspect the shape of the training labels. No flattening is applied in this
# cell, so y_train keeps whatever shape the train/validation split produced.
y_train.shape

(112, 1)

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

# Final model: PCA followed by a one-vs-rest logistic regression, using the
# C value selected by the grid search (C=0.01).
# NOTE(review): that C was tuned for a plain LogisticRegression on
# PCA-transformed data, not for the one-vs-rest wrapper used here - confirm it
# is still the preferred setting.

pca = PCA(n_components=90)

# Logistic regression with the tuned regularisation strength.
logreg = LogisticRegression(max_iter=5000, C=0.01)

# Chaining PCA and the classifier in one Pipeline guarantees that the same
# projection learned in fit() is applied automatically in predict().
pipeline = Pipeline([
    ('pca', pca),
    ('classifier', OneVsRestClassifier(estimator=logreg)),  # final estimator
])

# y_train is a single-column DataFrame; flatten it to a 1-D label vector so the
# target has the same form as in the grid-search cell and no implicit
# column-vector conversion is relied upon.
pipeline.fit(X_train, y_train.values.ravel())

In [43]:
# Score the fitted pipeline on the held-out validation split.
val_predictions = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print("Validation accuracy: {:.2f}".format(val_accuracy))


Validation accuracy: 0.77


In [46]:
# Use the fitted pipeline (PCA + one-vs-rest classifier) to predict a label for
# every row of test_data. The bare name on the last line displays the result.
predicted_labels = pipeline.predict(test_data)
predicted_labels

array([3, 0, 2, 0, 0, 2, 2, 3, 4, 0, 4, 3, 3, 3, 3, 3, 2, 1, 3, 4, 1, 4,
       1, 2, 2, 3, 0, 3, 0, 4, 1, 4, 1, 0, 4, 4, 2, 4, 0, 0])

In [None]:
# No ground-truth labels for the test set are loaded in this notebook (only
# train_labels exists), so a test accuracy cannot be computed here. The
# previous call, accuracy_score(test_data, predicted_labels), passed the
# feature matrix as y_true, which makes scikit-learn raise a ValueError.
# Instead, check that there is one prediction per test row and show how the
# predictions are distributed over the classes. The validation accuracy
# computed earlier remains the available estimate of generalisation.
assert len(predicted_labels) == len(test_data), "expected exactly one prediction per test row"

predicted_label_counts = (
    pd.Series(predicted_labels, name="predicted_label")
    .value_counts()
    .sort_index()
)
predicted_label_counts