In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [2]:
# read the test and train data files
train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

Label 2

In [3]:
train_2_df = train_df.iloc[:, :-2]
valid_2_df = valid_df.iloc[:, :-2]
test_2_df = test_df.iloc[:, 1:]

train_2_df.drop(columns=["label_1"], inplace=True)
valid_2_df.drop(columns=["label_1"], inplace=True)

In [4]:
train_2_df.isna().sum()

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
              ... 
feature_765      0
feature_766      0
feature_767      0
feature_768      0
label_2        480
Length: 769, dtype: int64

In [5]:
valid_2_df.isna().sum()

feature_1       0
feature_2       0
feature_3       0
feature_4       0
feature_5       0
               ..
feature_765     0
feature_766     0
feature_767     0
feature_768     0
label_2        14
Length: 769, dtype: int64

In [29]:
# splitting the test and train datasets into X and Y values
X_2_train= train_2_df.iloc[:,0:-1].values
Y_2_train = train_2_df.iloc[:,-1].values
X_2_valid = valid_2_df.iloc[:,0:-1].values
Y_2_valid = valid_2_df.iloc[:,-1].values
X_2_test = test_2_df.iloc[:, :].values

In [30]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Y_2_train = imputer.fit_transform(Y_2_train.reshape(-1, 1)).flatten().astype(int)

In [31]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Y_2_valid = imputer.fit_transform(Y_2_valid.reshape(-1, 1)).flatten().astype(int)

In [32]:
# scalling and fitting data
scaler = StandardScaler()
scaler.fit(X_2_train)

X_2_train = scaler.transform(X_2_train)
X_2_valid = scaler.transform(X_2_valid)
X_2_test = scaler.transform(X_2_test)

In [10]:
# compare models using cross validation
models = [RandomForestClassifier(), SVC(kernel='linear'), KNeighborsClassifier(n_neighbors=5)]

for model in models:
    cv_score = cross_val_score(model, X_2_train, Y_2_train, cv=5)
    mean_accuracy = round((sum(cv_score)/len(cv_score))*100,2)
    print("Mean accuracy % of the model: ", model, mean_accuracy)

Mean accuracy % of the model:  RandomForestClassifier() 34.65
Mean accuracy % of the model:  SVC(kernel='linear') 50.39
Mean accuracy % of the model:  KNeighborsClassifier() 49.45


In [67]:
# Use SVC since it has the highest accuracy
model = SVC()
model.fit(X_2_train, Y_2_train)

In [68]:
y_2_valid_pred = model.predict(X_2_valid)
y_2_test_pred = model.predict(X_2_test)
print(classification_report(Y_2_valid, y_2_valid_pred))

              precision    recall  f1-score   support

          22       0.88      0.81      0.84        36
          23       0.92      0.80      0.86        71
          24       0.85      0.87      0.86        46
          25       0.92      0.73      0.82        79
          26       0.66      0.97      0.79       115
          27       0.64      0.80      0.71        81
          28       0.95      0.65      0.77        60
          29       0.95      0.84      0.89        45
          30       0.98      0.83      0.90        48
          31       0.83      0.85      0.84        65
          32       1.00      0.64      0.78        11
          33       1.00      0.80      0.89        30
          34       1.00      0.91      0.95        11
          35       1.00      0.91      0.95        11
          36       1.00      1.00      1.00         8
          41       0.90      0.64      0.75        14
          61       1.00      0.89      0.94        19

    accuracy              

In [69]:
# Create a SelectKBest instance with f_classif scoring function and select top 2 features
k = 250
selector = SelectKBest(score_func=f_classif, k=k)

# Fit and transform the data
X_2_best_train = selector.fit_transform(X_2_train, Y_2_train)
X_2_best_valid = selector.transform(X_2_valid)
X_2_best_test = selector.transform(X_2_test)

In [70]:
X_2_best_train.shape

(28520, 250)

In [71]:
model.fit(X_2_best_train, Y_2_train)
y_2_pred_after = model.predict(X_2_best_valid)
print(classification_report(Y_2_valid, y_2_pred_after))

              precision    recall  f1-score   support

          22       0.85      0.81      0.83        36
          23       0.86      0.79      0.82        71
          24       0.87      0.87      0.87        46
          25       0.94      0.73      0.82        79
          26       0.66      0.98      0.79       115
          27       0.64      0.78      0.70        81
          28       0.95      0.60      0.73        60
          29       0.95      0.84      0.89        45
          30       0.93      0.81      0.87        48
          31       0.82      0.83      0.82        65
          32       1.00      0.64      0.78        11
          33       1.00      0.83      0.91        30
          34       0.91      0.91      0.91        11
          35       1.00      0.91      0.95        11
          36       1.00      1.00      1.00         8
          41       0.82      0.64      0.72        14
          61       1.00      0.89      0.94        19

    accuracy              

In [72]:
pca=PCA(0.99)
pca = pca.fit(X_2_train)

x_2_train_pca=pca.fit_transform(X_2_best_train)
x_2_valid_pca = pca.transform(X_2_best_valid)
x_2_test_pca = pca.transform(X_2_best_test)

In [73]:
# Use KNeighborsClassifier
model.fit(x_2_train_pca, Y_2_train)

y_2_pred_after = model.predict(x_2_valid_pca)
print(classification_report(Y_2_valid, y_2_pred_after))

              precision    recall  f1-score   support

          22       0.82      0.78      0.80        36
          23       0.86      0.76      0.81        71
          24       0.89      0.85      0.87        46
          25       0.92      0.70      0.79        79
          26       0.66      0.97      0.79       115
          27       0.61      0.79      0.69        81
          28       0.95      0.60      0.73        60
          29       0.93      0.87      0.90        45
          30       0.91      0.81      0.86        48
          31       0.83      0.82      0.82        65
          32       1.00      0.55      0.71        11
          33       1.00      0.83      0.91        30
          34       1.00      0.91      0.95        11
          35       1.00      0.91      0.95        11
          36       1.00      1.00      1.00         8
          41       0.82      0.64      0.72        14
          61       1.00      0.89      0.94        19

    accuracy              

In [74]:
x_2_train_pca.shape

(28520, 196)

In [75]:
param_grid = {
    'C': [0.1, 1, 10],          # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']   # Kernel coefficient for 'rbf' kernel
}


In [76]:
# Create a GridSearchCV object with cross-validation (e.g., 5-fold cross-validation)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

In [77]:
# Fit the GridSearchCV instance to the training data
grid_search.fit(x_2_train_pca, Y_2_train)

In [78]:
# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Evaluate the best model on the test set
y_2_pred_after = best_model.predict(x_2_valid_pca)
preds = best_model.predict(x_2_test_pca)

print("Best Hyperparameters:", best_params)
print(classification_report(Y_2_valid, y_2_pred_after))

In [26]:
data_frame = pd.DataFrame(preds, columns=["label_2"])
data_frame.to_csv(f"190253K_2.csv",na_rep='')