In [10]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [11]:
# load the train, validation and test splits from CSV files in the working directory
train_df, valid_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "valid.csv", "test.csv")
)

In [12]:
def to_csv(predictions_old, predictions_new, reduced_X_train, label_no, max_features=256):
    """Write a per-sample report of predictions and engineered features to CSV.

    The file ``190253K_{label_no}.csv`` gets one row per entry of
    ``predictions_old`` with these columns, in order: the label predicted
    before feature engineering, the label predicted after it, the number of
    engineered features, and ``max_features`` columns named
    ``new_feature_1 .. new_feature_{max_features}``.  Feature cells that have
    no value (columns past the engineered width, or rows with no matching row
    in ``reduced_X_train``) are written as empty fields.

    Parameters
    ----------
    predictions_old : sequence
        Labels predicted before feature engineering; defines the row count.
    predictions_new : sequence
        Labels predicted after feature engineering.  Must have at least as
        many entries as ``predictions_old``; extra entries are ignored.
    reduced_X_train : 2-D array-like of numbers
        Engineered feature matrix, one row per sample.  Rows beyond the
        number of predictions are ignored.
    label_no : object
        Identifier interpolated into the output file name.
    max_features : int, default 256
        Number of ``new_feature_*`` columns in the output.

    Raises
    ------
    ValueError
        If ``predictions_new`` is shorter than ``predictions_old``, if
        ``reduced_X_train`` is not two-dimensional, or if it has more than
        ``max_features`` columns.
    """
    old_labels = list(predictions_old)
    n_rows = len(old_labels)
    new_labels = list(predictions_new)[:n_rows]
    if len(new_labels) < n_rows:
        raise ValueError(
            f"predictions_new has {len(new_labels)} entries but "
            f"predictions_old has {n_rows}"
        )

    features = np.asarray(reduced_X_train, dtype=float)
    if features.ndim != 2:
        raise ValueError("reduced_X_train must be a 2-D array of features")
    n_feature_rows, final_no_of_features = features.shape
    if final_no_of_features > max_features:
        raise ValueError(
            f"reduced_X_train has {final_no_of_features} features, "
            f"more than the {max_features} feature columns available"
        )

    # Start from an all-NaN block so every row has exactly max_features cells,
    # then copy in the features that exist.  Building the block separately from
    # the labels keeps the labels' own dtype instead of coercing them to float.
    padded = np.full((n_rows, max_features), np.nan)
    n_filled = min(n_rows, n_feature_rows)
    padded[:n_filled, :final_no_of_features] = features[:n_filled]

    label_frame = pd.DataFrame(
        {
            "Predicted labels before feature engineering": old_labels,
            "Predicted labels after feature engineering": new_labels,
            "No of new features": [final_no_of_features] * n_rows,
        }
    )
    feature_cols = [f"new_feature_{index}" for index in range(1, max_features + 1)]
    feature_frame = pd.DataFrame(padded, columns=feature_cols)

    data_frame = pd.concat([label_frame, feature_frame], axis=1)
    data_frame.to_csv(f"190253K_{label_no}.csv", na_rep='')

Label 1

In [13]:
# drop the last three columns of the labelled splits so that label_1 is the final column
train_1_df, valid_1_df = (frame.iloc[:, :-3] for frame in (train_df, valid_df))
# the test frame drops its first column instead — presumably an ID column; TODO confirm
test_1_df = test_df.iloc[:, 1:]

In [46]:
# count missing values per column of the label-1 training frame
train_1_df.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_765    0
feature_766    0
feature_767    0
feature_768    0
label_1        0
Length: 769, dtype: int64

In [47]:
# count missing values per column of the label-1 validation frame
valid_1_df.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_765    0
feature_766    0
feature_767    0
feature_768    0
label_1        0
Length: 769, dtype: int64

In [14]:
# separate the labelled splits into a feature matrix (every column but the last)
# and a target vector (the last column); every column of the test frame is a feature
X_1_train, Y_1_train = train_1_df.iloc[:, :-1].values, train_1_df.iloc[:, -1].values
X_1_valid, Y_1_valid = valid_1_df.iloc[:, :-1].values, valid_1_df.iloc[:, -1].values
X_1_test = test_1_df.values

In [15]:
# standardise the features: the scaler is fitted on the training split only and
# the same fitted transformation is then applied to the validation and test splits
scaler = StandardScaler()
X_1_train = scaler.fit_transform(X_1_train)
X_1_valid, X_1_test = (scaler.transform(split) for split in (X_1_valid, X_1_test))

In [81]:
# benchmark the candidate classifiers with 5-fold cross-validation on the training split
models = [
    RandomForestClassifier(),
    SVC(kernel='linear'),
    KNeighborsClassifier(n_neighbors=5),
]

for model in models:
    cv_score = cross_val_score(model, X_1_train, Y_1_train, cv=5)
    # average of the per-fold scores, expressed as a percentage with two decimals
    mean_accuracy = round(sum(cv_score) / len(cv_score) * 100, 2)
    print("Mean accuracy % of the model: ", model, mean_accuracy)

Mean accuracy % of the model:  RandomForestClassifier() 86.08
Mean accuracy % of the model:  SVC(kernel='linear') 92.48
Mean accuracy % of the model:  KNeighborsClassifier() 82.72


In [16]:
# Use SVC with a linear kernel since it had the highest cross-validated accuracy of the
# compared models; this baseline is trained on the full set of scaled features
model = SVC(kernel='linear')
model.fit(X_1_train, Y_1_train)

In [17]:
# predict with the baseline model on the validation and test splits,
# then print per-class precision/recall/F1 for the validation split
y_1_valid_pred, y_1_test_pred = (model.predict(split) for split in (X_1_valid, X_1_test))
print(classification_report(Y_1_valid, y_1_valid_pred))

              precision    recall  f1-score   support

           1       0.87      1.00      0.93        13
           2       1.00      0.89      0.94         9
           3       0.86      1.00      0.92        12
           4       1.00      0.94      0.97        16
           5       1.00      0.94      0.97        18
           6       1.00      0.89      0.94         9
           7       0.79      0.88      0.83        17
           8       1.00      1.00      1.00        14
           9       1.00      0.91      0.95        11
          10       0.89      1.00      0.94         8
          11       0.95      1.00      0.97        19
          12       1.00      1.00      1.00         7
          13       1.00      0.91      0.95        11
          14       0.93      0.87      0.90        15
          15       1.00      0.94      0.97        17
          16       1.00      1.00      1.00        14
          17       1.00      0.93      0.96        14
          18       0.90    

In [18]:
# Create a SelectKBest instance with the f_classif scoring function
# and keep the k = 500 highest-scoring features
k = 500
selector = SelectKBest(score_func=f_classif, k=k)

# Fit the selector on the training split, then apply the same column selection to
# the validation and test splits.
# NOTE: X_1_train / X_1_valid / X_1_test are rebound here, so the unreduced
# matrices are no longer available under these names after this cell runs.
X_1_train = selector.fit_transform(X_1_train, Y_1_train)
X_1_valid = selector.transform(X_1_valid)
X_1_test = selector.transform(X_1_test)

In [19]:
# sanity check: the second dimension should equal k after feature selection
X_1_train.shape

(28520, 500)

In [20]:
# retrain the same classifier on the selected features and re-evaluate on the validation split
y_1_pred_after = model.fit(X_1_train, Y_1_train).predict(X_1_valid)
print(classification_report(Y_1_valid, y_1_pred_after))

              precision    recall  f1-score   support

           1       0.80      0.92      0.86        13
           2       1.00      0.89      0.94         9
           3       0.79      0.92      0.85        12
           4       1.00      0.94      0.97        16
           5       0.80      0.89      0.84        18
           6       1.00      0.89      0.94         9
           7       0.79      0.88      0.83        17
           8       1.00      0.93      0.96        14
           9       1.00      0.91      0.95        11
          10       1.00      1.00      1.00         8
          11       1.00      0.95      0.97        19
          12       1.00      1.00      1.00         7
          13       1.00      0.91      0.95        11
          14       0.81      0.87      0.84        15
          15       0.94      0.94      0.94        17
          16       0.93      1.00      0.97        14
          17       0.93      0.93      0.93        14
          18       0.95    

In [21]:
# Reduce dimensionality with PCA. A fractional n_components asks scikit-learn to keep
# as many components as are needed to explain that share (85%) of the variance.
# The projection is learned once, on the training split only (fit_transform already
# fits the model, so no separate fit call is needed), and then applied unchanged to
# the validation and test splits.
pca = PCA(n_components=0.85)

x_1_train_pca = pca.fit_transform(X_1_train)
x_1_valid_pca = pca.transform(X_1_valid)
x_1_test_pca = pca.transform(X_1_test)

In [22]:
# refit the SVC on the PCA-projected training data and evaluate it on the
# PCA-projected validation split
y_1_pred_after = model.fit(x_1_train_pca, Y_1_train).predict(x_1_valid_pca)
print(classification_report(Y_1_valid, y_1_pred_after))

              precision    recall  f1-score   support

           1       0.85      0.85      0.85        13
           2       0.89      0.89      0.89         9
           3       0.71      0.83      0.77        12
           4       0.88      0.94      0.91        16
           5       0.82      0.78      0.80        18
           6       1.00      0.78      0.88         9
           7       0.75      0.88      0.81        17
           8       0.83      0.71      0.77        14
           9       1.00      0.82      0.90        11
          10       0.80      1.00      0.89         8
          11       0.90      0.95      0.92        19
          12       1.00      1.00      1.00         7
          13       0.83      0.91      0.87        11
          14       0.92      0.80      0.86        15
          15       1.00      0.76      0.87        17
          16       0.65      0.93      0.76        14
          17       0.93      0.93      0.93        14
          18       0.94    

In [23]:
# the second dimension is the number of principal components kept by PCA
x_1_train_pca.shape

(28520, 108)

In [24]:
# Hyperparameter search space for the SVC, given as a list of per-kernel grids.
# `gamma` is only varied for the RBF kernel: the linear kernel does not use it, so
# crossing it with every gamma value would just refit identical models.
# Consequence: if a linear candidate wins, best_params_ will not contain 'gamma'.
param_grid = [
    {
        'kernel': ['linear'],        # Kernel type
        'C': [0.1, 1, 10],           # Regularization parameter
    },
    {
        'kernel': ['rbf'],           # Kernel type
        'C': [0.1, 1, 10],           # Regularization parameter
        'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf' kernel
    },
]

In [25]:
# 5-fold cross-validated search over param_grid, with candidates ranked by accuracy
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)

In [26]:
# Fit the GridSearchCV instance to the PCA-projected training data
grid_search.fit(x_1_train_pca, Y_1_train)

In [27]:
# keep the winning hyperparameter combination and the corresponding estimator
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

In [28]:
# Evaluate the best model on the PCA-projected validation set
y_1_pred_after = best_model.predict(x_1_valid_pca)

print("Best Hyperparameters:", best_params)
print(classification_report(Y_1_valid, y_1_pred_after))


Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           1       0.86      0.92      0.89        13
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00        16
           5       0.94      0.89      0.91        18
           6       1.00      1.00      1.00         9
           7       0.84      0.94      0.89        17
           8       1.00      0.86      0.92        14
           9       1.00      0.91      0.95        11
          10       1.00      1.00      1.00         8
          11       1.00      0.95      0.97        19
          12       1.00      1.00      1.00         7
          13       1.00      0.82      0.90        11
          14       1.00      0.87      0.93        15
          15       1.00      0.88      0.94        17
          16       1.00      0.93      0.96        14
          17  

In [30]:
# predict labels for the PCA-projected test split with the tuned model
preds = best_model.predict(x_1_test_pca)

In [31]:
# write the test-set predictions to CSV as a label_1 column (the default row index
# is written as well); missing values, if any, are emitted as empty fields
data_frame = pd.DataFrame(preds, columns=["label_1"])
data_frame.to_csv("190253K_1.csv", na_rep='')