### Import necessary libraries

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from collections import Counter
from sklearn.model_selection import LearningCurveDisplay, learning_curve

### Load and save model

In [2]:
import pickle
# Save model
def save_model(file_name, model, directory='/kaggle/working/'):
    """Pickle ``model`` into ``directory`` under the name ``file_name``.

    Args:
        file_name: Name of the file to create, e.g. ``'RF.pkl'``.
        model: Any picklable object (in this notebook, fitted estimators).
        directory: Folder to write into. Defaults to the Kaggle working
            directory so existing two-argument calls behave as before.
    """
    import os  # local import: the notebook's import cell does not import os

    # The with-statement closes the file on exit (even on error), so no
    # explicit close() call is needed afterwards.
    with open(os.path.join(directory, file_name), 'wb') as f:
        pickle.dump(model, f)

# Load model
def load_model(file_name, directory='/kaggle/working/'):
    """Unpickle and return the object stored in ``directory``/``file_name``.

    Args:
        file_name: Name of the pickle file, e.g. ``'RF.pkl'``.
        directory: Folder to read from. Defaults to the Kaggle working
            directory so existing one-argument calls behave as before.

    Returns:
        The object that was pickled into the file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # written by save_model in a trusted environment.
    # The with-statement closes the file on exit, so no explicit close() is
    # needed.
    with open(os.path.join(directory, file_name), 'rb') as f:
        return pickle.load(f)

### Load data

In [None]:
# Read the balanced dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/balance-data/balanced.csv',
)

### Split train and test set

In [None]:
# Every column except the last is a feature; the last column is the label.
X, Y = (
    df.iloc[:, :-1].to_numpy(),
    df.iloc[:, -1].to_numpy(),
)

In [None]:
# Draw the split seed once and print it, so the exact 80/20 split (and every
# test-set metric computed from it) can be reproduced later by passing the
# printed value as random_state.
split_seed = random.randint(1, 999)
print("train/test split seed:", split_seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=split_seed
)
print(X_train.shape, X_test.shape)

### Random Forest (RF) and SVM

#### Plot learning curve function

In [None]:
def plot_learning_curve(model, X=None, y=None, cv=10, n_jobs=1):
    """Plot mean training and validation score against training-set size.

    Runs scikit-learn's ``learning_curve`` on ten training-set fractions
    from 10% to 100% and draws the mean score per fraction, with a shaded
    band of one standard deviation across the CV folds on either side.
    No ``scoring`` is passed, so the estimator's default scorer is used.

    Args:
        model: Unfitted estimator (or pipeline) to evaluate.
        X: Feature matrix. Defaults to the notebook-level ``X_train``.
        y: Labels. Defaults to the notebook-level ``Y_train``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallel job count forwarded to ``learning_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Fall back to the notebook's training split so the existing
    # one-argument calls keep working unchanged.
    if X is None:
        X = X_train
    if y is None:
        y = Y_train

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        train_sizes=np.linspace(0.1, 1.0, 10),
        n_jobs=n_jobs,
    )

    # Scores are averaged along axis 1, giving one mean and one standard
    # deviation per training size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Training curve with its +/- one-standard-deviation band.
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    # Validation curve with its +/- one-standard-deviation band.
    plt.plot(train_sizes, test_mean, color='green', marker='o', markersize=5, label='Validation Accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.title('Learning Curve')
    plt.xlabel('Training Data Size')
    plt.ylabel('Model accuracy')
    plt.grid()
    plt.legend(loc='lower right')
    plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Random forest that uses all available cores while fitting.
rfc = RandomForestClassifier(n_jobs=-1)

# Only the number of trees is searched.
parameters = {
    'n_estimators': [50, 100, 150, 200],
}

clf = GridSearchCV(rfc, parameters)
clf.fit(X_train, Y_train)
print(sorted(clf.cv_results_.keys()))
# The key names alone do not show which setting won, so also report the
# selected parameters and their mean cross-validated score.
print("Best parameters:", clf.best_params_)
print("Best mean CV score:", clf.best_score_)


In [None]:
# Random forest with default constructor arguments.
RF_clf = RandomForestClassifier()

# Draw the cross-validated learning curve first, then fit on the full
# training split; the fitted model is what gets pickled.
plot_learning_curve(model=RF_clf)
RF_clf.fit(X=X_train, y=Y_train)
save_model(file_name='RF.pkl', model=RF_clf)

In [None]:
# Evaluate the fitted random forest on the test split: per-class report
# plus a confusion matrix normalized over the true labels.
RF_clf_Y_pred = RF_clf.predict(X=X_test)
print(classification_report(y_true=Y_test, y_pred=RF_clf_Y_pred))
ConfusionMatrixDisplay.from_predictions(
    y_true=Y_test,
    y_pred=RF_clf_Y_pred,
    normalize='true',
    values_format='.2f',
    xticks_rotation='vertical',
)

In [None]:
# SVM pipeline: standardize features, project onto 20 principal components,
# then classify with an SVC using default constructor arguments.
SVM_clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=20),
    SVC(),
)

# Draw the cross-validated learning curve first, then fit on the full
# training split; the fitted pipeline is what gets pickled.
plot_learning_curve(model=SVM_clf)
SVM_clf.fit(X=X_train, y=Y_train)
save_model(file_name='SVM.pkl', model=SVM_clf)

In [None]:
# Evaluate the fitted SVM pipeline on the test split: per-class report
# plus a confusion matrix normalized over the true labels.
SVM_clf_Y_pred = SVM_clf.predict(X=X_test)
print(classification_report(y_true=Y_test, y_pred=SVM_clf_Y_pred))
ConfusionMatrixDisplay.from_predictions(
    y_true=Y_test,
    y_pred=SVM_clf_Y_pred,
    normalize='true',
    values_format='.2f',
    xticks_rotation='vertical',
)

### Cross-evaluation

In [None]:
# Pick 5 distinct part-file indices from 21..168 and report which were drawn.
random_files = random.sample(range(21, 169), 5)
print("Used files: ", random_files)

# NOTE(review): data_path, convert_to_category and high_corr_cols are not
# defined in the visible part of this notebook; they must already exist in
# the session before this cell runs.
cross_df = pd.concat([
    pd.read_csv(data_path + f'part-{j:05d}-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')
    for j in random_files
])

cross_df = convert_to_category(cross_df)

# Drop the columns listed in high_corr_cols, then split into features (all
# columns but the last) and labels (last column).
cross_df.drop(columns=high_corr_cols, inplace=True)
X_cross, Y_cross = (
    cross_df.iloc[:, :-1].to_numpy(),
    cross_df.iloc[:, -1].to_numpy(),
)


In [None]:
# Number of rows in the cross-evaluation set.
print(cross_df.shape[0])

In [None]:
# Evaluate the fitted random forest on the cross-evaluation data: per-class
# report plus a confusion matrix normalized over the true labels.
RF_clf_cross_Y_pred = RF_clf.predict(X=X_cross)
print(classification_report(y_true=Y_cross, y_pred=RF_clf_cross_Y_pred))
ConfusionMatrixDisplay.from_predictions(
    y_true=Y_cross,
    y_pred=RF_clf_cross_Y_pred,
    normalize='true',
    values_format='.2f',
    xticks_rotation='vertical',
)

In [None]:
# Evaluate the fitted SVM pipeline on the cross-evaluation data: per-class
# report plus a confusion matrix normalized over the true labels.
SVM_clf_cross_Y_pred = SVM_clf.predict(X=X_cross)
print(classification_report(y_true=Y_cross, y_pred=SVM_clf_cross_Y_pred))
ConfusionMatrixDisplay.from_predictions(
    y_true=Y_cross,
    y_pred=SVM_clf_cross_Y_pred,
    normalize='true',
    values_format='.2f',
    xticks_rotation='vertical',
)