In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

## Read Data

In [2]:
CSV_PATH = "https://docs.google.com/spreadsheets/d/e/2PACX-1vROupcEcGFWafl16RmdNcSg7J3ZfCyD1socrrhGBwE0JBD_G7GN7r8YvYKSvyQzsxRW19MYpLkRClrU/pub?gid=0&single=true&output=csv"

In [3]:
df = pd.read_csv(CSV_PATH)

In [4]:
X = df.drop(['type', 'y'], axis=1) #[df.columns[1:10]]
y = df['y']

kf = KFold(n_splits=3, shuffle=True)

matric = 'f1_micro'

In [5]:
X.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,...,x61,x62,x63,x64,x65,x66,x67,x68,x69,x70
0,0.05,0.82,0.47,0.0,0.0,17.8,0.8,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.05,0.82,0.47,0.0,0.0,17.8,0.8,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.04,0.6,0.3,0.0,0.0,17.1,4.2,0.0,0.3,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.04,0.6,0.3,0.0,0.0,17.1,4.2,0.0,0.3,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.07,1.28,0.66,0.0,0.0,18.3,8.6,0.0,0.4,0.0,...,0,0,0,0,0,0,0,0,0,0


## Learning

In [6]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10000)

cv_results = cross_validate(model, X, y, cv=kf, scoring=matric)

#print(cv_results)
#print('F1 all :', cv_results['test_score'])
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.8632478632478633


In [8]:
# ANN

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(5,3), activation='relu', max_iter=1000)

cv_results = cross_validate(model, X, y, cv=kf, scoring=matric)

#print(cv_results)
#print('F1 all :', cv_results['test_score'])
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.3974358974358974


In [9]:
# D Tree

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

cv_results = cross_validate(model, X, y, cv=kf, scoring=matric)

#print(cv_results)
#print('F1 all :', cv_results['test_score'])
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.8547008547008547


In [10]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=101)

cv_results = cross_validate(model, X, y, cv=kf, scoring=matric)

#print(cv_results)
#print('F1 all :', cv_results['test_score'])
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.8760683760683761


In [7]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=9)

cv_results = cross_validate(model, X, y, cv=kf, scoring=matric)

#print(cv_results)
#print('F1 all :', cv_results['test_score'])
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.8504273504273504


In [17]:
tuned_knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=1, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=150, p=2,
                     weights='distance')
cv_results = cross_validate(tuned_knn, X, y, cv=kf, scoring=matric)
print('F1 : ', np.mean(cv_results['test_score']))

F1 :  0.8675213675213675


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier

CSV_PATH = "https://docs.google.com/spreadsheets/d/e/2PACX-1vROupcEcGFWafl16RmdNcSg7J3ZfCyD1socrrhGBwE0JBD_G7GN7r8YvYKSvyQzsxRW19MYpLkRClrU/pub?gid=0&single=true&output=csv"
df = pd.read_csv(CSV_PATH)
X = df.drop(['type', 'y'], axis=1)
y = df['y']

kf = KFold(n_splits=3, shuffle=True)
metric = 'f1_micro'

# Define hyperparameters to tune
hyperparameters = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

best_f1_score = 0.0
best_model = None

# Perform hyperparameter tuning
for n_estimators in hyperparameters['n_estimators']:
    for max_depth in hyperparameters['max_depth']:
        for min_samples_split in hyperparameters['min_samples_split']:
            for min_samples_leaf in hyperparameters['min_samples_leaf']:
                for max_features in hyperparameters['max_features']:
                    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                   min_samples_split=min_samples_split,
                                                   min_samples_leaf=min_samples_leaf, max_features=max_features)

                    cv_results = cross_validate(model, X, y, cv=kf, scoring=metric)
                    average_f1_score = np.mean(cv_results['test_score'])

                    if average_f1_score > best_f1_score:
                        best_f1_score = average_f1_score
                        best_model = model

# Print the best F1 score and the corresponding model
print('Best F1 Score:', best_f1_score)
print('Best Model:', best_model)


Best F1 Score: 0.9102564102564102
Best Model: RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=200)


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

CSV_PATH = "https://docs.google.com/spreadsheets/d/e/2PACX-1vROupcEcGFWafl16RmdNcSg7J3ZfCyD1socrrhGBwE0JBD_G7GN7r8YvYKSvyQzsxRW19MYpLkRClrU/pub?gid=0&single=true&output=csv"
df = pd.read_csv(CSV_PATH)
X = df.drop(['type', 'y'], axis=1)
y = df['y']

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

kf = KFold(n_splits=3, shuffle=True)
metric = 'f1_micro'

# Define hyperparameters to tune
hyperparameters = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

best_f1_score = 0.0
best_model = None

# Perform hyperparameter tuning
for n_estimators in hyperparameters['n_estimators']:
    for max_depth in hyperparameters['max_depth']:
        for min_samples_split in hyperparameters['min_samples_split']:
            for min_samples_leaf in hyperparameters['min_samples_leaf']:
                for max_features in hyperparameters['max_features']:
                    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                   min_samples_split=min_samples_split,
                                                   min_samples_leaf=min_samples_leaf, max_features=max_features)

                    cv_results = cross_validate(model, X_resampled, y_resampled, cv=kf, scoring=metric)
                    average_f1_score = np.mean(cv_results['test_score'])

                    if average_f1_score > best_f1_score:
                        best_f1_score = average_f1_score
                        best_model = model

# Print the best F1 score and the corresponding model
print('Best F1 Score:', best_f1_score)
print('Best Model:', best_model)


Best F1 Score: 0.9648363332573858
Best Model: RandomForestClassifier(max_depth=15, max_features='log2', n_estimators=400)
