## Finding the best model and hyperparameters

In [4]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd

def loadObservations(file_name, n_features=19):
    """Load an observations CSV and split it into features and class labels.

    Args:
        file_name: Path (or any other source accepted by ``pandas.read_csv``)
            of the observations file.
        n_features: Number of leading columns to keep as features. Defaults
            to 19, the value that used to be hard-coded here.

    Returns:
        A ``(X, Y)`` tuple: ``X`` is a DataFrame with the first
        ``n_features`` columns, ``Y`` is a Series with the last column.
    """
    observations = pd.read_csv(file_name)
    # The class label is read from the last column.
    Y = observations.iloc[:, -1]
    # Keep only the leading feature columns. This is meant to drop the
    # lon/lat columns (assumes they follow the features -- confirm against
    # the CSV layout).
    X = observations.iloc[:, :n_features]
    return X, Y

In [5]:
# Search configuration, keyed by a short model name. Each entry holds:
#   'model'   - the unfitted estimator handed to the hyperparameter search,
#   'params'  - the candidate values for each hyperparameter,
#   'process' - whether the tuning loop should evaluate this model.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [10, 20],
            'kernel': ['rbf', 'linear'],
        },
        'process': True,
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 20, 40, 80, 160],
            'criterion': ['gini', 'entropy', 'log_loss'],
            # 'auto' is intentionally not listed: for classifiers it was an
            # alias of 'sqrt', and it is rejected by scikit-learn >= 1.3.
            'max_features': ['sqrt', 'log2'],
        },
        # Currently excluded from the tuning run.
        'process': False,
    },
    'logistic_regression': {
        # High iteration cap so the solvers are given room to converge.
        'model': LogisticRegression(max_iter=35000),
        'params': {
            'C': [5, 10],
            'solver': ['liblinear', 'lbfgs'],
        },
        'process': True,
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        # Nothing to tune: the model is evaluated with its defaults.
        'params': {},
        'process': True,
    },
    'nearest_neighbor': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [5, 10],
            'weights': ['uniform', 'distance'],
        },
        'process': True,
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            # 'auto' is intentionally not listed: for classifiers it was an
            # alias of 'sqrt', and it is rejected by scikit-learn >= 1.3.
            'max_features': ['sqrt', 'log2'],
        },
        'process': True,
    },
}

# Data sets to evaluate: display label -> CSV file with the observations.
observations_data = dict(
    Balanced="combined_joro_v2.csv",
    Imbalanced="combined_joro_imbalanced.csv",
)

In [None]:
# Tuning results: one record per (data set, model) combination.
scores = []

for obs_name, file_name in observations_data.items():
    print(obs_name, file_name)
    X, Y = loadObservations(file_name)
    for model_name, mp in model_params.items():
        # Skip models that are switched off in the configuration.
        if not mp['process']:
            continue
        print(model_name)
        # NOTE: n_iter=2 samples only two candidate settings per model and no
        # random_state is given, so the reported best parameters can differ
        # between runs.
        clf = RandomizedSearchCV(
            mp['model'],
            mp['params'],
            cv=5,
            n_iter=2,
            return_train_score=False,
        )
        clf.fit(X, Y)
        scores.append({
            'obs_name': obs_name,
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_,
        })

# Summary table; the bare expression on the last line displays it in the notebook.
df = pd.DataFrame(scores, columns=['obs_name', 'model', 'best_score', 'best_params'])
df