______

# Top

* [Setup](#Setup)
* [Download datasets](#Download-datasets)
* [Feature selection](#Feature-selection)
    * [Above average threshold cut](#Above-average-threshold-cut)
    * [RFECV cut](#RFECV-cut)
* [Hyperparameter tuning](#Hyperparameter-tuning)
    * [HP for Random Forests](#HP-for-Random-Forests)
    * [HP for SVMs](#HP-for-SVMs)

[Bottom](#Bottom)

_____

# Setup

[Back to Top](#Top)

In [None]:
##
##/////////////////////////////////////
## Imports and Util

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, StratifiedGroupKFold, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score
from sklearn.feature_selection import RFECV, SequentialFeatureSelector

from hyperopt import hp, tpe, fmin

import time
import os

# Current working directory; the dataset paths further down are joined onto it.
base_directory=os.getcwd()
# Display names for the 7-class scheme ('Cashew' is the 5th entry). Re-assigned to the
# two binary names in the dataset cell.
class_names = ['Closed Forest', 'Open Forest', 'Mangrove', 'Savanna', 'Cashew', 'Non-Forest', 'Water']

In [None]:
##
##/////////////////////////////////////
## Auxiliary functions

def transform_y_2classes(y):
    """
    Collapse the 7-class labels into the binary cashew / non-cashew scheme.

    Entries equal to 5 (cashew) become 1, every other entry becomes 2.
    Returns the relabelled values as an integer pandas Series.
    """
    is_cashew = y == 5
    binary_labels = pd.Series(is_cashew, dtype="int")
    # after the cast the Series only holds 0/1; everything that is not cashew becomes class 2
    binary_labels.loc[binary_labels != 1] = 2
    return binary_labels

##############################################

def get_feature_importances(rf, X_train):
    """
    Build a table of feature importances, most important first.

    rf: fitted estimator exposing a feature_importances_ attribute (e.g. a Random Forest).
    X_train: DataFrame whose column names are paired positionally with the importances.
    Returns a DataFrame with the columns 'index' (row number before sorting), 'Feature' and 'Importance'.
    """
    ranking = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
    # reset_index() without drop keeps the pre-sort row number as an 'index' column
    return ranking.sort_values(by='Importance', ascending=False).reset_index()

##############################################

def categorize_feature(feature_name):
    """
    Return the family of a feature, derived from the shape of its name.

    Project features come from three sources: direct band features, spatial features (GLCM)
    and temporal features (CCDC). The name is split on "_":
      - no "_"                          -> 'band'
      - one "_" and upper-case suffix   -> 'temporal'
      - one "_" and lower-case suffix   -> 'spatial'
      - anything else                   -> 'unknown'
    """
    pieces = feature_name.split('_')
    if len(pieces) == 1:
        return 'band'
    if len(pieces) != 2:
        return 'unknown'

    suffix = pieces[1]
    if suffix.isupper():
        return 'temporal'
    if suffix.islower():
        return 'spatial'
    # suffix without cased characters (digits only, empty, ...)
    return 'unknown'

##############################################

def categorize_suffix(feature_name):
    """
    Return the suffix of a feature name, i.e. the text between its first and second "_".
    A name that contains no "_" is returned unchanged.
    """
    _, separator, remainder = feature_name.partition('_')
    if not separator:
        return feature_name
    # only the piece right after the first "_" is kept; later pieces are ignored
    return remainder.split('_')[0]

##############################################

def plot_n_top_bottom(feature_importance_df, n=100, plot_band_temporal_spatial_count=True, plot_suffix_count=True):
    """
    Show, as bar charts, which kinds of features make up the n highest and n lowest scored features.

    feature_importance_df: DataFrame with a 'Feature' (name) and an 'Importance' (score) column.
    n: number of features taken from each end of the ranking; capped at the number of rows.
    plot_band_temporal_spatial_count: draw one figure counting the families returned by categorize_feature.
    plot_suffix_count: draw one figure counting the name suffixes returned by categorize_suffix.
    Returns None; the figures are displayed with plt.show().
    """
    n = min(n, feature_importance_df.shape[0])

    def _draw_counts(categorizer, figsize, rotate_ticks):
        # work on a copy so the caller's DataFrame does not gain a 'Category' column
        table = feature_importance_df.copy(deep=True)
        table['Category'] = table['Feature'].apply(categorizer)
        ranked = table.sort_values(by='Importance', ascending=False)

        fig, axs = plt.subplots(1, 2, figsize=figsize)
        panels = (
            (ranked.head(n), 'skyblue', f'Top {n} Important Features'),
            (ranked.tail(n), 'salmon', f'Bottom {n} Important Features'),
        )
        for ax, (subset, colour, title) in zip(axs, panels):
            subset['Category'].value_counts().plot(kind='bar', ax=ax, color=colour)
            ax.set_title(title)
            if rotate_ticks:
                ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    #family of each feature: spatial, temporal, or a direct Sentinel-2 band
    if plot_band_temporal_spatial_count:
        _draw_counts(categorize_feature, (10, 5), False)

    #raw suffix of each feature name; more categories, hence the wider figure and slanted tick labels
    if plot_suffix_count:
        _draw_counts(categorize_suffix, (16, 6), True)
    return

##############################################

def find_common_and_unique_strings(list1, list2):
    """
    Compare two lists of strings.

    Returns three lists: the strings present in both inputs, those only in list1 and those only in list2.
    Duplicates are collapsed and the order of each returned list is arbitrary (set based).
    """
    first, second = set(list1), set(list2)
    return list(first & second), list(first - second), list(second - first)

In [None]:
def balanced_accuracy_scorer(estimator, X_true, y_true):
    """
    Scorer with the (estimator, X, y) signature that returns the balanced accuracy of the estimator's predictions.

    As a diagnostic, prints the predicted and the true class sets whenever they are not identical
    (e.g. a class that is never predicted). The score itself is returned unchanged in that case.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    # array_equal also handles class sets of different sizes, where an element-wise != cannot be evaluated,
    # and reports a mismatch as soon as a single class differs
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)
    return balanced_accuracy_score(y_true,y_pred)

def f1_cashew_scorer(estimator, X_true, y_true):
    """
    Scorer with the (estimator, X, y) signature that returns the F1 score of the cashew class.

    Works for the 7-class system, where cashew is class 5, and for the binary system, where cashew is class 1.
    The cashew entry is picked by position from the per-class F1 array: first entry when y_true holds exactly
    two classes, fifth entry otherwise. Returns -1 when that array has fewer than five entries
    (e.g. only one class present).

    As a diagnostic, prints the predicted and the true class sets whenever they are not identical.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    # array_equal also handles class sets of different sizes, where an element-wise != cannot be evaluated,
    # and reports a mismatch as soon as a single class differs
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)

    per_class_f1 = f1_score(y_true,y_pred,average=None)
    if true_classes.shape[0] == 2:
        return per_class_f1[0] #assumes 1st class is cashew, 2nd is non
    try:
        return per_class_f1[4] #in the 7 total classes, cashew is 5th
    except IndexError: #other number of classes, could be just 1, if cashew doesn't appear for example
        return -1

########################################################################################################

def _report_split(model, X_part, y_part, split_name, labels_cm, labels_recall, class_names, blank_line_after_metrics):
    """
    Print the metrics of an already fitted model on one split ('train' or 'test') and draw its confusion matrix.
    """
    preds = model.predict(X_part)
    print(f"Ov. Accuracy {split_name}: ", np.round(model.score(X_part, y_part), 3))
    print(f"Balanced Accuracy {split_name}: ", np.round(balanced_accuracy_scorer(model, X_part, y_part), 3))
    print(f"Recall cashew {split_name}: ", np.round(metrics.recall_score(y_part, preds, labels=labels_recall, average='micro'), 3))
    print(f"F1-score cashew {split_name}: ", np.round(f1_cashew_scorer(model, X_part, y_part), 3))
    print(f"Every F1-score {split_name}: ", np.round(f1_score(y_part, preds, average=None), 3))
    if blank_line_after_metrics:
        print("\n")
    cm = metrics.confusion_matrix(y_part, preds, labels=labels_cm)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.array(class_names))
    cm_display.plot()
    plt.xticks(rotation=45)
    plt.show()


def plot_train_test_results(model, X_train, y_train, X_test, y_test, class_names):
    """
    Fit the model on the train set, then print metrics and plot the confusion matrix for the train and test sets.

    model: sklearn-style estimator; it is (re)fitted in place on X_train / y_train.
    class_names: display names for the confusion matrix, one per label, in label order
                 (2 names for the binary system, 7 for the multi-class system).
    Returns None.
    """
    model.fit(X_train,y_train)

    #different treatment for a cashew vs. non-cashew binary classification system, and for the seven-class system
    if np.unique(y_train).shape[0] == 2:
        #binary labels are 1 (cashew) and 2 (non-cashew), as produced by transform_y_2classes
        labels_cm = np.arange(1,3)
        labels_recall = [1]
    else:
        labels_cm = np.arange(1,8)
        labels_recall = [5]

    #train set results (followed by a blank line, as before)
    _report_split(model, X_train, y_train, "train", labels_cm, labels_recall, class_names, True)
    #test set results
    _report_split(model, X_test, y_test, "test", labels_cm, labels_recall, class_names, False)

    return

########################################################################################################

def hyperopt_get_best(X_train, y_train, scorer, max_evals=30, timeout=7200, space=None, just_ints=False, estimator_class=None):
    """
    Bayesian hyperparameter optimization strategy (hyperopt, TPE algorithm).
    Selects the best HP combination based on mean 5-fold cross validation score on the training dataset.

    X_train, y_train: training data passed to cross_val_score.
    scorer: value forwarded as `scoring` to cross_val_score; higher must mean better.
    max_evals, timeout: forwarded to hyperopt's fmin (number of evaluations / time budget).
    space: hyperopt search space whose keys are constructor arguments of the estimator.
           Defaults to a Random Forest space.
    just_ints: when True, every returned value is cast with int(); only meaningful when all
               hyperparameters are integers.
    estimator_class: class instantiated with each sampled parameter set. Defaults to
                     RandomForestClassifier, so existing calls behave as before.
    Returns the dictionary produced by fmin.
    NOTE(review): per the hyperopt docs, hp.choice entries come back as the index of the chosen
    option, not its value - confirm before feeding the result to an estimator.
    """
    if estimator_class is None:
        estimator_class = RandomForestClassifier

    def objective(params):
        clf = estimator_class(**params)
        #fmin minimizes, so the mean CV score is negated
        return -np.mean(cross_val_score(clf, X_train, y_train, cv=5, n_jobs=3, scoring=scorer))

    #define feature space if not stipulated; the default is the space for the Random Forests
    if space is None:
        space = {
            'n_estimators': hp.uniformint("n_estimators", low=50, high=400, q=10),
            'max_depth': hp.uniformint("max_depth", low=1, high=15, q=3), 
            'max_features': hp.uniformint("max_features", low=3, high=250, q=7)
        }

    #run the optimizer
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                rstate=np.random.RandomState(42),
                max_evals=max_evals,
                timeout=timeout
               )

    #print the best hyperparameters
    print("Best hyperparameters:")
    print(best)

    #values are typically stored in floats; change to ints if required
    if just_ints:
        return {key: int(value) for key, value in best.items()}
    return best #returns best hyperparameter combination

In [None]:
def calculate_class_proportions(labels):
    """
    Given an array with the labels (ints), return the fraction of samples that carries each distinct label.
    The fractions follow the order in which np.unique reports the labels.
    """
    _, per_class_counts = np.unique(labels, return_counts=True)
    return per_class_counts / len(labels)

def kl_divergence(p, q):
    """
    Calculates the Kullback-Leibler (KL) divergence sum(p * log(p / q)) between two discrete distributions.

    p, q: array-likes (arrays or lists) of probabilities, broadcastable against each other.
    Entries where p is 0 contribute 0 to the sum. The result is inf when q is 0 at a position where p is positive.
    """
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float), np.asarray(q, dtype=float))
    support = p != 0
    #evaluate the log only where p is non-zero, so log(0) is never computed for the ignored entries;
    #a zero in q on the support legitimately yields inf, hence no divide warning for it
    with np.errstate(divide="ignore"):
        terms = p[support] * np.log(p[support] / q[support])
    return np.sum(terms)



########################################################################################################

def sgkf_to_holdout(sgkf,X,y,groups):
    '''
    Takes in a stratified grouped k-fold CV object, and from those k options, finds the best train-test split
    (with one fold going into testing and the remaining k-1 to training), that is, the split whose test-fold
    class proportions most resemble the global class proportions (smallest Kullback-Leibler divergence).

    Proportions are computed over the classes present in y, so a fold that lacks a class gets proportion 0
    for it (and therefore an infinite divergence) instead of a shorter, misaligned proportion vector.

    Returns: the integer of the sgkf.split() iteration that matches this best split; the first one wins ties.
             None when no fold has a finite divergence.
    '''
    #split() yields positional indices, so index a plain array rather than a (label-indexed) Series
    labels = np.asarray(y)
    classes, global_counts = np.unique(labels, return_counts=True)
    global_class_proportions = global_counts / labels.shape[0]

    min_kl_divergence = float('inf')
    selected_fold = None
    #evaluate every possible split and keep the one with the smallest divergence to the global distribution
    for i, (_, test_index) in enumerate(sgkf.split(X, y, groups)):
        test_labels = labels[test_index]
        fold_counts = np.array([np.count_nonzero(test_labels == cls) for cls in classes])
        fold_proportions = fold_counts / test_labels.shape[0]
        kl_distance = kl_divergence(global_class_proportions, fold_proportions)
        if kl_distance < min_kl_divergence:
            min_kl_divergence = kl_distance
            selected_fold = i

    return selected_fold
    
########################################################################################################

def multipleHoldouts(X,y,groups,n_iter=10):
    '''
    Runs sgkf_to_holdout once per seed 0..n_iter-1, each time on a shuffled 5-fold StratifiedGroupKFold
    created with random_state=seed, to obtain several candidate grouped stratified holdouts.
    Returns: list with, for each seed, the index of the best sgkf.split() iteration
    '''
    return [
        sgkf_to_holdout(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=seed), X, y, groups)
        for seed in range(n_iter)
    ]

########################################################################################################

def _sgh_split_scores(model, X_part, y_part, preds, labels_recall):
    """
    Return (overall accuracy, balanced accuracy, cashew recall, cashew F1) of a fitted model on one split,
    each rounded to 3 decimals.
    """
    return (
        np.round(model.score(X_part, y_part), 3),
        np.round(balanced_accuracy_scorer(model, X_part, y_part), 3),
        np.round(metrics.recall_score(y_part, preds, labels=labels_recall, average='micro'), 3),
        np.round(f1_cashew_scorer(model, X_part, y_part), 3),
    )


def performanceSGH(model, X, y, groups, class_names, n_iter=10, plot_cm=True, prints=True, X_added_al=None, y_added_al=None):
    '''
    Evaluates performance for a certain sklearn model, with the stratified grouped holdout procedure:
    for each of n_iter seeds, the holdout chosen by multipleHoldouts is rebuilt, the model is refitted on its
    train part and scored on both parts.

    X, y: pandas DataFrame / Series with the samples and their labels. groups: group id of every sample.
    class_names: display names for the confusion matrix, one per label, in label order.
    plot_cm: plot the test confusion matrix of every iteration. prints: print per-iteration metrics and summary tables.
    X_added_al, y_added_al: optional extra samples appended to every train part (never to the test part).

    Returns a dictionary with the keys ovacc_/balacc_/recall_/f1_ + train/test, each holding one value per iteration.
    '''
    best_iterations = multipleHoldouts(X,y,groups,n_iter)

    metric_keys = ("ovacc", "balacc", "recall", "f1")
    metric_titles = ("Ov. Accuracy", "Balanced Accuracy", "Recall cashew", "F1-score cashew")
    final_dict = {f"{key}_{split}": [] for split in ("train", "test") for key in metric_keys}

    for i,best_iter in enumerate(best_iterations):
        #same seed as multipleHoldouts used for iteration i, so the very same folds are regenerated
        sgkf = StratifiedGroupKFold(n_splits=5,shuffle=True,random_state=i)
        #use the selected fold as test set
        train_index, test_index = list(sgkf.split(X, y, groups))[best_iter]
        #split() yields positional indices, hence iloc
        xx_train, xx_val = X.iloc[train_index], X.iloc[test_index]
        yy_train, yy_val = y.iloc[train_index], y.iloc[test_index]

        if X_added_al is not None:
            xx_train = pd.concat([xx_train, X_added_al],ignore_index=True)
            yy_train = pd.concat([yy_train,y_added_al], ignore_index=True)

        if prints:
            print("Iteration " + str(i) + ":")

        model.fit(xx_train,yy_train)

        if np.unique(yy_train).shape[0] == 2:
            #binary labels are 1 (cashew) and 2 (non-cashew), as produced by transform_y_2classes
            labels_cm = np.arange(1,3)
            labels_recall = [1]
        else:
            labels_cm = np.arange(1,8)
            labels_recall = [5]

        #train results first, then test results
        preds = None
        for split, X_part, y_part in (("train", xx_train, yy_train), ("test", xx_val, yy_val)):
            preds = model.predict(X_part)
            scores = _sgh_split_scores(model, X_part, y_part, preds, labels_recall)
            for key, value in zip(metric_keys, scores):
                final_dict[f"{key}_{split}"].append(value)
            if prints:
                for title, value in zip(metric_titles, scores):
                    print(f"{title} {split}: ", value)
                print("\n")

        #plot confusion matrix of the test part (preds holds the test predictions after the loop)
        if plot_cm:
            cm = metrics.confusion_matrix(yy_val,preds,labels=labels_cm)
            cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm,display_labels=np.array(class_names))
            cm_display.plot()
            plt.xticks(rotation = 45)
            plt.show()

    #show tabulates: one row per iteration, one column per metric
    if prints:
        headers=["Ov. Accuracy", "Bal. Accuracy", "Recall Cashew", "F1-score Cashew"]
        for heading, split in (("Train \n", "train"), ("\n Test \n", "test")):
            print(heading)
            table = np.array([final_dict[f"{key}_{split}"] for key in metric_keys]).transpose()
            print(tabulate(np.round(table, 3), headers=headers))

    return final_dict

# Module-level 5-fold stratified group splitter with default settings.
# NOTE(review): the helpers above build their own shuffled splitters and do not read this
# global - confirm it is still needed.
sgkf = StratifiedGroupKFold(n_splits=5)

______________

# Download datasets

[Back to top.](#Top)

In [None]:
# NOTE(review): the file names are blank and these three names are plain path strings, not DataFrames;
# the .drop(...) / [...] calls below need a loading step here (e.g. a pandas reader applied to each path).
# Confirm the actual file names and format before running.
merge = os.path.join(base_directory, "") #full random sampling dataset
merged_train = os.path.join(base_directory, "") #RS train set
merged_test = os.path.join(base_directory, "") #RS test set

#entire RS dataset

X = merge.drop(columns=["C_ID_1", "x", "y", "groupID"]) #C_ID_1 equivalent to Class; groupID equivalent to polygonID
y = transform_y_2classes(merge["C_ID_1"]) #binary labels: 1 cashew, 2 non-cashew
groups = merge["groupID"]

#train and test sets, relabelled to the same binary scheme

X_train = merged_train.drop(columns=["C_ID_1","x","y"])
y_train = transform_y_2classes(merged_train["C_ID_1"])
X_test = merged_test.drop(columns=["C_ID_1","x","y"])
y_test = transform_y_2classes(merged_test["C_ID_1"])

#for visualization
class_names = ["Cashew", "Non-Cashew"]
labels_cm = np.arange(1,3)
labels_recall = [1]

_____

# Feature selection

[Back to Top](#Top)

________

## __Above average threshold cut__

Train the previous best Random Forest on the Random Sampling train set, read its `feature_importances_` and compute their average value. Keep only the features whose importance is above that average.

[Back to Top](#Top)

In [None]:
# train previous best model
# Fits the previously selected Random Forest on the train set and reports accuracy, cashew recall,
# per-class F1 and a confusion matrix, first for the train set and then for the test set.
# Relies on labels_cm, labels_recall and class_names set in the dataset cell.

rf = RandomForestClassifier(n_estimators=98, max_depth=8, max_features=19, random_state=42) #previous best model
rf.fit(X_train,y_train)

#train results
preds_train = rf.predict(X_train)
print("Accuracy train: ", np.round(rf.score(X_train,y_train),3)) #accuracy
# recall restricted to the cashew label
print("Recall cashew train: ", np.round( metrics.recall_score(y_train, preds_train, labels=labels_recall, average='micro'), 3) )
# average=None -> one F1 value per class
print("F1-score train: ", np.round(f1_score(y_train,preds_train,average=None),3))
print("\n")
cm = metrics.confusion_matrix(y_train,preds_train,labels=labels_cm)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm,display_labels=np.array(class_names))
cm_display.plot()
plt.xticks(rotation = 45)
plt.show()

#test results
preds = rf.predict(X_test)
print("Accuracy test: ", np.round(rf.score(X_test,y_test),3)) #accuracy
print("Recall cashew test: ", np.round( metrics.recall_score(y_test, preds, labels=labels_recall, average='micro'), 3) )
print("F1-score test: ", np.round(f1_score(y_test,preds,average=None),3))
cm = metrics.confusion_matrix(y_test,preds,labels=labels_cm)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm,display_labels=np.array(class_names))
cm_display.plot()
plt.xticks(rotation = 45)
plt.show()

In [None]:
# get feature importances and perform cut based on average value

#feature importances
feature_importances = rf.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df.reset_index(inplace=True)

#cut-off = 1 / number of features, derived from the data instead of a hard-coded 360
n_features = feature_importances.shape[0]
avg_importance_cutoff = 1/n_features

print("Features above 1/n_feat: ", np.unique(feature_importances>avg_importance_cutoff, return_counts=True))

#visualize ordered feature importances and average value cutoff
plt.figure(figsize=(10, 60))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.axvline(x=avg_importance_cutoff, color='r', linestyle='--')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.ylim(-1, n_features)
plt.title('ALL Feature Importances')
plt.gca().invert_yaxis() #most important feature at the top
plt.tight_layout()
plt.show()

In [None]:
# Rank the features of the fitted forest, then plot which feature families and name suffixes
# make up the 180 highest and 180 lowest ranked features.
feature_importance_cashewnon = get_feature_importances(rf, X_train)

plot_n_top_bottom(feature_importance_cashewnon, n=180)

* __Above-average method selects 67 features__

In [None]:
%%time

#results with the above-average features (67 in the original run)

#cut-off = 1 / number of features (the 1/n_feat threshold), derived from the table instead of a hard-coded 1/360
columns = feature_importance_df.loc[feature_importance_df["Importance"] > 1/feature_importance_df.shape[0]]["Feature"]
print(columns.shape)
rf = RandomForestClassifier(n_estimators=98, max_depth=8, max_features=19, random_state=42)
#20 stratified grouped holdout iterations, no plots or prints; only the results dictionary is kept
just_aboveavg_results = performanceSGH(rf, X[columns], pd.Series(y), groups, class_names, 20, plot_cm=False, prints=False)

In [None]:
%%time

#results with 360 total features
# Baseline for the comparison: the same Random Forest evaluated with performanceSGH
# (20 holdout iterations, no plots or prints) on every column of X.

columns = X.columns
print(columns.shape)
rf = RandomForestClassifier(n_estimators=98, max_depth=8, max_features=19, random_state=42)
just_every_results = performanceSGH(rf, X[columns], pd.Series(y), groups, class_names, 20, plot_cm=False, prints=False)

In [None]:
# visualization of results
# One boxplot panel per metric key returned by performanceSGH, comparing the two feature sets side by side.

results = [just_every_results, just_aboveavg_results]
results_names = ["Every feature", "Above avg importance"]

keys = list(results[0].keys())

plot_cols=2
plot_rows=(len(keys)+1)//2
plt.figure(figsize=(12,20))

for panel, key in enumerate(keys, start=1):
    #one box per feature set, built from the per-iteration values of this metric
    data = [result[key] for result in results]
    plt.subplot(plot_rows, plot_cols, panel)
    plt.boxplot(data, labels = results_names)
    plt.title(key)
plt.tight_layout()
plt.show()

_________________________

## RFECV cut

Performed with sklearn's _Recursive Feature Elimination through Cross Validation_ method.

[Back to Top](#Top)

In [None]:
%%time

def scorer(estimator, X_true, y_true):
    """
    Scorer with the (estimator, X, y) signature used by RFECV: returns the F1 score of the cashew class.

    Cashew is label 1 when y_true holds exactly two classes (binary scheme) and label 5 otherwise
    (7-class scheme). Selecting the class by label also works for the binary y_train, where the
    per-class F1 array has no fifth entry.
    Prints the predicted and the true class sets on every call as a diagnostic.
    """
    y_pred = estimator.predict(X_true)
    print(np.unique(y_pred))
    print(np.unique(y_true))
    cashew_label = 1 if np.unique(y_true).shape[0] == 2 else 5
    return f1_score(y_true,y_pred,labels=[cashew_label],average=None)[0] #gets f1-score for cashew

# Recursive feature elimination with 5-fold CV: step=10 features are dropped per iteration and
# every candidate subset is scored with `scorer`.
estimator = RandomForestClassifier(n_estimators=98, max_depth=8, max_features=19, random_state=42) #use previous best model
selector = RFECV(estimator, step=10, cv=5, scoring=scorer, verbose=1, n_jobs=3) #recursive feature elimination method
selector = selector.fit(X_train, y_train)
print(selector.support_) # mask of the selected features
print(selector.ranking_) # ranking of every feature

In [None]:
#visualize RFECV results

cv_results = selector.cv_results_
n_scores = len(cv_results["mean_test_score"])
min_features_to_select=1
#the selector removes several features per step, so the position of a score is not the number of
#features it was obtained with; use the feature counts reported by scikit-learn when available
#(recent releases expose them under "n_features") and fall back to the score position otherwise
if "n_features" in cv_results:
    n_features_axis = cv_results["n_features"]
else:
    n_features_axis = range(min_features_to_select, n_scores + min_features_to_select)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test F1-score (cashew)") #the selector was scored with the cashew F1 scorer, not accuracy
plt.errorbar(
    n_features_axis,
    cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()

__________

# Hyperparameter tuning

[Back to Top](#Top)

___

## HP for Random Forests

[Back to Top](#Top)

* __Every feature__

In [None]:
# Tune the Random Forest on every feature listed in feature_importance_df.
columns = feature_importance_df["Feature"]
# max_features is bounded above by the number of selected columns
space = {
    'n_estimators': hp.uniformint("n_estimators", low=50, high=300, q=10),
    'max_depth': hp.uniformint("max_depth", low=1, high=15, q=3), 
    'max_features': hp.uniformint("max_features", low=2, high=columns.shape[0], q=7)
}
# up to 60 evaluations or 7200 s, scored with the cashew F1; just_ints=True casts the result to ints
best_hp_every_train = hyperopt_get_best(X_train[columns], y_train, f1_cashew_scorer, max_evals=60, timeout=7200, space=space, just_ints=True)

* __Above average cutoff__

In [None]:
# Tune the Random Forest on the above-average features only.
#cut-off = 1 / number of features (the 1/n_feat threshold), derived from the table instead of a hard-coded 1/360
columns = feature_importance_df.loc[feature_importance_df["Importance"] > 1/feature_importance_df.shape[0]]["Feature"]
# max_features is bounded above by the number of selected columns
space = {
    'n_estimators': hp.uniformint("n_estimators", low=50, high=300, q=10),
    'max_depth': hp.uniformint("max_depth", low=1, high=15, q=3), 
    'max_features': hp.uniformint("max_features", low=2, high=columns.shape[0], q=7)
}
best_hp_aboveavg_train = hyperopt_get_best(X_train[columns], y_train, f1_cashew_scorer, max_evals=60, timeout=7200, space=space, just_ints=True)

* __With RFECV cutoff__

In [None]:
# NOTE(review): feature_names_rfecv is not defined in the visible cells; presumably the columns kept by the
# RFECV selector (X_train.columns[selector.support_]) - confirm where it is set.
feature_names_rfecv
X_train_cut_rfecv = X_train[feature_names_rfecv]
X_test_cut_rfecv = X_test[feature_names_rfecv]

max_evals = 60
timeout = 12600
# search space bounded by the number of RFECV-selected columns, rather than reusing the space of an earlier cell
space = {
    'n_estimators': hp.uniformint("n_estimators", low=50, high=300, q=10),
    'max_depth': hp.uniformint("max_depth", low=1, high=15, q=3),
    'max_features': hp.uniformint("max_features", low=2, high=X_train_cut_rfecv.shape[1], q=7)
}
# scorer is a required argument of hyperopt_get_best; use the cashew F1 like the other tuning cells
best_hp_rfecv = hyperopt_get_best(X_train_cut_rfecv, y_train, f1_cashew_scorer, max_evals=max_evals, timeout=timeout, space=space)

______

## HP for SVMs

[Back to Top](#Top)

In [None]:
# SVMs work much better with normalized data
# NOTE(review): get_normalized_X and norm are not defined in the visible part of the notebook -
# confirm they are defined before this cell runs.

X_rs_norm = get_normalized_X(X, norm)
X_rs_train_norm = get_normalized_X(X_train, norm)
X_rs_test_norm = get_normalized_X(X_test, norm)

* __Every feature__

In [None]:
# Tune an SVM on every (normalized) feature.
columns = feature_importance_df["Feature"]
space = {
    'C': hp.loguniform("C", low=np.log(1e-3), high=np.log(1e1)),
    'gamma': hp.loguniform("gamma", low=np.log(1e-10), high=np.log(1)),
    'kernel': hp.choice('kernel',['linear', 'poly', 'rbf']),
    'degree':hp.choice('degree',[2,3,4]),
    'class_weight': hp.choice('class_weight', [None, "balanced"])
}
# just_ints must stay False here: C and gamma are floats (mostly below 1) and an int() cast would zero them
# NOTE(review): this only works if hyperopt_get_best builds an SVC from these parameters; a
# RandomForestClassifier would reject C/gamma/kernel/degree - confirm which estimator the helper instantiates.
# NOTE(review): best_hp_every_train is also the name used for the Random Forest result - confirm the overwrite is intended.
best_hp_every_train = hyperopt_get_best(X_rs_train_norm[columns], y_train, f1_cashew_scorer, max_evals=60, timeout=7200, space=space, just_ints=False)

* __Above average cutoff__

In [None]:
# Tune an SVM on the above-average (normalized) features only.
#cut-off = 1 / number of features (the 1/n_feat threshold), derived from the table instead of a hard-coded 1/360
columns = feature_importance_df.loc[feature_importance_df["Importance"] > 1/feature_importance_df.shape[0]]["Feature"]
space = {
    'C': hp.loguniform("C", low=np.log(1e-3), high=np.log(1e1)),
    'gamma': hp.loguniform("gamma", low=np.log(1e-10), high=np.log(1)),
    'kernel': hp.choice('kernel',['linear', 'poly', 'rbf']),
    'degree':hp.choice('degree',[2,3,4]),
    'class_weight': hp.choice('class_weight', [None, "balanced"])
}
# just_ints must stay False here: C and gamma are floats (mostly below 1) and an int() cast would zero them
# NOTE(review): this only works if hyperopt_get_best builds an SVC from these parameters; a
# RandomForestClassifier would reject C/gamma/kernel/degree - confirm which estimator the helper instantiates.
# NOTE(review): best_hp_aboveavg_train is also the name used for the Random Forest result - confirm the overwrite is intended.
best_hp_aboveavg_train = hyperopt_get_best(X_rs_train_norm[columns], y_train, f1_cashew_scorer, max_evals=60, timeout=7200, space=space, just_ints=False)

* __With RFECV cutoffs__

In [None]:
# NOTE(review): feature_names_rfecv is not defined in the visible cells; presumably the columns kept by the
# RFECV selector - confirm where it is set.
feature_names_rfecv
X_train_cut_rfecv = X_rs_train_norm[feature_names_rfecv]
X_test_cut_rfecv = X_rs_test_norm[feature_names_rfecv]

max_evals = 60
timeout = 12600
# `space` is whatever search space the previously executed cell left behind
# NOTE(review): confirm it is the SVM space and that hyperopt_get_best builds an SVC from it.
# scorer is a required argument of hyperopt_get_best; use the cashew F1 like the other tuning cells
best_hp_rfecv = hyperopt_get_best(X_train_cut_rfecv, y_train, f1_cashew_scorer, max_evals=max_evals, timeout=timeout, space=space)

___

# Bottom

[Back to top](#Top)