# **Importing the database**

In [15]:
import os
import tarfile
from six.moves import urllib

# Download configuration: remote location of the dataset and where to store it
file_url = "https://raw.githubusercontent.com/octokami/PredictiveBiomeModelling/master/OlsenVeg.csv"
file_name = "OlsenVeg.csv"  # name of the CSV file
file_path = os.curdir  # "." -- the current working directory

#Import
def fetch_file_data(file_url, file_path, file_name=None):
    """Download ``file_url`` into the directory ``file_path``.

    Parameters
    ----------
    file_url : str
        URL of the file to download.
    file_path : str
        Destination directory; created (including parents) if it is missing.
    file_name : str, optional
        Name to store the download under. Defaults to the last segment of the
        URL path (e.g. ``"OlsenVeg.csv"``), so the function does not rely on
        a module-level ``file_name`` variable being defined.

    Returns
    -------
    str
        Path of the downloaded file.

    Raises
    ------
    ValueError
        If ``file_name`` is not given and cannot be derived from the URL.
    """
    if file_name is None:
        # Only the path component is used, so a query string or fragment
        # never ends up in the derived file name.
        url_path = urllib.parse.urlparse(file_url).path
        file_name = os.path.basename(url_path)
    if not file_name:
        raise ValueError("cannot derive a file name from URL: %r" % (file_url,))

    os.makedirs(file_path, exist_ok=True)
    csv_path = os.path.join(file_path, file_name)
    # An existing file at csv_path is overwritten by the fresh download.
    urllib.request.urlretrieve(file_url, csv_path)
    return csv_path
# Run the download: fetches file_url and stores it in the file_path directory
fetch_file_data(file_url, file_path)


# **Preprocessing the data**

In [16]:
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

    # --- Tunable settings ---
    # A biome is kept only if it occurs in at least this many sites
    min_sites = 10
    # Cut-off for taxa: a taxon (column) is kept only if its value is strictly
    # greater than this threshold in at least one row
    threshold = 3
    # Fraction of the samples held out as the test set
    test_size = 0.1
    # Random seed, fixed for repeatability
    seed = 42
    # Number of folds for the StratifiedKFold
    n_folds = 10


    # Load the CSV from the location it was downloaded to (file_path/file_name)
    # instead of repeating the file name as a second hard-coded literal
    df = pd.read_csv(os.path.join(file_path, file_name))

    # Keep only the biomes (BIO_N groups) that occur in at least `min_sites` sites
    df = df.groupby("BIO_N").filter(lambda group: len(group) >= min_sites)

    # A pandas Categorical takes a limited set of possible values (categories);
    # it may have an order, but numerical operations are not possible on it.
    cat = pd.Categorical(df.BIO_N)
    # Integer code of each row's biome
    y = cat.codes
    # Names of the categories (i.e. biomes); labels[code] is the name for `code`
    labels = cat.categories

    # Remove non-pollen columns: all rows, columns from positional index 9
    # (the 10th column) onwards
    pollen_only = df.iloc[:, 9:]

    # Bare values of the pollen table; the axis labels are removed
    pollen_matrix = pollen_only.values

    # Taxa filter: keep a column only if at least one row has a value strictly
    # greater than `threshold` (this is a per-value test, not a column sum)
    keep_taxa = (pollen_matrix > threshold).any(axis=0)

    # Input data (pollen) is 'x', output data (biomes) is 'y'. Boolean-mask
    # indexing returns a new array, so no separate copy is needed before
    # `x` is modified by later cells.
    x = pollen_matrix[:, keep_taxa]

    # Every sample must have exactly one label
    assert x.shape[0] == len(y), "feature matrix and labels are misaligned"

## **Data Scaling**

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature (column) of x to the range [0, 1].
# Disabled alternatives kept for reference:
#   x /= 100
#   from sklearn.preprocessing import StandardScaler
#   scaler = StandardScaler(copy=False)
# NOTE(review): the scaler is fitted on all samples, before the train/test
# split done later in the notebook, so the test rows influence the scaling --
# consider fitting on the training rows only.
scaler = MinMaxScaler(copy=False)  # copy=False: scale in place where possible
scaler.fit(x)
x = scaler.transform(x)

# **Splitting, Fitting and Predicting**

In [18]:
# Hold out `test_size` of the samples as the test set (with test_size=0.1 the
# training set is 90%).
# stratify=y preserves the class proportions of the original dataset in both
# the train and the test part; random_state=seed makes the split reproducible.
# NOTE(review): x and y are rebound to the training part here, so running this
# cell a second time would split the training set again.
x, x_test, y, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    stratify=y,
    random_state=seed,
)

## **SMOTE**

In [19]:
#%pip install imbalanced-learn
# Only needs to be run once per environment (required by the SMOTE cell below); %pip installs into the running kernel's environment.

In [20]:
#from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE to balance the classes (currently disabled)
#smt = SMOTE(random_state=seed)

#x, y = smt.fit_resample(x, y)

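Random forest hyperparameters used in the next cell:

| Parameter | Value | Meaning |
|---|---|---|
| `max_features` | `sqrt` | Number of features to consider when looking for the best split |
| `min_samples_split` | `0.007066305` | Minimum number of samples required to split an internal node (a float is interpreted as a fraction of the training samples) |
| `class_weight` | `balanced_subsample` | Weights associated with classes |
| `criterion` | `entropy` | Function measuring the quality of a split |
| `n_estimators` | `98` | Number of trees in the forest |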

In [21]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with fixed hyperparameters; random_state makes the fitted
# forest reproducible.
pipe = RandomForestClassifier(
    random_state=seed,
    max_features='sqrt',
    min_samples_split=0.007066305,
    class_weight='balanced_subsample',
    criterion='entropy',
    n_estimators=98,
)
pipe.fit(x, y)

# predict_proba returns one row per sample and one column per class; the
# columns correspond to the classes in the order of pipe.classes_.
y_proba = pipe.predict_proba(x_test)

# argmax(axis=1) yields, for each row (sample), the *column index* of the most
# probable class. That index is only equal to the class label when the labels
# happen to be exactly 0..k-1, so map it through classes_ to get the label.
y_pred = pipe.classes_[y_proba.argmax(axis=1)]


In [22]:
    from sklearn import metrics
    from functools import partial
    # Confusion matrix on the test set data
    confusion_mat = metrics.confusion_matrix(y_test, y_pred)

    # Overall agreement between predictions and true labels
    accuracy = metrics.accuracy_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    def _by_average(score_func):
        """Return (micro, macro, weighted) values of `score_func` on the test set.

        A class that is never predicted (or never present) makes its per-class
        score an undefined 0/0. zero_division=0 scores it as 0 -- the same
        value the default setting produces -- without emitting an
        UndefinedMetricWarning on every call.
        """
        return tuple(
            score_func(y_test, y_pred, average=avg, zero_division=0)
            for avg in ("micro", "macro", "weighted")
        )

    f1_micro, f1_macro, f1_weighted = _by_average(metrics.f1_score)
    precision_micro, precision_macro, precision_weighted = _by_average(metrics.precision_score)
    recall_micro, recall_macro, recall_weighted = _by_average(metrics.recall_score)

    # Order: accuracy, kappa, then (macro, micro, weighted) for F1, precision
    # and recall
    test_metrics = [
        accuracy, kappa,
        f1_macro, f1_micro, f1_weighted,
        precision_macro, precision_micro, precision_weighted,
        recall_macro, recall_micro, recall_weighted,
    ]
    test_metrics

  _warn_prf(average, modifier, msg_start, len(result))


[0.8558558558558559,
 0.8006286484059273,
 0.6807407407407406,
 0.8558558558558559,
 0.8503503503503503,
 0.6713351016799292,
 0.8558558558558559,
 0.8510096303199752,
 0.6956221198156682,
 0.8558558558558559,
 0.8558558558558559]