# Import data, raw data analysis, random forest, boosting, hyperparameter tuning, Grid Search

In [None]:
import pandas as pd
import numpy as np
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
!pip install imblearn
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import confusion_matrix
import seaborn as sns
import dalex as dx
from dataprep.eda import plot, plot_correlation, create_report, plot_missing

# Import data 
 Drop columns 
 Drop NA

In [None]:
# Load the merged frog-occurrence table from disk.
data = pd.read_csv('../Data/Frog_Orccurence_Merged.csv')

# Columns left out of the analysis (described by the author as low variance).
unused_columns = [
    'Unnamed: 0', 'swe', 'crs', 'ppt', 'srad', 'def', 'pet', 'tmax', 'tmin',
    'coord', 'min_lon', 'max_lon', 'min_lat', 'max_lat',
    'ppt_station_influence', 'tmax_station_influence',
    'tmin_station_influence', 'vap_station_influence', 'samples_count',
]

# Remove those columns, then discard every row that still has a missing value.
df = data.drop(columns=unused_columns).dropna()
data = df  # `data` and `df` name the same cleaned frame from here on

# Rename the target to 'frog_c'; pop + assign also moves it to the last column.
data['frog_c'] = data.pop('frog_count')

## Report on the raw data, including Pearson's r

 Takes about 1 minute to run 

In [None]:
# Build dataprep's exploratory report for the cleaned frame.
create_report(data)

### Labelencoder

In [None]:
# Discretise the raw counts into decile bins. duplicates='drop' removes
# repeated bin edges, so fewer than ten classes can come out.
data['frog_c'] = pd.qcut(data.frog_c, q=[0, .1,.2,.3, .4,.5, .6,.7, .8,.9, 1], duplicates='drop')
# A bare expression in the middle of a cell shows nothing, so print explicitly.
print(data['frog_c'].value_counts())  # number of samples per bin

# Map every bin to an integer class label and keep only the encoded column.
data['frog_count'] = LabelEncoder().fit_transform(data['frog_c'])
del data['frog_c']
data  # last expression of the cell, so the notebook displays the frame

### Outlier capping 

In [None]:
# Outlier capping (winsorising) for treating outliers
def outlier_capping(x, lower_q=0.01, upper_q=0.99):
    """Cap the values of ``x`` to the range spanned by two of its quantiles.

    Both bounds are computed from the unmodified input before any clipping,
    so the lower bound does not depend on the upper cap having been applied.

    Args:
        x: Object providing ``quantile`` and ``clip`` (e.g. a pandas Series,
            which is what ``DataFrame.apply`` passes for each column).
        lower_q: Quantile in [0, 1] used as the lower cap (default 0.01).
        upper_q: Quantile in [0, 1] used as the upper cap (default 0.99).

    Returns:
        A clipped copy of ``x``; the input itself is not modified.

    Raises:
        ValueError: If the quantiles are not ordered ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower=lower_bound, upper=upper_bound)

# Cap every column except the last seven and write the result back in place.
# NOTE(review): the slice leaves the final seven columns untouched, not only
# the target `frog_count` - confirm which columns those are.
capped_columns = data.iloc[:, :-7].apply(outlier_capping)
data.iloc[:, :-7] = capped_columns
sns.stripplot(data['frog_count'])  # strip plot of the encoded target

### Define the target variables and features 

In [None]:
# Separate the predictors from the target.
x = data.drop(columns=['frog_count'])  # features: every column but the target
y = data['frog_count']  # target: encoded frog-count class

### Adasyn 

In [None]:
# ADASYN oversampler. It is seeded so the synthetic samples (and every result
# derived from them) are reproducible, like the seeded split and forest.
rus = ADASYN(random_state=0)
# Fit on the predictors and target variable, then resample both.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points derived from the full data set can land in the test set - consider
# resampling the training portion only.
x_rus, y_rus = rus.fit_resample(x, y)
print('original dataset shape:', Counter(y))  # class counts before resampling
print('Resample dataset shape', Counter(y_rus))  # class counts after resampling

### Split the data

In [None]:
# Hold out 25 % of the resampled data as the test set (seeded split).
x_train, x_test, y_train, y_test = train_test_split(
    x_rus,
    y_rus,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Bar chart of the number of samples per class in the resampled target.
class_counts = y_rus.value_counts()
class_counts.plot.bar()

### Minmaxscaler 

In [None]:
# Learn the per-feature min/max on the training split only, then apply the
# same scaling to both splits.
mm_scaler = MinMaxScaler()
mm_scaler.fit(x_train)
X_train_mm = mm_scaler.transform(x_train)
X_test_mm = mm_scaler.transform(x_test)

# VIF Test

In [None]:
# Variance inflation factors of the scaled training features (with 'vap'
# already left out), used to spot multicollinearity.
F = pd.DataFrame(X_train_mm, columns=x.columns).drop(columns=['vap'])
design = F.values
vif_scores = [
    variance_inflation_factor(design, col) for col in range(design.shape[1])
]

# One row per feature: its name and its VIF.
vif_data = pd.DataFrame({"feature": F.columns, "VIF": vif_scores})

print(vif_data)

In [None]:

# Remove the 'vap' column from both scaled matrices: label the columns, drop
# 'vap', and convert back to a plain numpy array.
scaled_columns = x.columns
X_train_mm = pd.DataFrame(X_train_mm, columns=scaled_columns).drop(columns=['vap']).to_numpy()
X_test_mm = pd.DataFrame(X_test_mm, columns=scaled_columns).drop(columns=['vap']).to_numpy()

# Drop the same column from the unscaled frame as well.
data = data.drop(columns=['vap'])

# Random forest classifier

In [None]:
# Seeded forest of 1000 trees that also records an out-of-bag score.
rf = RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    random_state=123456,
)
rf.fit(X_train_mm, y_train)  # train on the scaled training split

In [None]:
# Class predictions for both splits.
PredTrainSet1 = rf.predict(X_train_mm)
predicted = rf.predict(X_test_mm)

# Fraction of correctly classified samples.
accuracyT = accuracy_score(y_train, PredTrainSet1)
accuracy = accuracy_score(y_test, predicted)

# R2 computed on the integer class labels (kept for inspection, not printed).
train_R22 = r2_score(y_train, PredTrainSet1)
validation_R22 = r2_score(y_test, predicted)

# Error measures on the test split, again treating the labels as numbers.
MSE2 = mean_squared_error(y_test, predicted)
RMSE2 = np.sqrt(MSE2)
MAE2 = mean_absolute_error(y_test, predicted)

summary_lines = (
    f'Out-of-bag score estimate: {rf.oob_score_:.3}',
    f'Test accuracy score: {accuracy:.3}',
    f'Train accuracy score: {accuracyT:.3}',
    f'MSE: {MSE2:.3}',
    f'RMSE2: {RMSE2:.3}',
    f'MAE: {MAE2:.3}',
)
for summary_line in summary_lines:
    print(summary_line)

# Confusion matrix

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
cm  # last expression of the cell, so the notebook displays it

In [None]:
# Divide every row by its total so each entry becomes a share of its row.
row_sums = cm.sum(axis=1).reshape(-1, 1)  # column vector of per-row totals
norm_conf_mx = np.true_divide(cm, row_sums)

In [None]:
# Zero the diagonal in place so only the off-diagonal entries stand out.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap="gray")  # grayscale image of the matrix
plt.show()

### Heatmap 

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)}) # Set the figure size
# Take the class labels from the data instead of hard-coding seven of them:
# qcut with duplicates='drop' can produce a different number of classes, and a
# fixed label list would then not match the shape of the matrix.
class_labels = np.union1d(y_test, predicted)
cm = pd.DataFrame(
    confusion_matrix(y_test, predicted, labels=class_labels),
    index=class_labels,    # rows: true class
    columns=class_labels,  # columns: predicted class
)
# fmt='d' prints the counts as whole numbers rather than in '.2g' notation.
sns.heatmap(cm, annot=True, fmt='d')

### Feature importance

In [None]:
# Feature importance for model
explainer = dx.Explainer(rf, X_train_mm, y_train, label='frog_count') # Define the explainer
# The model was trained on the scaled matrix without 'vap', while `x` still
# contains that column. Leave it out here so position i in this list matches
# column i of X_train_mm.
features = x.columns.drop('vap').to_list()
features # Print the features

In [None]:
# Map each column position to its feature name, to identify the features in
# the visualization below.
di = dict(enumerate(features))
di # Print the dictionary

In [None]:
# Compute the explainer's variable importance and draw its chart.
variable_importance = explainer.model_parts()
variable_importance.plot()

### Permutation importance

In [None]:

# permutation_importance is not among the notebook's imports, so bring it into
# scope here; without this the call raises NameError.
from sklearn.inspection import permutation_importance

# Permutation importance of every feature, measured on the held-out test split.
perm_importance = permutation_importance(rf, X_test_mm, y_test)

In [None]:
# Order the features from least to most important.
sorted_idx = perm_importance.importances_mean.argsort()
# Label each bar with its feature name. The model was trained without 'vap',
# so drop it from the column list to keep names and importances aligned.
# (Indexing the class predictions, as before, does not give bar labels.)
feature_names = np.asarray(x.columns.drop('vap'))
plt.barh(feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance") # Label the x-axis

In [None]:
# Text report of the per-class metrics for the test predictions.
report_text = classification_report(y_test, predicted)
print(report_text)

## Grid Search Cross validation
### Hyperparameter tuning
FYI: takes approximately 14 hours to run on a 64-core machine with 372 GB RAM

In [None]:
# Hyperparameter grid for the random forest search.
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only repeated those fits), and current scikit-learn releases
# reject it.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 700, 800, 900, 1000, 1200, 1500, 3000],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# 5-fold cross-validated grid search over param_grid.
# n_jobs=-1 spreads the candidate fits over all available cores; with the
# default (n_jobs unset) the whole search runs on a single core.
CV_rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
CV_rfc.fit(X_train_mm, y_train) # Fit the model
print(CV_rfc.best_params_) # Print the best parameters

In [None]:
def display(results):
    """Print the best parameters and the score of every tried combination.

    Args:
        results: Object exposing ``best_params_`` and a ``cv_results_``
            mapping with the keys 'mean_test_score', 'std_test_score' and
            'params' (e.g. a fitted GridSearchCV).
    """
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    cv = results.cv_results_
    score_rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    # One line per candidate: mean score, its standard deviation, its settings.
    for avg, spread, combo in score_rows:
        print(f'{round(avg,3)} + or -{round(spread,3)} for the {combo}')

In [None]:
# Print the best parameters and the per-combination scores of the grid search.
display(CV_rfc)