In [None]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from matplotlib import pyplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Load the fire and non-fire sample tables and stack them into a single frame.
df1 = pd.read_csv('Fire_Data.csv')
df2 = pd.read_csv('Non_Fire_Data.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of df1 and df2 repeat, which makes any label-based lookup
# (df.loc[...]) or index-aligned assignment on df ambiguous.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Map the raw CSV column names to the display names used in the rest of the notebook.
column_renames = {
    "Power_Lines_Proximity": "Power Lines Proximity",
    "ESA_Worldcover": "LULC",
    "Forest_Density": "TCD",
    "Forest_Road_Proximity": "Forest Road Proximity",
    "Railroad_Proximity": "Railway Proximity",
    "Road_Proximity": "Road Proximity",
    "Settlement_Density": "Settlement Density",
    "Settlement_Proximity": "Settlement Proximity",
    "Tree_Ages": "Tree Ages",
    "Tree_Types": "Tree Types",
    "Water_Proximity": "Water Proximity",
    "DEM": "Elevation",
    "Fuels": "Forest Type",
}
df = df.rename(columns=column_renames)

# Label column ("Fires") as a NumPy array, the form handed to scikit-learn.
labels = np.asarray(df["Fires"])

# Columns used as model inputs; the order here is the column order of the feature matrix.
feature_cols = [
    "Aspect",
    "Elevation",
    "LULC",
    "TCD",
    "Forest Type",
    "MNDWI",
    "NDVI",
    "Power Lines Proximity",
    "Road Proximity",
    "Settlement Density",
    "Settlement Proximity",
    "Tree Types",
    "Tree Ages",
    "Forest Road Proximity",
    "Slope",
    "Water Proximity",
]

In [None]:
# Build the feature matrix / label vector and hold out 20% of the rows for testing.
labels = np.asarray(df.Fires)  # label column as a NumPy array for scikit-learn
x = df[feature_cols]           # features: the selected columns of df
y = labels                     # labels under the conventional name y
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it every re-run reports different accuracy / F1 / ROC figures.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Parameter Tuning CV

In [None]:
# Candidate hyper-parameter values sampled by the randomized search.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is no longer accepted by current scikit-learn releases (it raised an
# invalid-parameter error for every sampled candidate that used it), and for a
# classifier it meant the same as 'sqrt'; 'log2' is offered as the alternative.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = list(range(10, 111, 10)) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# The random grid: parameter name -> list of candidate values
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyper-parameters.
# The base model is a fresh, unfitted forest created here: the notebook's `clf`
# is only defined further down, so referring to it at this point fails with a
# NameError when the notebook is run top to bottom on a new kernel.
rf_base = RandomForestClassifier(random_state=42)

# Random search over 100 sampled combinations with 3-fold cross-validation,
# using all available cores (n_jobs=-1); random_state fixes which combinations
# are sampled.
clf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the training split
clf_random.fit(x_train, y_train)

In [None]:
# Show the parameter setting selected by the randomized search (clf_random) fitted above.
# Being the cell's last expression, the value is rendered as the cell output.
clf_random.best_params_

In [None]:
# 10-fold cross-validation of a 50-tree baseline forest on several metrics.

# Metric name -> scorer; each key shows up in `results` as 'test_<key>'.
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}

# KFold comes from sklearn.model_selection (it is not an attribute of a
# classifier), and random_state is only accepted together with shuffle=True.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model = RandomForestClassifier(n_estimators=50, random_state=42)

# cross_validate is the multi-metric API: cross_val_score takes a single scorer
# and rejects a dict. X is the feature frame `x` built from feature_cols.
# `results` is a dict of arrays with one entry per fold
# (fit_time, score_time, test_accuracy, test_precision, test_recall, test_f1_score).
results = cross_validate(estimator=model,
                         X=x,
                         y=labels,
                         cv=kfold,
                         scoring=scoring)

# Random Forest Classifier

In [None]:
# Final random forest with fixed hyper-parameters; random_state makes the
# fitted trees (and therefore every metric below) reproducible.
clf = RandomForestClassifier(n_estimators = 400,
                             min_samples_split= 10,
                             min_samples_leaf = 4,
                             max_features = 'sqrt',
                             max_depth = 16,
                             bootstrap = True,
                             random_state = 42,
                             )
# Fit once. The original cell called fit() twice in a row, training the
# 400-tree forest a second time for no benefit.
clf.fit(x_train, y_train)
# fit() returns the estimator itself, so `y_score` has always been the fitted
# model rather than a score array; the name is kept bound only for compatibility.
y_score = clf
predictions = clf.predict(x_test)   # predicted class per test row
prob = clf.predict_proba(x_test)    # per-class probability per test row

# Accuracy

In [None]:

#conf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
#print(conf_matrix)

# Mean accuracy of the fitted forest on the held-out test split
acc_test = clf.score(x_test, y_test)
print ("Test Accuracy:", acc_test)

# Other averaging modes accepted by f1_score: 'weighted', 'macro', 'micro'

y_true = y_test.tolist()
y_pred = clf.predict(x_test).tolist()

# Per-class F1 (average=None yields one score per class). The original cell
# computed this but discarded it: only a cell's last expression is displayed,
# so the value has to be printed explicitly to be seen.
f1_per_class = f1_score(y_true, y_pred, average=None)
print ("f1 per class:", f1_per_class)

# F1 with the default averaging; zero_division=1 sets the value used when the
# score would otherwise involve a division by zero.
f1 = f1_score(y_true, y_pred, zero_division=1)

print ("f1 Accuracy:", f1)

In [None]:
# ROC curve and AUC for each of the two classes, evaluated on the test split.
pred = clf.predict(x_test)
pred_prob = clf.predict_proba(x_test)

# Per-class curve data, keyed by class index
fpr = {}
tpr = {}
thresh = {}
roc_auc = {}

n_class = 2

for i in range(n_class):
    # Column i of pred_prob is scored with class i as the positive label
    # (assumes clf.classes_ is [0, 1] -- TODO confirm against the label column).
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=i)
    # Area under this class's curve. The original referenced an undefined
    # `roc_auc` in the legend and would have shown one value for both classes.
    roc_auc[i] = auc(fpr[i], tpr[i])

# plotting
plt.figure(dpi=150)
plt.plot(fpr[0], tpr[0], lw=1, color='green', label=f'AUC Class 0 = {roc_auc[0]:.3f}')
plt.plot(fpr[1], tpr[1], lw=1, color='orange', label=f'AUC Class 1 = {roc_auc[1]:.3f}')

plt.title(' ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.legend(loc='best')
plt.savefig('ROC Curve',dpi=300);

In [None]:
# Load the full dataset to be classified and give it the same column names as
# the training frame, so the feature_cols selection below finds every column.
big_data = pd.read_csv('Data_All.csv')
big_data = big_data.rename(columns={"Power_Lines_Proximity":"Power Lines Proximity",
                   "ESA_Worldcover":"LULC",
                   "Forest_Density":"TCD",
                   "Forest_Road_Proximity": "Forest Road Proximity",
                   "Railroad_Proximity": "Railway Proximity",
                   "Road_Proximity": "Road Proximity",
                   "Settlement_Density": "Settlement Density",
                   "Settlement_Proximity": "Settlement Proximity",
                   "Tree_Ages": "Tree Ages",
                   "Tree_Types": "Tree Types",
                   "Water_Proximity": "Water Proximity",
                   "DEM": "Elevation",
                   # This entry is present in the training rename but was missing
                   # here, although feature_cols requires "Forest Type". rename()
                   # ignores keys that are absent, so it is harmless if the CSV
                   # already uses the new name.
                   "Fuels": "Forest Type"
                  })
# Keep only the model inputs, in the column order the classifier was trained on.
big_data2 = big_data[feature_cols]

In [None]:
# Classify the full dataset with a single pass through the forest.
class_code_prob = clf.predict_proba(big_data2) # Probabilities, one column per entry of clf.classes_
# Predictions: a random forest's predicted class is the one with the highest
# mean probability, so taking the argmax of the probabilities gives the same
# result as clf.predict(big_data2) without evaluating every tree a second time.
class_code = clf.classes_[np.argmax(class_code_prob, axis=1)]

In [None]:
# Result table: one probability column per class, in the column order of class_code_prob.
prob_columns = ['Class_0_Prob', 'Class_1_Prob']
Results = pd.DataFrame(class_code_prob, columns=prob_columns)

In [None]:
# Append the predicted class and the X / Y coordinates, then export the table.
# Values are passed as plain lists, so they are placed positionally rather
# than aligned on an index.
Results = Results.assign(
    Predictions=class_code.tolist(),   # predicted class per row
    X=big_data['X'].tolist(),          # X coordinates
    Y=big_data['Y'].tolist(),          # Y coordinates
)
Results.to_csv('RF.csv')               # write the table to CSV