In [1]:
import pandas as pd
import os 
import matplotlib.pyplot as plt
import numpy as np

from APP_Constants import APP_constants as CN
from APP_logger.app_logger import APP_Logger
from APP_utils.common_utils import reduce_memory_usage , Read_data_MONGO , profile_report_
from APP_Database_operations.mongodb_operations import MongoDB


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


import pickle

In [2]:
MONGO_DB =CN.MONGO_DB_NAME 
PROCESSED_COLLECTION=CN.MONGO_PROCESSED_COLLECTION 
Conn_obj_Processed = MongoDB( MONGO_DB , PROCESSED_COLLECTION)
processed_data_df = Read_data_MONGO(Conn_obj_Processed , Del_id = True)

Reading the data from the database
Dataframe created successfully


In [3]:
processed_data_df = reduce_memory_usage(processed_data_df)

Memory usage decreased from 0.03MB to 0.01MB (0.02MB, 68.96% reduction)


In [4]:
processed_data_df.head()

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,forest
0,-0.920196,1.20463,-0.56743,0.223385,-2.134346,-1.017462,-0.879373,-1.083713,-0.99647,-0.942348,No Fire,1.010363
1,0.779975,-1.629712,-0.213471,-0.370594,0.997411,2.819631,1.965357,1.769359,2.553524,2.629802,Fire,1.010363
2,-0.920196,1.811989,-0.92139,-0.037966,-3.043566,-1.056536,-0.879373,-1.155639,-1.023621,-0.955676,No Fire,-0.989743
3,-0.07011,1.272114,0.494449,-0.370594,0.456209,0.459546,-0.174789,-0.028795,0.245689,0.07065,Fire,1.010363
4,1.063337,-0.549962,0.848409,-0.370594,0.802579,0.373582,1.218133,1.169975,0.69368,1.190279,Fire,-0.989743


In [5]:
X = processed_data_df[CN.Selected_Features_Columns]

In [6]:
y = processed_data_df[CN.Target_Column].map(lambda x: 1 if x == 'Fire' else 0)

In [7]:
Model_logger = APP_Logger("Model_Selection")

In [8]:
Randomforest_Model = RandomForestClassifier(random_state=42)
Svc_Model = SVC(random_state=42)
Logistic_Model = LogisticRegression(random_state=42)
DecisionTree_Model  = DecisionTreeClassifier(random_state=42)
Knn_Model = KNeighborsClassifier()
GradientBoost_Model = GradientBoostingClassifier(random_state=42)

In [17]:
Randomforest_param = {}
Randomforest_param['n_estimators'] = [50, 100 ,150]
Randomforest_param['criterion'] = ['gini', 'entropy']
Randomforest_param['max_features'] = ['auto', 'sqrt', 'log2']
Randomforest_param['max_leaf_nodes'] = np.arange(2,10)
Randomforest_param['bootstrap'] = [True, False]



Svc_param = {}
Svc_param['C'] = np.arange(0.1,10 , 0.5)
Svc_param['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
Svc_param['shrinking'] = [True, False]
Svc_param['probability'] = [True, False]



Logistic_param = {}
Logistic_param['penalty'] = ['l1', 'l2']
Logistic_param['C'] = np.arange(0.1,10 ,0.5)



DecisionTree_param = {}
DecisionTree_param['criterion'] = ['gini', 'entropy']
DecisionTree_param['splitter'] = ['best', 'random']
DecisionTree_param['max_depth'] = np.arange(3,10)
DecisionTree_param['min_samples_split'] = np.arange(2,10)
DecisionTree_param['min_samples_leaf'] = np.arange(1,5)
DecisionTree_param['max_features'] = ['auto', 'sqrt', 'log2']
DecisionTree_param['max_leaf_nodes'] = np.arange(2,10)


Knn_param = {}
Knn_param['n_neighbors'] = np.arange(2,10)
Knn_param['weights'] = ['uniform', 'distance']
Knn_param['algorithm'] = ['auto', 'ball_tree', 'kd_tree', 'brute']
Knn_param['leaf_size'] = np.arange(2,10)



GradientBoost_param = {}
GradientBoost_param['learning_rate'] = np.arange(0.1,10 ,0.5)
GradientBoost_param['n_estimators'] = [50,100]
GradientBoost_param['max_features'] = ['auto', 'sqrt', 'log2']


# Create the parameter grid
Randomforest_grid = GridSearchCV(Randomforest_Model, Randomforest_param, cv=2, n_jobs=-1, scoring='accuracy' , verbose = 2)
Svc_grid = GridSearchCV(Svc_Model, Svc_param, cv=2, n_jobs=-1, scoring='accuracy', verbose = 3)
Logistic_grid = GridSearchCV(Logistic_Model, Logistic_param, cv=2, n_jobs=-1, scoring='accuracy', verbose = 2)
DecisionTree_grid = GridSearchCV(DecisionTree_Model, DecisionTree_param, cv=2, n_jobs=-1, scoring='accuracy', verbose = 2)
Knn_grid = GridSearchCV(Knn_Model, Knn_param, cv=2, n_jobs=-1, scoring='accuracy', verbose = 2)
GradientBoost_grid = GridSearchCV(GradientBoost_Model, GradientBoost_param, cv=2, n_jobs=-1, scoring='accuracy', verbose = 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
Randomforest_grid.fit(X_train, y_train)
Model_logger.info("Randomforest_grid.best_params_: {}".format(Randomforest_grid.best_params_))
Model_logger.info("Randomforest_grid.best_score_: {}".format(Randomforest_grid.best_score_))
Model_logger.info("Randomforest_grid.best_estimator_: {}".format(Randomforest_grid.best_estimator_))

Model_logger.info("Randomforest_grid.score: {}".format(Randomforest_grid.score(X_test, y_test)))


Fitting 2 folds for each of 288 candidates, totalling 576 fits


Randomforest_grid.best_params_: {'bootstrap': False, 'criterion': 'gini', 'max_features': 'auto', 'max_leaf_nodes': 2, 'n_estimators': 100}
Randomforest_grid.best_score_: 0.9870962370962371
Randomforest_grid.best_estimator_: RandomForestClassifier(bootstrap=False, max_leaf_nodes=2, random_state=42)
Randomforest_grid.score: 0.9743589743589743


In [16]:
Svc_grid.fit(X_train, y_train)
Model_logger.info("Svc_grid.best_params_: {}".format(Svc_grid.best_params_))
Model_logger.info("Svc_grid.best_score_: {}".format(Svc_grid.best_score_))
Model_logger.info("Svc_grid.best_estimator_: {}".format(Svc_grid.best_estimator_))

Model_logger.info("Svc_grid.score: {}".format(Svc_grid.score(X_test, y_test)))

Fitting 2 folds for each of 320 candidates, totalling 640 fits


Svc_grid.best_params_: {'C': 1.1, 'kernel': 'linear', 'probability': True, 'shrinking': True}
Svc_grid.best_score_: 0.9806859806859807
Svc_grid.best_estimator_: SVC(C=1.1, kernel='linear', probability=True, random_state=42)
Svc_grid.score: 0.9487179487179487


In [20]:
Logistic_grid.fit(X_train, y_train)
Model_logger.info("Logistic_grid.best_params_: {}".format(Logistic_grid.best_params_))
Model_logger.info("Logistic_grid.best_score_: {}".format(Logistic_grid.best_score_))
Model_logger.info("Logistic_grid.best_estimator_: {}".format(Logistic_grid.best_estimator_))

Model_logger.info("Logistic_grid.score: {}".format(Logistic_grid.score(X_test, y_test)))

Fitting 2 folds for each of 40 candidates, totalling 80 fits


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.96145521        nan 0.96

In [21]:
DecisionTree_grid.fit(X_train, y_train)
Model_logger.info("DecisionTree_grid.best_params_: {}".format(DecisionTree_grid.best_params_))
Model_logger.info("DecisionTree_grid.best_score_: {}".format(DecisionTree_grid.best_score_))
Model_logger.info("DecisionTree_grid.best_estimator_: {}".format(DecisionTree_grid.best_estimator_))

Model_logger.info("DecisionTree_grid.score: {}".format(DecisionTree_grid.score(X_test, y_test)))

Fitting 2 folds for each of 21504 candidates, totalling 43008 fits


DecisionTree_grid.best_params_: {'criterion': 'gini', 'max_depth': 3, 'max_features': 'auto', 'max_leaf_nodes': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
DecisionTree_grid.best_score_: 0.9742757242757243
DecisionTree_grid.best_estimator_: DecisionTreeClassifier(max_depth=3, max_features='auto', max_leaf_nodes=3,
                       random_state=42)
DecisionTree_grid.score: 0.9230769230769231


In [22]:
Knn_grid.fit(X_train, y_train)
Model_logger.info("Knn_grid.best_params_: {}".format(Knn_grid.best_params_))
Model_logger.info("Knn_grid.best_score_: {}".format(Knn_grid.best_score_))
Model_logger.info("Knn_grid.best_estimator_: {}".format(Knn_grid.best_estimator_))

Model_logger.info("Knn_grid.score: {}".format(Knn_grid.score(X_test, y_test)))

Fitting 2 folds for each of 512 candidates, totalling 1024 fits


Knn_grid.best_params_: {'algorithm': 'auto', 'leaf_size': 2, 'n_neighbors': 3, 'weights': 'uniform'}
Knn_grid.best_score_: 0.9806859806859807
Knn_grid.best_estimator_: KNeighborsClassifier(leaf_size=2, n_neighbors=3)
Knn_grid.score: 0.9487179487179487


In [18]:
GradientBoost_grid.fit(X_train, y_train)
Model_logger.info("GradientBoost_grid.best_params_: {}".format(GradientBoost_grid.best_params_))
Model_logger.info("GradientBoost_grid.best_score_: {}".format(GradientBoost_grid.best_score_))
Model_logger.info("GradientBoost_grid.best_estimator_: {}".format(GradientBoost_grid.best_estimator_))

Model_logger.info("GradientBoost_grid.score: {}".format(GradientBoost_grid.score(X_test, y_test)))

Fitting 2 folds for each of 120 candidates, totalling 240 fits


GradientBoost_grid.best_params_: {'learning_rate': 6.1, 'max_features': 'sqrt', 'n_estimators': 50}
GradientBoost_grid.best_score_: 0.9870962370962371
GradientBoost_grid.best_estimator_: GradientBoostingClassifier(learning_rate=6.1, max_features='sqrt',
                           n_estimators=50, random_state=42)
GradientBoost_grid.score: 0.9487179487179487


In [9]:
Randomforest_Model = RandomForestClassifier(bootstrap = False, criterion= 'gini', max_features = 'auto', max_leaf_nodes =  2, n_estimators =  100)

In [10]:
Randomforest_Model.fit(X, y)

RandomForestClassifier(bootstrap=False, max_leaf_nodes=2)

In [11]:
MONGO_DB =CN.MONGO_DB_NAME 
TEST_COLLECTION=CN.MONGO_TEST_COLLECTION
Conn_obj_Test = MongoDB( MONGO_DB , TEST_COLLECTION)
Test_data_df = Read_data_MONGO(Conn_obj_Test , Del_id = True)

Reading the data from the database
Dataframe created successfully


In [12]:
Test_data_df = reduce_memory_usage(Test_data_df)

Memory usage decreased from 0.01MB to 0.00MB (0.01MB, 76.54% reduction)


In [13]:
Test_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   day          49 non-null     int8    
 1   month        49 non-null     int8    
 2   year         49 non-null     int16   
 3   Temperature  49 non-null     float32 
 4   RH           49 non-null     float32 
 5   Ws           49 non-null     float32 
 6   Rain         49 non-null     float32 
 7   FFMC         49 non-null     float32 
 8   DMC          49 non-null     float32 
 9   DC           49 non-null     float32 
 10  ISI          49 non-null     float32 
 11  BUI          49 non-null     float32 
 12  FWI          49 non-null     float32 
 13  Classes      49 non-null     category
 14  forest       49 non-null     category
dtypes: category(2), float32(10), int16(1), int8(2)
memory usage: 2.6 KB


In [14]:
data =  Test_data_df.copy()

In [15]:
categorical_columns =data.select_dtypes(include=['category']).columns

In [16]:
data[categorical_columns] =  data[categorical_columns].apply(lambda x: x.cat.codes)

In [17]:
scaler = pickle.load(open('Artifacts/SCALER_DIR/2022-05-30_scaler.pkl', 'rb'))

In [18]:
data.drop(['day' , 'month' , 'year'], axis=1, inplace=True)

In [19]:
data[data.columns] = scaler.transform(data[data.columns])

In [20]:
X_test = data[CN.Selected_Features_Columns]

In [21]:
y_test = Test_data_df[CN.Target_Column]

In [22]:
Y_pred = Randomforest_Model.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
y_test = y_test.map(lambda x: 1 if x == "Fire" else 0)

In [26]:
accuracy_score(y_test, Y_pred)

1.0

In [27]:
Randomforest_Model.score(X, y)

0.9742268041237113

In [28]:
pickle.dump(Randomforest_Model, open('Artifacts/MODEL_DIR/Randomforest_Model.pkl', 'wb'))