In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
dataset=pd.read_csv('Datasets/Algerian_Cleaned_new.csv',usecols=list(range(1,16)))

In [10]:
dataset.rename(str.strip,axis='columns',inplace=True)

In [7]:
dataset.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,Bejaia Region
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,Bejaia Region
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,Bejaia Region
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,Bejaia Region
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,Bejaia Region


#### To create a model with the imbalance data without doing any Feature Engineering,Transformations, .. etc


In [8]:
## Checking the datatypes of the data

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          244 non-null    int64  
 1   month        244 non-null    int64  
 2   year         244 non-null    int64  
 3   Temperature  244 non-null    int64  
 4    RH          244 non-null    int64  
 5    Ws          244 non-null    int64  
 6   Rain         244 non-null    float64
 7   FFMC         244 non-null    float64
 8   DMC          244 non-null    float64
 9   DC           244 non-null    float64
 10  ISI          244 non-null    float64
 11  BUI          244 non-null    float64
 12  FWI          244 non-null    float64
 13  Classes      243 non-null    object 
 14  Region       244 non-null    object 
dtypes: float64(7), int64(6), object(2)
memory usage: 28.7+ KB


In [11]:
## Mapping the target column # Classes

dataset['Classes'].unique()

array(['not fire   ', 'fire   ', 'fire', 'fire ', 'not fire', 'not fire ',
       'not fire     ', nan, 'not fire    '], dtype=object)

In [12]:
dataset['Classes'].replace({"not fire   ":0,'fire   ':1,'fire':1,'not fire':0,'not fire ':0,'not fire     ':0,'not fire    ':0,'fire ':1},inplace=True)

## Fire = 1
## not fire = 0

In [13]:
dataset['Classes'].fillna(1.0,inplace=True)

dataset['Region'].replace({'Bejaia Region':0,'Sidi-Bel Abbes Region':1},inplace=True)

In [15]:
X=dataset.copy()

In [17]:
X.drop(['Classes'],inplace=True,axis=1)

In [18]:
y=dataset['Classes']

In [19]:
X.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0


In [20]:
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
239    1.0
240    0.0
241    0.0
242    0.0
243    0.0
Name: Classes, Length: 244, dtype: float64

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=10)

In [26]:
logistic_reg=LogisticRegression()
logistic_reg

LogisticRegression()

In [27]:
logistic_reg.fit(X_train,y_train)

LogisticRegression()

In [28]:
print(logistic_reg.coef_)

[[-0.15577008  0.13315984 -0.06108222 -0.41577829 -0.05753678  0.43265809
   0.08480061  1.6050305  -0.12552375  0.02478834  0.65707369 -0.04800354
   0.82100112  0.13817025]]


In [29]:
print(logistic_reg.intercept_)

[-4.3654744e-05]


In [30]:
log_pred=logistic_reg.predict(X_test)

In [31]:
log_pred

array([0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
       1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 0.])

In [33]:
accuracy=accuracy_score(y_test,log_pred)

In [34]:
accuracy

0.9344262295081968

In [36]:
confusion_mat=confusion_matrix(y_test,log_pred)

In [37]:
confusion_mat

array([[19,  0],
       [ 4, 38]], dtype=int64)

In [39]:
## Checking Accuracy through confustion_matrix output

true_pos=confusion_mat[0][0]
fal_pos=confusion_mat[0][1]
fal_neg=confusion_mat[1][0]
tru_neg=confusion_mat[1][1]

In [40]:
accuracy=(true_pos+tru_neg)/(true_pos+fal_pos+fal_neg+tru_neg)
accuracy

0.9344262295081968

In [45]:
precision=true_pos/(true_pos+fal_pos)
precision

1.0

In [46]:
recall=true_pos/(true_pos+fal_neg)
recall

0.8260869565217391

In [47]:
# F1 Score
F1_Score = 2*(recall * precision) / (recall + precision)
F1_Score

0.9047619047619047

-- Performed logistic regression without any Feature Engineering,Feature Transformation