### 1.  Processing Data

In [37]:
# import necessary libraries

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [38]:
# read two regions data and combine them into one

df1 = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv', header = 1, nrows=122)
df2 = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv', header = 1, skiprows = 125)

# strip extra spaces in columns' names
df1.columns = df1.columns.str.strip()
df2.columns = df2.columns.str.strip()
df1['Classes'] = df1['Classes'].str.lower().str.strip()
df2['Classes'] = df2['Classes'].str.lower().str.strip()
df1 = df1.dropna()
df2 = df2.dropna()
df2['DC'] = df2['DC'].astype(float)
df2['FWI'] = df2['FWI'].astype(float)
# convert string categorial classficiation to numerical classification
class_dict = {'not fire': 0, 'fire': 1}
df1['Classes'] = df1['Classes'].map(class_dict)
df2['Classes'] = df2['Classes'].map(class_dict)


In [39]:
# drop unnecessary attribute (day, mon, year)
df = np.vstack((df1, df2))
cleaned_df = df[:, 3:]
data_x = cleaned_df[:, :-1]
data_y = cleaned_df[:, -1]

In [40]:
print(cleaned_df)
print(cleaned_df.shape)

[[29.  57.  18.  ...  3.4  0.5  0. ]
 [29.  61.  13.  ...  3.9  0.4  0. ]
 [26.  82.  22.  ...  2.7  0.1  0. ]
 ...
 [27.  87.  29.  ...  3.4  0.2  0. ]
 [24.  54.  18.  ...  5.1  0.7  0. ]
 [24.  64.  15.  ...  4.8  0.5  0. ]]
(243, 11)


In [41]:
print(data_x.shape)
print(data_y.shape)

(243, 10)
(243,)


In [42]:
print(data_x)
print(data_y)

[[29.  57.  18.  ...  1.3  3.4  0.5]
 [29.  61.  13.  ...  1.   3.9  0.4]
 [26.  82.  22.  ...  0.3  2.7  0.1]
 ...
 [27.  87.  29.  ...  0.4  3.4  0.2]
 [24.  54.  18.  ...  1.7  5.1  0.7]
 [24.  64.  15.  ...  1.2  4.8  0.5]]
[0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1.
 1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0.
 0. 0. 0.]


In [43]:
# K-Fold Spliting 

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits()

5

In [44]:
print(kf)

KFold(n_splits=5, random_state=42, shuffle=True)


### 2. Base Model

In [45]:
# import necessary libraries
from sklearn.linear_model import LogisticRegression

In [46]:
lg_clf = LogisticRegression(max_iter=2000)
train_acc_vector = []
test_acc_vector = []

for (train_idx, test_idx) in kf.split(data_x):
    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
    train_y, test_y = data_y[train_idx], data_y[test_idx]
    lg_clf.fit(train_x, train_y)
    train_acc_vector.append(lg_clf.score(train_x, train_y))
    test_acc_vector.append(lg_clf.score(test_x, test_y))
avg_train_acc = np.average(train_acc_vector)
avg_test_acc = np.average(test_acc_vector)

In [47]:
print("avg_train_acc: ", avg_train_acc)
print("avg_test_acc: ", avg_test_acc)

avg_train_acc:  0.9907375099127677
avg_test_acc:  0.9835034013605443


### 3. Upgrade

In [48]:
#1 feature scale (convert changes of all features to the same range ) and L1 penalty (force some coefficient to zero)

In [49]:
# type(data_x)
# from scipy.special import softmax

# temp = np.copy(data_x)
# norm_x = softmax(temp)
# data_x = norm_x

In [50]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data_x)
scaled_x = scaler.transform(data_x)

In [51]:
print(scaled_x)

[[0.35       0.52173913 0.52173913 ... 0.06842105 0.03437967 0.01607717]
 [0.35       0.57971014 0.30434783 ... 0.05263158 0.04185351 0.01286174]
 [0.2        0.88405797 0.69565217 ... 0.01578947 0.02391629 0.00321543]
 ...
 [0.25       0.95652174 1.         ... 0.02105263 0.03437967 0.00643087]
 [0.1        0.47826087 0.52173913 ... 0.08947368 0.05979073 0.02250804]
 [0.1        0.62318841 0.39130435 ... 0.06315789 0.05530643 0.01607717]]


In [52]:
u1_lg = LogisticRegression(max_iter=2000, solver='liblinear', penalty='l1')
u1_train_acc_vector = []
u1_test_acc_vector = []

for (train_idx, test_idx) in kf.split(data_x):
    train_x, test_x = scaled_x[train_idx, :], scaled_x[test_idx, :]
    train_y, test_y = data_y[train_idx], data_y[test_idx]
    u1_lg.fit(train_x, train_y)
    u1_train_acc_vector.append(u1_lg.score(train_x, train_y))
    u1_test_acc_vector.append(u1_lg.score(test_x, test_y))
u1_avg_train_acc = np.average(u1_train_acc_vector)
u1_avg_test_acc = np.average(u1_test_acc_vector)

In [53]:
print("u1_avg_train_acc: ", u1_avg_train_acc)
print("u1_avg_test_acc: ", u1_avg_test_acc)

u1_avg_train_acc:  0.971186888712662
u1_avg_test_acc:  0.946513605442177


In [54]:
u1_lg.coef_

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , 21.180689  ,  0.        ,  1.04770311]])

In [55]:
#2 usging feature transformation (order 2)
# select most relevant features then do the polynomial transformation


In [56]:
reduced_x = scaled_x[:, [7 ,9]]
print(reduced_x)

[[0.06842105 0.01607717]
 [0.05263158 0.01286174]
 [0.01578947 0.00321543]
 [0.         0.        ]
 [0.06315789 0.01607717]
 [0.16315789 0.08038585]
 [0.33684211 0.23151125]
 [0.29473684 0.22829582]
 [0.02105263 0.0096463 ]
 [0.06842105 0.02893891]
 [0.21052632 0.18006431]
 [0.25263158 0.22829582]
 [0.02631579 0.00643087]
 [0.05263158 0.01286174]
 [0.02105263 0.00321543]
 [0.         0.        ]
 [0.         0.        ]
 [0.03684211 0.00643087]
 [0.13157895 0.04501608]
 [0.04736842 0.01286174]
 [0.13684211 0.07073955]
 [0.12631579 0.07395498]
 [0.17368421 0.1221865 ]
 [0.29473684 0.24115756]
 [0.3        0.27009646]
 [0.35263158 0.34083601]
 [0.48421053 0.48231511]
 [0.4        0.44694534]
 [0.11578947 0.12540193]
 [0.37894737 0.414791  ]
 [0.05789474 0.01286174]
 [0.04210526 0.0096463 ]
 [0.06842105 0.01607717]
 [0.14210526 0.05466238]
 [0.25263158 0.15755627]
 [0.29473684 0.21864952]
 [0.14736842 0.10289389]
 [0.31578947 0.25723473]
 [0.05789474 0.0192926 ]
 [0.05789474 0.01607717]


In [57]:
from sklearn.preprocessing import PolynomialFeatures

In [58]:
poly = PolynomialFeatures(2)
polyed_x = poly.fit_transform(reduced_x)

In [59]:
print(polyed_x)

[[1.00000000e+00 6.84210526e-02 1.60771704e-02 4.68144044e-03
  1.10001692e-03 2.58475409e-04]
 [1.00000000e+00 5.26315789e-02 1.28617363e-02 2.77008310e-03
  6.76933491e-04 1.65424262e-04]
 [1.00000000e+00 1.57894737e-02 3.21543408e-03 2.49307479e-04
  5.07700118e-05 1.03390163e-05]
 ...
 [1.00000000e+00 2.10526316e-02 6.43086817e-03 4.43213296e-04
  1.35386698e-04 4.13560654e-05]
 [1.00000000e+00 8.94736842e-02 2.25080386e-02 8.00554017e-03
  2.01387714e-03 5.06611801e-04]
 [1.00000000e+00 6.31578947e-02 1.60771704e-02 3.98891967e-03
  1.01540024e-03 2.58475409e-04]]


In [60]:
# in order to avoid over-fitting, we add L2 penalty
u2_lg_clf = LogisticRegression(max_iter=2000, solver='liblinear', penalty='l2')
u2_train_acc_vector = []
u2_test_acc_vector = []

for (train_idx, test_idx) in kf.split(polyed_x):
    train_x, test_x = polyed_x[train_idx, :], polyed_x[test_idx, :]
    train_y, test_y = data_y[train_idx], data_y[test_idx]
    u2_lg_clf.fit(train_x, train_y)
    u2_train_acc_vector.append(u2_lg_clf.score(train_x, train_y))
    u2_test_acc_vector.append(u2_lg_clf.score(test_x, test_y))
u2_avg_train_acc = np.average(u2_train_acc_vector)
u2_avg_test_acc = np.average(u2_test_acc_vector)

In [61]:
print("u2_avg_train_acc: ", u2_avg_train_acc)
print("u2_avg_test_acc: ", u2_avg_test_acc)

u2_avg_train_acc:  0.9331271477663231
u2_avg_test_acc:  0.9340986394557824


In [62]:
u2_lg_clf.coef_

array([[-0.67283405,  3.90708225,  3.62990218,  1.41051643,  1.20826369,
         1.07496456]])