# Algerian Forest Fire Data - Bejaia and Sidi Bel Abbes Region Cleaning

###### imports / reads

In [1]:
import pandas as pd

In [13]:
# Load the two raw regional extracts (relative paths, resolved against the current working directory).
bejaia, sid = (
    pd.read_csv(csv_name)
    for csv_name in ('alg_bejaia_region.csv', 'alg_sidibelabbes_region.csv')
)

## Cleaning

### Bejaia Region

In [3]:
# Preview the first five rows of the raw Bejaia frame
bejaia.head(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [4]:
# Raw header names as read from the CSV; the output shows several of them carry stray spaces
bejaia.columns

Index(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes  '],
      dtype='object')

In [5]:
# Normalise the header names: lower-case them, then trim the surrounding whitespace

bejaia.columns = bejaia.columns.str.lower().str.strip()
bejaia.columns

Index(['day', 'month', 'year', 'temperature', 'rh', 'ws', 'rain', 'ffmc',
       'dmc', 'dc', 'isi', 'bui', 'fwi', 'classes'],
      dtype='object')

In [6]:
# Creating new 'date' column and rearranging the columns
#
# `bejaia_clean` is built as an independent copy of the raw frame. A plain
# `bejaia_clean = bejaia` only aliases it: adding 'date' would then also modify
# `bejaia`, and the column selection would leave `bejaia_clean` as a slice of
# that frame, making the later in-place edits prone to SettingWithCopyWarning.

bejaia_column_order = ['date', 'day', 'month', 'year', 'temperature', 'rh', 'ws', 'rain', 'ffmc',
                       'dmc', 'dc', 'isi', 'bui', 'fwi', 'classes']

bejaia_clean = (
    bejaia
    # pd.to_datetime assembles one datetime per row from the day/month/year columns
    .assign(date=pd.to_datetime(bejaia[['day', 'month', 'year']]))
    # put 'date' first, keep every other column in its original order
    .loc[:, bejaia_column_order]
    # explicit copy so the result is not tracked as a slice of another frame
    .copy()
)
bejaia_clean.columns

Index(['date', 'day', 'month', 'year', 'temperature', 'rh', 'ws', 'rain',
       'ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi', 'classes'],
      dtype='object')

In [7]:
# Double-checking the data type of each column in the dataframe
bejaia_clean.dtypes

date           datetime64[ns]
day                     int64
month                   int64
year                    int64
temperature             int64
rh                      int64
ws                      int64
rain                  float64
ffmc                  float64
dmc                   float64
dc                    float64
isi                   float64
bui                   float64
fwi                   float64
classes                object
dtype: object

In [8]:
# Trim the padding around each 'classes' label, then list the distinct labels that remain

bejaia_classes_trimmed = bejaia_clean['classes'].str.strip()
bejaia_clean.loc[:, 'classes'] = bejaia_classes_trimmed
bejaia_clean['classes'].unique()

array(['not fire', 'fire'], dtype=object)

In [9]:
# Count the missing values in each column

bejaia_clean.isna().sum()

date           0
day            0
month          0
year           0
temperature    0
rh             0
ws             0
rain           0
ffmc           0
dmc            0
dc             0
isi            0
bui            0
fwi            0
classes        0
dtype: int64

In [10]:
# Save the cleaned frame without its index, then re-read the file as a sanity check

bejaia_out_path = 'bejaia_region_clean.csv'
bejaia_clean.to_csv(bejaia_out_path, index=False)

pd.read_csv(bejaia_out_path).head()

Unnamed: 0,date,day,month,year,temperature,rh,ws,rain,ffmc,dmc,dc,isi,bui,fwi,classes
0,2012-06-01,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2012-06-02,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,2012-06-03,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,2012-06-04,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,2012-06-05,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


### Sidi Bel Abbes Region

In [14]:
# Preview the first five rows of the raw Sidi Bel Abbes frame
sid.head(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,32,71,12,0.7,57.1,2.5,8.2,0.6,2.8,0.2,not fire
1,2,6,2012,30,73,13,4.0,55.7,2.7,7.8,0.6,2.9,0.2,not fire
2,3,6,2012,29,80,14,2.0,48.7,2.2,7.6,0.3,2.6,0.1,not fire
3,4,6,2012,30,64,14,0.0,79.4,5.2,15.4,2.2,5.6,1.0,not fire
4,5,6,2012,32,60,14,0.2,77.1,6.0,17.6,1.8,6.5,0.9,not fire


In [15]:
# Raw header names as read from the CSV; the output shows several of them carry stray spaces
sid.columns

Index(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes  '],
      dtype='object')

In [16]:
# Normalise the header names: lower-case them, then trim the surrounding whitespace
sid.columns = sid.columns.str.lower().str.strip()
sid.columns

Index(['day', 'month', 'year', 'temperature', 'rh', 'ws', 'rain', 'ffmc',
       'dmc', 'dc', 'isi', 'bui', 'fwi', 'classes'],
      dtype='object')

In [17]:
# Creating 'date' column and rearranging the columns
#
# `sid_clean` is built as an independent copy of the raw frame. A plain
# `sid_clean = sid` only aliases it: adding 'date' would then also modify `sid`,
# and the column selection would leave `sid_clean` as a slice of that frame,
# making the later in-place edits prone to SettingWithCopyWarning.
sid_column_order = ['date', 'day', 'month', 'year', 'temperature', 'rh', 'ws', 'rain', 'ffmc',
                    'dmc', 'dc', 'isi', 'bui', 'fwi', 'classes']

sid_clean = (
    sid
    # pd.to_datetime assembles one datetime per row from the day/month/year columns
    .assign(date=pd.to_datetime(sid[['day', 'month', 'year']]))
    # put 'date' first, keep every other column in its original order
    .loc[:, sid_column_order]
    # explicit copy so the result is not tracked as a slice of another frame
    .copy()
)
sid_clean.head(3)

Unnamed: 0,date,day,month,year,temperature,rh,ws,rain,ffmc,dmc,dc,isi,bui,fwi,classes
0,2012-06-01,1,6,2012,32,71,12,0.7,57.1,2.5,8.2,0.6,2.8,0.2,not fire
1,2012-06-02,2,6,2012,30,73,13,4.0,55.7,2.7,7.8,0.6,2.9,0.2,not fire
2,2012-06-03,3,6,2012,29,80,14,2.0,48.7,2.2,7.6,0.3,2.6,0.1,not fire


In [18]:
# Checking the data type of each column; the output lists 'dc' and 'fwi' as object
sid_clean.dtypes

date           datetime64[ns]
day                     int64
month                   int64
year                    int64
temperature             int64
rh                      int64
ws                      int64
rain                  float64
ffmc                  float64
dmc                   float64
dc                     object
isi                   float64
bui                   float64
fwi                    object
classes                object
dtype: object

In [19]:
# Count the missing values in each column
sid_clean.isna().sum()

date           0
day            0
month          0
year           0
temperature    0
rh             0
ws             0
rain           0
ffmc           0
dmc            0
dc             0
isi            0
bui            0
fwi            0
classes        1
dtype: int64

The 'fwi' and 'dc' fields are typed as object instead of float.  After inspection, it looks like "fire" was entered into the 'fwi' column instead of the 'classes' column.  There was also a typo in the 'dc' column.

In [20]:
# Replace the missing label with 'fire' and trim the padding around every label in one pass
sid_clean['classes'] = sid_clean['classes'].fillna('fire').str.strip()

# Only two distinct labels should be left
sid_clean['classes'].unique()

array(['not fire', 'fire'], dtype=object)

In [21]:
# Blank out the bad 'fwi' entry: look up the label of the row at position 43, then null that cell
bad_fwi_label = sid_clean.index[43]
sid_clean.loc[bad_fwi_label, 'fwi'] = None

# 'fwi' should now be the only column reporting a null (exactly one)
sid_clean.isna().sum()

date           0
day            0
month          0
year           0
temperature    0
rh             0
ws             0
rain           0
ffmc           0
dmc            0
dc             0
isi            0
bui            0
fwi            1
classes        0
dtype: int64

In [22]:
# In the row at position 43, the 'dc' value was entered as "14.6 9"
# I am assuming the extra 9 was added by mistake, so I will remove it
#
# NOTE(review): a stray extra token in 'dc' on the very row whose label ended up
# in 'fwi' may instead mean a delimiter was lost and the values are shifted one
# column to the left (dc=14.6, isi=9, ...) - TODO confirm against the raw file.
#
# The row is addressed through `sid_clean.index[43]` (position -> label), the same
# way the 'fwi' fix addresses it, so both cells hit the same row whatever the
# index labels are.

sid_clean.at[sid_clean.index[43], 'dc'] = 14.6

In [23]:
# Cast the two repaired columns ('fwi', then 'dc') to float
for repaired_col in ('fwi', 'dc'):
    sid_clean[repaired_col] = sid_clean[repaired_col].astype(float)

# Confirm the resulting column types
sid_clean.dtypes

date           datetime64[ns]
day                     int64
month                   int64
year                    int64
temperature             int64
rh                      int64
ws                      int64
rain                  float64
ffmc                  float64
dmc                   float64
dc                    float64
isi                   float64
bui                   float64
fwi                   float64
classes                object
dtype: object

In [24]:
# Preview the first five rows of the cleaned Sidi Bel Abbes frame
sid_clean.head(n=5)

Unnamed: 0,date,day,month,year,temperature,rh,ws,rain,ffmc,dmc,dc,isi,bui,fwi,classes
0,2012-06-01,1,6,2012,32,71,12,0.7,57.1,2.5,8.2,0.6,2.8,0.2,not fire
1,2012-06-02,2,6,2012,30,73,13,4.0,55.7,2.7,7.8,0.6,2.9,0.2,not fire
2,2012-06-03,3,6,2012,29,80,14,2.0,48.7,2.2,7.6,0.3,2.6,0.1,not fire
3,2012-06-04,4,6,2012,30,64,14,0.0,79.4,5.2,15.4,2.2,5.6,1.0,not fire
4,2012-06-05,5,6,2012,32,60,14,0.2,77.1,6.0,17.6,1.8,6.5,0.9,not fire


In [25]:
# Save the cleaned frame without its index, then re-read the file as a sanity check
sid_out_path = 'sidibelabbes_region_clean.csv'
sid_clean.to_csv(sid_out_path, index=False)
pd.read_csv(sid_out_path).head(3)

Unnamed: 0,date,day,month,year,temperature,rh,ws,rain,ffmc,dmc,dc,isi,bui,fwi,classes
0,2012-06-01,1,6,2012,32,71,12,0.7,57.1,2.5,8.2,0.6,2.8,0.2,not fire
1,2012-06-02,2,6,2012,30,73,13,4.0,55.7,2.7,7.8,0.6,2.9,0.2,not fire
2,2012-06-03,3,6,2012,29,80,14,2.0,48.7,2.2,7.6,0.3,2.6,0.1,not fire
