In [18]:
import netCDF4
import xarray as xr
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split

In [4]:
#NOTE: File paths should be the same if you are working in the repo.
#Also, I would push these csv's but they're bigger than the max file size lmao
wind = xr.open_dataset('../Data/new_raw_data/agg_terraclimate_windspeed_1979_2020.nc')
drought = xr.open_dataset('../Data/new_raw_data/Drought_PDSI_1979_2020_GLOBE.nc')
wind.ws.to_dataframe().to_csv('../Data/new_raw_data/windspeed.csv')
drought.PDSI.to_dataframe().to_csv('../Data/new_raw_data/drought.csv')

In [5]:
wind_df = pd.read_csv('../data/new_raw_data/windspeed.csv')
wind_df = np.round(wind_df, decimals=1)
wind_df.head(5)

Unnamed: 0,time,lat,lon,ws
0,1979-01-01,41.0,-109.0,2.9
1,1979-01-01,41.0,-109.0,2.8
2,1979-01-01,41.0,-108.9,2.9
3,1979-01-01,41.0,-108.9,2.7
4,1979-01-01,41.0,-108.9,2.6


In [6]:
drought_df = pd.read_csv('../data/new_raw_data/drought.csv')
drought_df = np.round(drought_df, decimals=1)
drought_df.head()

Unnamed: 0,time,lat,lon,PDSI
0,1979-01-01,37.0,-109.0,4.2
1,1979-01-01,37.0,-109.0,4.2
2,1979-01-01,37.0,-108.9,4.1
3,1979-01-01,37.0,-108.9,4.1
4,1979-01-01,37.0,-108.9,3.9


In [7]:
wildfire_df = pd.read_csv('../Data/Clean Data/Wildfires/cleaned_wildfires.csv')
wildfire_df = np.round(wildfire_df, decimals=1)
wildfire_df.head()

Unnamed: 0,LATITUDE,LONGITUDE,DISCOVERY_DATE
0,37.3,-102.8,7/7/2005
1,39.4,-105.1,6/28/2005
2,37.2,-103.2,7/24/2005
3,37.0,-102.6,7/24/2005
4,37.3,-102.7,7/17/2005


In [8]:
wildfire_df['DISCOVERY_DATE'].apply(str)
date_list = wildfire_df['DISCOVERY_DATE'].tolist()
fire_list = []
for i in range(len(date_list)):
    fire_list.append(1)
    if not date_list[i].endswith('01'):
        date_list[i] = date_list[i][:-2] + "01"
wildfire_df['time'] = date_list
wildfire_df['fire'] = fire_list
wildfire_df = wildfire_df.drop(['DISCOVERY_DATE'], axis=1)
wildfire_df = wildfire_df.rename({'LATITUDE': 'lat', 'LONGITUDE': 'lon'}, axis=1)
wildfire_df = wildfire_df[['time', 'lat', 'lon', 'fire']]
wildfire_df.head()

Unnamed: 0,time,lat,lon,fire
0,7/7/2001,37.3,-102.8,1
1,6/28/2001,39.4,-105.1,1
2,7/24/2001,37.2,-103.2,1
3,7/24/2001,37.0,-102.6,1
4,7/17/2001,37.3,-102.7,1


In [9]:
final_df = pd.merge(wind_df, drought_df, how='outer')
final_df = final_df.dropna()
final_df = pd.merge(final_df, wildfire_df, how='outer')
final_df.head()

Unnamed: 0,time,lat,lon,ws,PDSI,fire
0,1979-01-01,37.0,-109.0,2.0,4.2,
1,1979-01-01,37.0,-109.0,2.0,4.2,
2,1979-01-01,37.0,-109.0,2.0,4.2,
3,1979-01-01,37.0,-109.0,2.0,4.2,
4,1979-01-01,37.0,-109.0,2.0,4.2,


In [10]:
final_df.shape

(425804, 6)

In [11]:
final_df = final_df[final_df['time'].between('2000-01-01', '2015-12-01')]
# print(final_df.dtypes)
final_df.tail()

Unnamed: 0,time,lat,lon,ws,PDSI,fire
373843,2015-12-01,37.0,-102.1,4.9,5.5,
373844,2015-12-01,37.0,-102.1,4.9,5.3,
373845,2015-12-01,37.0,-102.1,4.9,5.1,
373846,2015-12-01,37.0,-102.0,4.9,4.8,
373847,2015-12-01,37.0,-102.0,4.9,4.8,


In [12]:
final_df = final_df[final_df['ws'].notna()]
final_df = final_df[final_df['PDSI'].notna()]
final_df.head()

Unnamed: 0,time,lat,lon,ws,PDSI,fire
212184,2000-01-01,37.0,-109.0,1.9,-3.5,
212185,2000-01-01,37.0,-109.0,1.9,-3.5,
212186,2000-01-01,37.0,-109.0,1.9,-3.5,
212187,2000-01-01,37.0,-109.0,1.9,-3.5,
212188,2000-01-01,37.0,-109.0,1.9,-3.5,


In [13]:
final_df.shape

(161664, 6)

In [15]:
final_df.to_csv("../Data/Clean Data/final_data.csv", encoding='utf-8', index=False)

In [20]:
X = final_df[['lat','lon','ws', 'PDSI']]
y = final_df['fire']
fraction_split = 0.2

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=fraction_split)
print(X_train)

         lat    lon   ws  PDSI
256707  37.0 -102.9  5.9  -1.6
315166  37.0 -106.9  3.1   0.1
352534  37.0 -104.2  4.7  -1.9
214007  37.0 -107.9  2.8  -2.6
277610  37.0 -104.1  4.8  -3.7
...      ...    ...  ...   ...
363795  37.0 -108.6  1.6   1.1
329699  37.0 -105.1  3.8  -4.5
217487  37.0 -106.9  2.8  -2.6
321695  37.0 -108.6  2.8   1.0
242638  37.0 -107.9  1.7  -3.1

[129331 rows x 4 columns]


In [30]:
training_data = [X_train, y_train]
testing_data = [X_test, y_test]
training_data.to_csv("../Data/Clean Data/training_data.csv", encoding='utf-8', index=False)
testing_data.to_csv("../Data/Clean Data/testing_data.csv", encoding='utf-8', index=False)

  values = np.array([convert(v) for v in values])


ValueError: could not broadcast input array from shape (129331,4) into shape (129331,)