In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from IPython.display import clear_output
import sys
from data_loader import find_common_dates_from_datasets



In [23]:
dates = pd.DataFrame({'date':find_common_dates_from_datasets()})
dates

Unnamed: 0,date
0,2010-01-02
1,2010-01-03
2,2010-01-04
3,2010-01-05
4,2010-01-06
...,...
665,2022-10-26
666,2022-10-27
667,2022-10-28
668,2022-10-29


In [2]:
# Useful functions for formatting data sets

def lon_to_longitude(df):
    '''
    Simple function to add latitude and longitude columns if dataframe only contains columns labeled lon and lat
    '''
    if 'lon' in df.columns:
        df['longitude'] = df['lon']
        df.drop('lon', axis = 1, inplace = True)
    
    if 'lat' in df.columns:
        df['latitude'] = df['lat']
        df.drop('lat', axis = 1, inplace = True)
    

def mv_rounder(df, feature = 'none', date = 'none'):
    '''
    Simple function to round latitude and longitude to nearest 0.5, and drop duplicates (averaging value in feature between duplicates).
    Note that latitude and longitude columns must be named 'latitude' and 'longitude' not 'lat' and 'lon'.

    inputs:

    df (Pandas DataFrame) - dataframe to be used
    feature (column name) - if set, feature to average when duplicates dropped
    data (column name) - if set, date column to be used for dropping duplicates
    '''

    if ('latitude' and 'longitude') in df.columns:
        df['longitude'] = df['longitude'].apply(lambda x: round(x/0.5) * 0.5).apply(lambda x: round(x,1))
        df['latitude'] = df['latitude'].apply(lambda x: round(x/0.5) * 0.5).apply(lambda x: round(x,1))

        if date == 'none':
            if feature != 'none':
                df = df.groupby(['latitude', 'longitude']).agg({feature: 'mean'}).reset_index()
            else:
                df.drop_duplicates(subset = ['latitude','longitude'], inplace = True)

        else:
            if feature != 'none':
                df = df.groupby(['latitude', 'longitude', date]).agg({feature: 'mean'}).reset_index()
            else:
                df.drop_duplicates(subset = ['latitude','longitude', date], inplace = True)           
        
        return df

In [3]:
# Import fire data

# CSV file names
fire_folder_path = './USA_fire_date_2010_2023'
csv_fire_files = [file for file in os.listdir(fire_folder_path) if file.endswith('.csv')]

# Loop through each CSV file and create a dataframe for said file, restricting to rough california coordinates
fire_dataframes = {}
for csv_file in csv_fire_files:

    year = int(csv_file.split('_')[2].split('.')[0])
    
    fire_df = pd.read_csv(os.path.join(fire_folder_path, csv_file), usecols = ['latitude','longitude','acq_date'])
    mv_rounder(fire_df, date = 'acq_date')
    fire_df.rename(columns = {'acq_date':'date'}, inplace = True)
    fire_df['fire'] = [1]*len(fire_df)
    
    
    fire_dataframes[f'{year}'] = fire_df

fire_all_data = pd.concat(fire_dataframes, ignore_index = True)
fire_2015 = fire_dataframes['2015']
fire_2015['month'] = fire_2015['date'].apply(lambda x: x[5:7])
len(fire_2015[fire_2015['month'] == '08'])

275

In [4]:
# Import humidity data

# CSV file names
humidity_folder_path = './humidity_data/processed_data'
csv_humidity_files = [file for file in os.listdir(humidity_folder_path) if file.endswith('.csv')]

# Loop through each CSV file and create a dataframe for said file, restricting to rough california coordinates
humidity_dataframes = {}
for csv_file in csv_humidity_files:

    day = csv_file.split('.')[0]
    
    humidity_df = pd.read_csv(os.path.join(humidity_folder_path, csv_file))
    
    lon_to_longitude(humidity_df)
    mv_rounder(humidity_df)
    humidity_df = humidity_df[humidity_df['Qair_f_inst'] != 0.0]
    humidity_df['date'] = [day]*len(humidity_df)
    humidity_df.drop('Unnamed: 0', axis = 1, inplace = True)
    
    humidity_dataframes[f'{day}'] = humidity_df

humidity_dataframes['2015-08-11']
humidity_all_data = pd.concat(humidity_dataframes, ignore_index=True)
humidity_all_data

Unnamed: 0,Qair_f_inst,longitude,latitude,date
0,0.005590,-117.0,33.0,2010-05-01
1,0.004197,-116.0,33.0,2010-05-01
2,0.004090,-115.5,33.0,2010-05-01
3,0.003755,-115.0,33.0,2010-05-01
4,0.006247,-117.5,33.5,2010-05-01
...,...,...,...,...
380242,0.005064,-120.5,41.5,2010-05-18
380243,0.004874,-120.0,41.5,2010-05-18
380244,0.005794,-123.0,42.0,2010-05-18
380245,0.005263,-122.5,42.0,2010-05-18


In [13]:
# Import temperature data

# CSV file names
temperature_folder_path = './temperature_data/processed'
csv_temperature_files = [file for file in os.listdir(temperature_folder_path) if file.endswith('.csv')]

# Loop through each CSV file and create a dataframe for said file, restricting to rough california coordinates
temperature_dataframes = {}
for csv_file in csv_temperature_files:

    day = csv_file.split('.')[0]
    
    temperature_df = pd.read_csv(os.path.join(temperature_folder_path, csv_file)) #,index_col = 'index')
    
    lon_to_longitude(temperature_df)
    mv_rounder(temperature_df)
    temperature_df = temperature_df.dropna()
    temperature_df['date'] = [day]*len(temperature_df)
    temperature_df.drop('Unnamed: 0', axis = 1, inplace = True)
    temperature_df.rename(columns = {'time':'date'}, inplace = True)
    
    temperature_dataframes[f'{day}'] = temperature_df

temperature_all_data = pd.concat(temperature_dataframes, ignore_index=True)
temperature_all_data

Unnamed: 0,AvgSurfT_tavg,longitude,latitude,date
0,-252.23542,-117.0,33.0,2018-09-19
1,-247.22337,-116.0,33.0,2018-09-19
2,-242.25870,-115.5,33.0,2018-09-19
3,-242.66176,-115.0,33.0,2018-09-19
4,-253.22930,-117.5,33.5,2018-09-19
...,...,...,...,...
582136,5.37090,-120.5,41.5,2010-05-18
582137,4.98100,-120.0,41.5,2010-05-18
582138,7.10790,-123.0,42.0,2010-05-18
582139,7.08470,-122.5,42.0,2010-05-18


In [7]:
# Import wind data

# CSV file names
wind_folder_path = './wind_data/wind_data/csv/daily'
csv_wind_files = [file for file in os.listdir(wind_folder_path) if file.endswith('.csv')]

# Loop through each CSV file and create a dataframe for said file, restricting to rough california coordinates
wind_dataframes = {}
for csv_file in csv_wind_files:

    day = csv_file.split('.')[0]
    
    wind_df = pd.read_csv(os.path.join(wind_folder_path, csv_file), usecols = ['SPEEDLML','lon','lat'])
    
    lon_to_longitude(wind_df)
    mv_rounder(wind_df)
    wind_df = wind_df[wind_df['SPEEDLML'] != 0]
    wind_df['date'] = [day]*len(wind_df)
    
    wind_dataframes[f'{day}'] = wind_df

wind_dataframes['2015-08-11']
wind_all_data = pd.concat(wind_dataframes, ignore_index=True)
wind_all_data

Unnamed: 0,SPEEDLML,longitude,latitude,date
0,4.745177,-117.0,33.0,2018-09-19
1,6.861616,-116.0,33.0,2018-09-19
2,4.926813,-115.5,33.0,2018-09-19
3,4.420054,-115.0,33.0,2018-09-19
4,3.574027,-117.5,33.5,2018-09-19
...,...,...,...,...
556467,6.044104,-120.5,41.5,2010-05-18
556468,6.158571,-120.0,41.5,2010-05-18
556469,3.761051,-123.0,42.0,2010-05-18
556470,3.553320,-122.5,42.0,2010-05-18


In [14]:
# Import precipitation data

# CSV file names
precipitation_folder_path = './precipitation_data/.csv/daily'
csv_precipitation_files = [file for file in os.listdir(precipitation_folder_path) if (file.endswith('.csv') and file.startswith('2015'))]

# Loop through each CSV file and create a dataframe for said file, restricting to rough california coordinates
precipitation_dataframes = {}
for csv_file in csv_precipitation_files:

    day = csv_file.split('.')[0]
    
    precipitation_df = pd.read_csv(os.path.join(precipitation_folder_path, csv_file), usecols = ['precipitationCal','lon','lat'])
    
    lon_to_longitude(precipitation_df)
    mv_rounder(precipitation_df)
    precipitation_df = precipitation_df[precipitation_df['precipitationCal'] != 0]
    precipitation_df['date'] = [day]*len(precipitation_df)
    
    precipitation_dataframes[f'{day}'] = precipitation_df

precipitation_all_data = pd.concat(precipitation_dataframes, ignore_index=True)
precipitation_all_data

Unnamed: 0,precipitationCal,longitude,latitude,date
0,1.298932,-119.0,34.5,2015-09-26
1,0.186110,-116.5,34.5,2015-09-26
2,0.272104,-118.5,35.0,2015-09-26
3,0.007401,-118.0,35.0,2015-09-26
4,0.059209,-117.5,35.0,2015-09-26
...,...,...,...,...
10559,0.343548,-120.5,41.0,2015-09-03
10560,0.013281,-123.0,41.5,2015-09-03
10561,0.041836,-121.5,41.5,2015-09-03
10562,0.013478,-121.5,42.0,2015-09-03


In [25]:
# Merge dataframes into one

# Precipitaion merge on left as otherwise it creates extra fire events, fire merge on left to preserve points of no fire
df_total = pd.merge(humidity_all_data, temperature_all_data, on=['latitude', 'longitude','date'], how = 'outer')
df_total = pd.merge(df_total,precipitation_all_data, on=['latitude', 'longitude','date'], how = 'left')
df_total = pd.merge(df_total,wind_all_data, on=['latitude', 'longitude','date'], how = 'outer')
df_total = pd.merge(df_total,dates, on = ['date'], how = 'right') #ensures only common dates to all features done
df_total = pd.merge(df_total, fire_all_data, on=['latitude', 'longitude','date'], how = 'left')

df_total.rename(columns = {'Qair_f_inst':'humidity', 'AvgSurfT_tavg':'temperature', 'precipitationCal':'precipitation', 'SPEEDLML':'wind_speed'}, inplace = True)

df_total.replace(np.nan,0, inplace = True)
df_total.drop_duplicates(subset = ['latitude','longitude', 'date'], inplace = True)
#df_total.drop('month', axis = 1, inplace = True)

print(sum(df_total['fire']))
print(len(fire_all_data))
#print(len(fire_2015[fire_2015['month'] == '08']))
df_total

2116.0
26690


Unnamed: 0,humidity,longitude,latitude,date,temperature,precipitation,wind_speed,fire
0,0.002801,-117.0,33.0,2010-01-02,16.02690,0.0,5.324515,0.0
1,0.002490,-116.0,33.0,2010-01-02,14.43972,0.0,3.492439,0.0
2,0.002753,-115.5,33.0,2010-01-02,16.33618,0.0,5.186181,0.0
3,0.002701,-115.0,33.0,2010-01-02,14.00012,0.0,7.194674,0.0
4,0.003392,-117.5,33.5,2010-01-02,18.03130,0.0,4.882323,0.0
...,...,...,...,...,...,...,...,...
89105,0.003189,-120.5,41.5,2022-10-30,9.81548,0.0,2.130499,0.0
89106,0.003177,-120.0,41.5,2022-10-30,9.12050,0.0,1.359795,0.0
89107,0.005613,-123.0,42.0,2022-10-30,9.70107,0.0,2.106233,0.0
89108,0.005114,-122.5,42.0,2022-10-30,9.26983,0.0,2.280401,0.0


In [28]:
np.unique(df_total['date'])

array(['2010-01-02', '2010-01-03', '2010-01-04', '2010-01-05',
       '2010-01-06', '2010-01-07', '2010-01-08', '2010-01-09',
       '2010-01-10', '2010-01-11', '2010-01-12', '2010-01-13',
       '2010-01-14', '2010-01-15', '2010-01-16', '2010-01-17',
       '2010-01-18', '2010-01-19', '2010-01-20', '2010-01-21',
       '2010-01-22', '2010-01-23', '2010-01-24', '2010-01-25',
       '2010-01-26', '2010-01-27', '2010-01-28', '2010-01-29',
       '2010-01-30', '2010-01-31', '2010-03-01', '2010-03-02',
       '2010-03-03', '2010-03-04', '2010-03-05', '2010-03-06',
       '2010-03-07', '2010-03-08', '2010-03-09', '2010-03-10',
       '2010-03-11', '2010-03-12', '2010-03-13', '2010-03-14',
       '2010-03-15', '2010-03-16', '2010-03-17', '2010-03-18',
       '2010-03-19', '2010-03-20', '2010-03-21', '2010-03-22',
       '2010-03-23', '2010-03-24', '2010-03-25', '2010-03-26',
       '2010-03-27', '2010-03-28', '2010-03-29', '2010-03-30',
       '2010-03-31', '2010-04-01', '2010-04-02', '2010-

In [26]:
df_total.to_csv('./ML_large_dataframe.csv', index = False)

In [None]:
# ----------------------------------------------------
#-----------------------------------------------------
#-----------------------------------------------------