In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

# Instructions:
## Change the inputs to your desired values and run the entire notebook. The data will be output as big_data.h5 in the same folder as this notebook.
# Change inputs here:
## n_pred is the list of forecast horizons, in days, that you want to predict (e.g. [1, 4, 7] means predict 1, 4 and 7 days ahead). The list should be in ascending order, with the earliest prediction first.

In [2]:
# Forecast horizons in days; one target column is generated per entry. Keep in ascending order.
n_pred = [1,4,7]

## n_hist is the number of days of history you want to include (we always used n_hist = 1).

In [3]:
# Number of consecutive days (24 hourly rows each) used as input history per sample.
n_hist = 1

## target is the file name associated with the variable you want to target (see file list in the next cell for a list of possible values).

In [4]:
# Variable to predict; compared against each entry of file_list to decide which file yields the targets.
target = 'temperature'

## file_list is the list of all csv files to be processed, given without the .csv extension. Do not remove elements from this list, but make sure the target filename is the first element by swapping it with whatever is currently first (e.g. 'temperature' must be first if temperature is being targeted). The order after the first element doesn't matter.
## If you want to target weather events, add "events" as the first element and set target to "events"; otherwise leave "events" out of the list.

In [5]:
# Base names of the CSV files to load ('<name>.csv'); the target variable must come first.
file_list = ['temperature','pressure','wind_speed','humidity','wind_direction']

## keep_idx is a list of indices indicating which cities should be included in the data (see cell 10 for the corresponding list of cities). We used all cities (indices 0 to 35) for all our final results.

In [6]:
# Positions (into the city list defined further down) of the cities to keep: 0, 1, ..., 35.
keep_idx = np.arange(start=0, stop=36, step=1)

# Do not change any code below this cell!

In [7]:
def extract_chunk(df,idx,n_hist,n_pred,file,target,include_extra,lat_list,alt_list):
    """Build the per-city feature rows for the window that starts at day ``idx``.

    ``df`` is consumed in blocks of 24 rows (one block per day). Every column
    except the last is treated as one city; the last column is expected to
    hold timestamps.

    Parameters
    ----------
    df : DataFrame
        Hourly readings, one column per city plus a trailing timestamp column.
    idx : int
        Zero-based day offset of the first history day.
    n_hist : int
        Number of consecutive days used as history.
    n_pred : sequence of int
        Non-empty collection of horizons (in days) to build targets for.
        Targets are inserted at the front one by one, so the final column
        order is the reverse of the order given here.
    file : str
        Variable name used as the prefix of the hourly column labels.
    target : str
        Variable being predicted; target columns are only added when
        ``file == target``.
    include_extra : bool
        When true, add ``day_of_year``, ``altitude`` and ``latitude`` columns.
    lat_list, alt_list : list
        Per-city latitude / altitude, in the same order as the city columns.

    Returns
    -------
    (valid, chunk)
        ``(False, None)`` when ``df`` has too few rows for the history window
        plus the furthest horizon; otherwise ``(True, chunk)`` with one row
        per city, indexed ``idx*num_cities`` .. ``(idx+1)*num_cities - 1``.
    """
    num_cities = df.shape[1]-1

    # The furthest horizon decides whether the window still fits. Using max()
    # instead of the last element keeps the check correct even when n_pred is
    # not sorted (otherwise an out-of-range slice silently yields NaN targets).
    furthest = max(n_pred)
    if (idx+n_hist+furthest+1)*24 > df.shape[0]:
        return False, None

    first_row = idx*24
    last_row = (idx+n_hist)*24

    # Transpose so that each city becomes a row and each hour a column.
    chunk = df.iloc[first_row:last_row,:-1].T

    # Labels are '<file>_<days back>_<hour>'. Hours run 12..23 then 0..11,
    # presumably because the source data starts at 12:00 -- TODO confirm.
    chunk.columns = [
        file+'_'+str(days_back)+'_'+str(hour % 24)
        for days_back in range(n_hist-1,-1,-1)
        for hour in range(12,12+24)
    ]

    if include_extra:
        # Day of year is taken from the first timestamp of the history window.
        first_stamp = df.iloc[first_row,-1]
        chunk.insert(0,'day_of_year',first_stamp.day_of_year)
        # Both inserts use position 1, so altitude ends up before latitude.
        chunk.insert(1,'latitude',lat_list)
        chunk.insert(1,'altitude',alt_list)

    if file == target:
        for horizon in n_pred:
            day_start = (idx+n_hist+horizon)*24
            # skipna=False: a single missing hour makes the whole target NaN.
            next_mean = df.iloc[day_start:day_start+24,:-1].T.mean(axis=1,skipna=False)
            chunk.insert(0,'mean_'+target+'_'+str(horizon),next_mean)

    # Replace the city-name index with a unique running row number.
    chunk.index = range(idx*num_cities,(idx+1)*num_cities)
    return True, chunk

In [8]:
def extract_chunk_events(df,idx,n_hist,n_pred,file,target,include_extra,lat_list,alt_list):
    """Build the per-city event rows for the window that starts at day ``idx``.

    ``df`` is consumed in blocks of 24 rows (one block per day). Every column
    except the last is treated as one city; the last column is expected to
    hold timestamps. Only the first row of each 24-row block is used as that
    day's event label.

    Parameters
    ----------
    df : DataFrame
        Hourly event labels, one column per city plus a trailing timestamp column.
    idx : int
        Zero-based day offset of the first history day.
    n_hist : int
        Number of history days (only the first hour of the window is kept).
    n_pred : sequence of int
        Non-empty collection of horizons (in days) to build targets for.
        Targets are inserted at the front one by one, so the final column
        order is the reverse of the order given here.
    file : str
        Name of the variable being processed.
    target : str
        Variable being predicted; ``event_<i>`` target columns are only added
        when ``file == target``.
    include_extra : bool
        When true, add ``day_of_year``, ``altitude`` and ``latitude`` columns.
    lat_list, alt_list : list
        Per-city latitude / altitude, in the same order as the city columns.

    Returns
    -------
    (valid, chunk)
        ``(False, None)`` when ``df`` has too few rows for the history window
        plus the furthest horizon; otherwise ``(True, chunk)`` with one row
        per city, indexed ``idx*num_cities`` .. ``(idx+1)*num_cities - 1``.
    """
    num_cities = df.shape[1]-1

    # Use the largest horizon (not merely the last one) for the bounds check
    # so an unsorted n_pred cannot read past the end of the data.
    furthest = max(n_pred)
    if (idx+n_hist+furthest+1)*24 > df.shape[0]:
        return False, None

    first_row = idx*24
    last_row = (idx+n_hist)*24

    # One row per city; keep only the first hour of the window as the label.
    window = df.iloc[first_row:last_row,:-1].T.astype('category')
    chunk = window.iloc[:,[0]].copy()
    chunk.columns = ['event']

    if include_extra:
        # Day of year is taken from the first timestamp of the history window.
        first_stamp = df.iloc[first_row,-1]
        chunk.insert(0,'day_of_year',first_stamp.day_of_year)
        # Both inserts use position 1, so altitude ends up before latitude.
        chunk.insert(1,'latitude',lat_list)
        chunk.insert(1,'altitude',alt_list)

    if file == target:
        for horizon in n_pred:
            day_start = (idx+n_hist+horizon)*24
            # The target is the event in the first hour of the predicted day.
            next_event = df.iloc[day_start:day_start+24,:-1].T.iloc[:,0]
            chunk.insert(0,'event_'+str(horizon),next_event)

    # Replace the city-name index with a unique running row number.
    chunk.index = range(idx*num_cities,(idx+1)*num_cities)
    return True, chunk

In [9]:
def get_latitudes(city_list):
    """Return the latitude of each city in ``city_list``, in the same order.

    Latitudes are read from the ``City`` and ``Latitude`` columns of
    ``city_attributes.csv`` in the working directory. A city matches the
    first row whose ``City`` value starts with the requested name; an
    IndexError is raised when no row matches.
    """
    attributes = pd.read_csv('city_attributes.csv')
    names = attributes.City
    latitudes = attributes.Latitude
    # str.find(...) == 0 means the requested name is a prefix of the stored one.
    return [latitudes[names.str.find(city).eq(0)].values[0] for city in city_list]

In [10]:
# (city, altitude) pairs; keep_idx positions refer to this order.
# Altitude units are not stated in this notebook -- presumably feet; TODO confirm.
city_altitudes = [
    ('Vancouver', 0), ('Portland', 50), ('San Francisco', 52), ('Seattle', 174),
    ('Los Angeles', 305), ('San Diego', 62), ('Las Vegas', 2001), ('Phoenix', 1086),
    ('Albuquerque', 5312), ('Denver', 5279), ('San Antonio', 650), ('Dallas', 430),
    ('Houston', 79), ('Kansas City', 909), ('Minneapolis', 830), ('Saint Louis', 466),
    ('Chicago', 597), ('Nashville', 597), ('Indianapolis', 719), ('Atlanta', 738),
    ('Detroit', 656), ('Jacksonville', 16), ('Charlotte', 761), ('Miami', 6),
    ('Pittsburgh', 1223), ('Toronto', 251), ('Philadelphia', 39), ('New York', 33),
    ('Montreal', 764), ('Boston', 141), ('Beersheba', 853), ('Tel Aviv District', 16),
    ('Eilat', 1148), ('Haifa', 33), ('Nahariyya', 46), ('Jerusalem', 2474),
]
city_list = [city for city, _ in city_altitudes]
altitudes = [altitude for _, altitude in city_altitudes]

# Restrict cities and altitudes to the selected positions, then look up the
# matching latitudes while keep_list still contains city names only.
keep_list = [city_list[position] for position in keep_idx]
alt_list = [altitudes[position] for position in keep_idx]
lat_list = get_latitudes(keep_list)

# From here on keep_list doubles as the CSV column selection, so the
# timestamp column is appended as its last entry.
keep_list.append('datetime')
# Build one feature table per CSV and join the tables column-wise on the
# running row index that the extract_chunk* helpers assign.
master_df = pd.DataFrame()
for file in file_list:
    # Only the target file contributes the targets and the
    # day_of_year / altitude / latitude columns.
    include_extra = (file == target)

    df = pd.read_csv(file+'.csv').loc[:,keep_list]
    # Parse the timestamps in place; 'datetime' stays the last column
    # because keep_list ends with it.
    df = df.assign(datetime=pd.to_datetime(df.datetime))

    # Consecutive rows should be exactly one hour apart (mod 24); count the
    # places where they are not. The first row has no predecessor.
    hour_steps = df.datetime.dt.hour.diff().iloc[1:]
    gaps = int((hour_steps % 24 != 1).sum())
    print('Checking',file+'.csv for missing values\n\t'+str(df.isna().sum().sum()),'NaNs\n\t'+str(gaps),'hour gaps')

    print('Checking',file+'.csv size\n\t'+str(df.shape[0]),'rows x',df.shape[1],'columns')

    extractor = extract_chunk_events if file == 'events' else extract_chunk

    # Collect the per-day chunks and concatenate once at the end; calling
    # pd.concat inside the loop re-copies everything gathered so far.
    chunks = []
    idx = 0
    while True:
        valid, chunk = extractor(df,idx,n_hist,n_pred,file,target,include_extra,lat_list,alt_list)
        if not valid:
            break
        chunks.append(chunk)
        idx = idx + 1

    if not chunks:
        raise ValueError(file+'.csv has too few rows to build a single sample for n_hist='
                         +str(n_hist)+' and n_pred='+str(list(n_pred)))
    big_chunk = pd.concat(chunks)

    if master_df.shape[0] == 0:
        master_df = big_chunk
    else:
        master_df = master_df.join(big_chunk)

Checking temperature.csv for missing values
	8030 NaNs
	0 hour gaps
Checking temperature.csv size
	45253 rows x 37 columns
Checking pressure.csv for missing values
	16680 NaNs
	0 hour gaps
Checking pressure.csv size
	45253 rows x 37 columns
Checking wind_speed.csv for missing values
	7993 NaNs
	0 hour gaps
Checking wind_speed.csv size
	45253 rows x 37 columns
Checking humidity.csv for missing values
	28651 NaNs
	0 hour gaps
Checking humidity.csv size
	45253 rows x 37 columns
Checking wind_direction.csv for missing values
	7975 NaNs
	0 hour gaps
Checking wind_direction.csv size
	45253 rows x 37 columns


In [11]:
if target == 'events':
    # The first len(n_pred) columns are the event targets and 'event' is the
    # current-day label. Cast them by replacing whole columns through astype:
    # assigning through .iloc[:, ...] / .loc[:, ...] writes into the existing
    # columns and keeps their current dtype in pandas >= 2.0, so the
    # categorical cast would be silently lost.
    event_columns = list(master_df.columns[:len(n_pred)]) + ['event']
    master_df = master_df.astype({column: 'category' for column in event_columns})

In [12]:
# (rows, columns) of the joined table before rows with missing values are removed.
master_df.shape

(67572, 126)

In [13]:
# Preview of the shape after dropping every row that contains at least one NaN.
master_df.dropna().shape

(61307, 126)

In [14]:
# Keep only complete rows; from here on master_df no longer contains any NaN.
master_df = master_df.dropna()

In [15]:
# Write the cleaned table to big_data.h5 under the key 'master_df', overwriting
# any existing file. The key is passed by keyword because recent pandas
# versions deprecate positional arguments after the path.
master_df.to_hdf('big_data.h5',key='master_df',mode='w',format='table',complevel=9)

In [16]:
# Show the first ten rows of the final table as a sanity check.
master_df.head(10)

Unnamed: 0,mean_temperature_7,mean_temperature_4,mean_temperature_1,day_of_year,altitude,latitude,temperature_0_12,temperature_0_13,temperature_0_14,temperature_0_15,...,wind_direction_0_2,wind_direction_0_3,wind_direction_0_4,wind_direction_0_5,wind_direction_0_6,wind_direction_0_7,wind_direction_0_8,wind_direction_0_9,wind_direction_0_10,wind_direction_0_11
32,298.917917,299.718333,302.856458,275,1148,29.55805,309.1,310.58,310.495769,310.411538,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
37,285.735,288.664583,288.033958,276,50,45.523449,282.272771,282.281385,282.29,282.51,...,310.0,320.0,290.0,310.0,320.0,310.0,10.0,350.0,320.0,320.0
38,289.655833,288.943333,292.629167,276,52,37.774929,289.158749,289.144375,289.13,290.73,...,270.0,290.0,280.0,250.0,270.0,0.0,210.0,0.0,0.0,0.0
39,287.65,286.956667,284.51125,276,174,47.606209,281.634768,281.627384,281.62,282.71,...,0.0,40.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,292.36,293.704167,294.345833,276,305,34.052231,291.750682,291.745341,291.74,293.97,...,280.0,0.0,0.0,0.0,0.0,0.0,118.0,0.0,0.0,125.0
41,293.288958,293.344583,295.123125,276,62,32.715328,291.750292,291.760146,291.77,294.91,...,290.0,290.0,310.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,296.111042,292.035833,295.934792,276,650,29.42412,290.070304,290.105152,290.14,295.0,...,270.0,0.0,0.0,0.0,0.0,0.0,0.0,330.0,320.0,320.0
47,293.907292,283.55625,295.619583,276,430,32.783058,291.254415,291.322207,291.39,293.97,...,320.0,320.0,350.0,340.0,350.0,350.0,320.0,350.0,340.0,340.0
48,296.059792,294.68,295.681875,276,79,29.763281,289.106465,289.143232,289.18,293.94,...,290.0,260.0,255.0,250.0,260.0,250.0,250.0,240.0,260.0,275.0
50,279.528125,275.388333,290.812292,276,830,44.979969,288.16447,288.222235,288.28,289.36,...,340.0,340.0,350.0,340.0,350.0,360.0,0.0,0.0,320.0,0.0


In [17]:
# List every column name next to its value in the first row of the table.
for position, column in enumerate(master_df.columns):
    print(column,master_df.iloc[0,position])

mean_temperature_7 298.91791666666666
mean_temperature_4 299.7183333333333
mean_temperature_1 302.85645833333336
day_of_year 275
altitude 1148
latitude 29.55805
temperature_0_12 309.1
temperature_0_13 310.58
temperature_0_14 310.495769231
temperature_0_15 310.411538462
temperature_0_16 310.327307692
temperature_0_17 310.243076923
temperature_0_18 310.158846154
temperature_0_19 310.074615385
temperature_0_20 309.990384615
temperature_0_21 309.906153846
temperature_0_22 309.821923077
temperature_0_23 309.737692308
temperature_0_0 309.653461538
temperature_0_1 309.569230769
temperature_0_2 309.485
temperature_0_3 309.400769231
temperature_0_4 309.316538462
temperature_0_5 309.232307692
temperature_0_6 309.148076923
temperature_0_7 309.063846154
temperature_0_8 308.979615385
temperature_0_9 308.895384615
temperature_0_10 308.811153846
temperature_0_11 308.726923077
pressure_0_12 1011.0
pressure_0_13 1010.0
pressure_0_14 1010.0
pressure_0_15 1010.0
pressure_0_16 1010.0
pressure_0_17 1010.0
