In [3]:
import pandas as pd
import numpy as np
import re
import time
import dill
from datetime import timedelta
from csv_pkl_sql import save_it, pkl_it

## Scrape appropriate date and location for weather data
This first requires finding the closest airport to each location.

In [6]:
# Load the latitude/longitude table for each location.
# Pickles are byte streams, so the file is opened in binary mode; text mode
# only happens to work on Python 2 / POSIX and breaks elsewhere.
with open('../pkl/01_latitude_longitude_google.pkl', 'rb') as fh:
    lat_long_data = dill.load(fh)
lat_long_data.head(1)

Unnamed: 0,location,latitude,longitude
0,Argentina-Buenos_Aires,-34.603684,-58.381559


In [7]:
# Load the airport table (codes, coordinates, country).
# Binary mode is required for pickle data; text mode only happens to work on
# Python 2 / POSIX and breaks elsewhere.
with open('../pkl/02_airport_information_fallingrain.pkl', 'rb') as fh:
    airport_info = dill.load(fh)
airport_info.head(1)

Unnamed: 0,city,FAA,IATA,ICAO,kind,latitude,longitude,max_runway,name,country,state
56,BAHIA BLANCA,,BHI,SAZB,Medium,-38.725,-62.169,8579.0,COMANDANTE ESPORA,Argentina,


The closest-airport approximation is crude: it treats differences in latitude and longitude directly as a distance instead of converting them to a true distance on the globe. Given the relatively short distances involved, I think this is fine for a first pass of this project.

In [8]:
# Broadcast every place against every airport: places run along axis 0,
# airports along axis 1, and (latitude, longitude) along the last axis.
coord_cols = ['latitude', 'longitude']
airport_coords = airport_info[coord_cols].values[np.newaxis, :, :]
places_coords = lat_long_data[coord_cols].values[:, np.newaxis, :]

# Squared difference in raw degrees; only the ordering matters for argmin.
dist_coords = np.square(places_coords - airport_coords).sum(axis=-1)
# Position (not label) of the nearest airport for each place.
min_coords = np.argmin(dist_coords, axis=1)

print(' '.join(str(arr.shape) for arr in
               (airport_coords, places_coords, dist_coords, min_coords)))

(1, 2062, 2) (1606, 1, 2) (1606, 2062) (1606,)


In [9]:
# Attach the nearest airport found above to every location row.
airport_cols = ['country', 'name', 'FAA', 'IATA', 'ICAO']
merge_data = lat_long_data.copy()

print(merge_data.shape)

# min_coords holds positions into airport_info; store the matching index labels.
merge_data['airport_index'] = airport_info.index[min_coords]

# Now grab the airport and location info by label, then relabel the rows so
# they line up with merge_data before the columns are assigned.
nearest_airports = (airport_info
                    .loc[merge_data['airport_index'], airport_cols]
                    .set_index(merge_data.index))
merge_data[airport_cols] = nearest_airports

print(merge_data.shape)

(1606, 3)
(1606, 9)


In [10]:
# Preview the first rows to check that each location picked up an airport.
merge_data.head()

Unnamed: 0,location,latitude,longitude,airport_index,country,name,FAA,IATA,ICAO
0,Argentina-Buenos_Aires,-34.603684,-58.381559,80,Argentina,AEROPARQUE JORGE NEWBERY,,AEP,SABE
1,Argentina-CABA,-34.603684,-58.381559,80,Argentina,AEROPARQUE JORGE NEWBERY,,AEP,SABE
2,Argentina-Cordoba,-31.420083,-64.188776,149,Argentina,AMBROSIO L V TARAVELLA,,COR,SACO
3,Argentina-Entre_Rios,-31.774665,-60.495646,398,Argentina,GENERAL URQUIZA,,PRA,SAAP
4,Argentina-Santa_Fe,-31.610658,-60.697294,527,Argentina,SAUCE VIEJO,,SFN,SAAV


In [140]:
# Checkpoint the location -> nearest-airport table via the project helper.
# NOTE(review): pkl_it comes from csv_pkl_sql; presumably it pickles under
# ../pkl/ using this name -- confirm against that module.
pkl_it(merge_data, '04_merged_latitude_longitude_airport_checkpoint')

Now combine with infection date data.

In [11]:
# Load the infection reports. Binary mode is required for pickle data; text
# mode only happens to work on Python 2 / POSIX and breaks elsewhere.
with open('../pkl/03_infection_data_initial_import.pkl', 'rb') as fh:
    infection_data = dill.load(fh)
# Only the report date and location are needed to decide what weather to scrape.
infection_data = infection_data[['report_date','location']]
infection_data.head(1)

Unnamed: 0,report_date,location
0,2016-03-19,Argentina-Buenos_Aires


In [12]:
print('{} {}'.format(infection_data.shape, merge_data.shape))

# Give every (report_date, location) pair the codes of its nearest airport;
# a left join keeps reports whose location has no airport match.
airport_lookup = merge_data[['location','country','FAA','IATA','ICAO']]
merge_all = (infection_data
             .merge(airport_lookup, on='location', how='left')
             .drop_duplicates())

print(merge_all.shape)

merge_all.head()

(107940, 2) (1606, 9)
(34442, 6)


Unnamed: 0,report_date,location,country,FAA,IATA,ICAO
0,2016-03-19,Argentina-Buenos_Aires,Argentina,,AEP,SABE
6,2016-03-19,Argentina-CABA,Argentina,,AEP,SABE
12,2016-03-19,Argentina-Catamarca,Argentina,,CTC,SANC
18,2016-03-19,Argentina-Chaco,Argentina,,RES,SARE
24,2016-03-19,Argentina-Chubut,Argentina,,REL,SAVT


Now scrape from Weather Underground. I want time-shifted data, so I also need the weather from one and two weeks before each report date.

In [13]:
# Build one row per (airport, date) to scrape: the report date itself plus the
# dates 7 and 14 days earlier.
airport_keys = ['country', 'IATA', 'ICAO']

weather_scrape = (merge_all[['report_date'] + airport_keys]
                  .drop_duplicates()
                  .set_index(airport_keys)
                  .assign(report_date1=lambda frame: frame['report_date'] - timedelta(days=7),
                          report_date2=lambda frame: frame['report_date'] - timedelta(days=14))
                  # Wide -> long: the three date columns become three rows per airport.
                  .stack()
                  # Drop the stacked level that only names the source column.
                  .reset_index(level=-1, drop=True)
                  .reset_index()
                  .rename(columns={0: 'report_date'})
                  # An airport with neither code cannot be looked up.
                  .dropna(subset=['IATA', 'ICAO'], how='all')
                  )

weather_scrape.shape

(15060, 4)

In [14]:
# def scrape_weekly_weather(df_row):
#     # Scrape the weekly data table
#     url_fmt = 'https://www.wunderground.com/history/airport/{}/{}/{}/{}/WeeklyHistory.html'
    
#     try:
#         url = url_fmt.format(df_row.ICAO, df_row.report_date.year, 
#                              df_row.report_date.month, df_row.report_date.day)
#     except:
#         url = url_fmt.format(df_row.IATA, df_row.report_date.year, 
#                              df_row.report_date.month, df_row.report_date.day)
    
#     try:
#         table = pd.read_html(url)[0].dropna(subset=['Max','Avg','Min','Sum'], how='all')
#         table.columns = ['Measurement','Max','Avg','Min','Sum']
#         table.set_index('Measurement', inplace=True)
#         table = table.stack()
#     except:
#         table = pd.Series({'NULL':np.NaN}, index=pd.Index([0]))
    
#     return table

def scrape_weekly_weather(date, df_row):
    """Scrape the Weather Underground weekly-history table for one airport.

    Parameters
    ----------
    date : object with ``year``, ``month`` and ``day`` attributes
        Day identifying the week to request.
    df_row : row-like object with ``ICAO`` and ``IATA`` attributes
        Airport codes; ICAO is preferred and IATA is used when ICAO is missing.

    Returns
    -------
    pandas.Series
        The table stacked into a Series indexed by (Measurement, column name).
        If the page cannot be fetched or parsed, a one-element Series holding
        NaN is returned instead, so one bad page does not abort a long scrape.
    """
    url_fmt = 'https://www.wunderground.com/history/airport/{}/{}/{}/{}/WeeklyHistory.html'

    # str.format never raises on a missing code (NaN is rendered as "nan"),
    # so the fallback to IATA has to be an explicit null check.
    airport_code = getattr(df_row, 'ICAO', None)
    if pd.isnull(airport_code):
        airport_code = df_row.IATA
    url = url_fmt.format(airport_code, date.year, date.month, date.day)

    try:
        table = pd.read_html(url)[0].dropna(subset=['Max','Avg','Min','Sum'], how='all')
        table.columns = ['Measurement','Max','Avg','Min','Sum']
        table = table.set_index('Measurement').stack()
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts.
        table = pd.Series({'NULL': np.nan}, index=pd.Index([0]))
    finally:
        # Throttle every request, including failed ones, to stay polite.
        time.sleep(1.0)

    return table

In [15]:
# Distinct dates (ascending) and distinct airport code pairs to scrape.
unique_dates = weather_scrape['report_date'].sort_values().unique()
date_list = pd.DatetimeIndex(unique_dates)
airport_list = weather_scrape.loc[:, ['ICAO', 'IATA']].drop_duplicates()

In [16]:
# (number of dates, number of airports, total pages the scrape will request)
date_list.shape[0], airport_list.shape[0], date_list.shape[0] * airport_list.shape[0]

(134, 258, 34572)

In [None]:
# Scrape one weekly table per (date, airport) pair and checkpoint each date's
# results to its own pickle, so a crash loses at most one date of work.
for ndate, date in enumerate(date_list):

    print(ndate)
    df_list = list()

    for row, dat in airport_list.iterrows():

        try:
            df = scrape_weekly_weather(date, dat)
        except Exception:
            # Keep a NaN placeholder for this airport. Catching Exception
            # rather than everything lets Ctrl-C stop the long-running loop.
            df = pd.Series({'NULL': np.nan}, index=pd.Index([row]))

        # dat.name is the airport_list row label, used later to restore the codes.
        df_list.append((date, dat.name, df))

    # Pickle data is binary, so the file must be opened with 'wb'.
    with open('../pkl/df_list{}.pkl'.format(ndate), 'wb') as fh:
        dill.dump(df_list, fh)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46


In [125]:
def clean_weather_data(entry):
    """Turn one scraped weekly table into a tidy numeric frame.

    Parameters
    ----------
    entry : tuple
        ``(date, airport_index, table)`` where ``table`` is a Series indexed
        by (measurement, column type) whose values are the scraped strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``measurement`` and ``value`` (float), indexed by a MultiIndex
        named ``('date', 'index')`` that repeats the entry's date and airport
        index on every row. Only temperature, dew point, precipitation and
        wind rows are kept: the ``Sum`` for precipitation and the ``Avg`` for
        everything else.
    """
    date, airport_index, table = entry

    index = pd.MultiIndex.from_tuples([(date, airport_index)] * len(table),
                                      names=['date', 'index'])

    df = pd.DataFrame(table.reset_index().values,
                      index=index,
                      columns=['measurement', 'type', 'value'])

    wanted = df['measurement'].isin(['Max Temperature', 'Mean Temperature',
                                     'Min Temperature', 'Dew Point',
                                     'Precipitation', 'Wind'])
    is_precip = df['measurement'] == 'Precipitation'

    # Precipitation is summarised by its weekly Sum, every other measurement
    # by its Avg.
    keep = wanted & ((is_precip & (df['type'] == 'Sum')) |
                     (~is_precip & (df['type'] == 'Avg')))
    df = df.loc[keep].drop(['type'], axis=1)

    # Pull out the first signed decimal number. A lone '-' placeholder has no
    # digits and becomes NaN. The sign is kept; stripping every '-' first
    # would silently turn negative readings positive.
    df['value'] = (df['value']
                   .str.extract(r"(-?[0-9]*\.?[0-9]+)", expand=False)
                   .astype(float)
                   )

    return df

df_clean = list()

# The scraping loop wrote one checkpoint file per entry of date_list.
for i in range(len(date_list)):
    # Pickle data is binary, so the file must be opened with 'rb'.
    with open('../pkl/df_list{}.pkl'.format(i), 'rb') as fh:
        df_list = dill.load(fh)

    # Each entry is (date, airport_list row label, scraped Series); failed
    # scrapes were stored as an all-NaN placeholder and are skipped here.
    for entry in df_list:
        if not entry[2].isnull().all():
            df_clean.append(clean_weather_data(entry))

In [126]:
# Stack the per-(date, airport) frames into one long table.
weather_combined = pd.concat(df_clean, axis=0)
weather_combined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,measurement,value
date,index,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-11-14,0,Max Temperature,76.0
2015-11-14,0,Mean Temperature,70.0
2015-11-14,0,Min Temperature,63.0
2015-11-14,0,Dew Point,58.0
2015-11-14,0,Precipitation,3.31


In [139]:
# Swap the airport_list row label carried in the 'index' level for that
# airport's ICAO/IATA codes, leaving 'date' as an ordinary column.
weather_combined = (weather_combined
                    .reset_index(level=-1)
                    .merge(airport_list, left_on='index', right_index=True)
                    .drop(['index'], axis=1)
                    .reset_index())

In [None]:
# Persist the combined weekly weather table via the project helper.
# NOTE(review): save_it comes from csv_pkl_sql; the output formats and
# location are defined there -- confirm before relying on them.
save_it(weather_combined, '04_weekly_weather')