In [1]:
import numpy as np
import pandas as pd
import os

# Folder holding the per-location weather CSV files.
path2data = '../weatherdata'


def realpath(fname):
    """Return the path of ``fname`` inside the ``path2data`` folder."""
    full_path = os.path.join(path2data, fname)
    return full_path

### Cleaning and averaging weather data over one month periods.

Because the model only needs month-level time resolution, I average the daily weather data over each calendar month. The resulting dataset provides the features for the HomeRoots model.
Only the following columns are kept as features: ['mintempC', 'maxtempC', 'sunHour', 'cloudcover', 'humidity', 'precipMM', 'pressure', 'tempC', 'windspeedKmph'].

Weather data was sourced from the World Weather Online API using https://github.com/ekapope/WorldWeatherOnline

Import data and join all weather data from different locations

In [2]:
# Collect the CSV exports from the weather-data folder. os.listdir returns
# entries in arbitrary order, so sort them to keep the row order of the
# combined frame reproducible between runs and machines.
files = sorted(f for f in os.listdir(path2data) if f.endswith('.csv'))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f'No .csv files found in {path2data!r}')

# low_memory=False parses each file in one pass, which avoids mixed-type
# inference within a column.
dflist = [pd.read_csv(realpath(x), low_memory=False) for x in files]
# Stack all locations into one frame with a fresh 0..N-1 index.
mybigdf = pd.concat(dflist, ignore_index=True)


In [3]:
# Preview the first rows of the combined frame.
mybigdf.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,location
0,2010-01-01,19,17,0.0,9.1,4,100,07:44 PM,04:57 AM,04:57 AM,...,6,93,96,4.4,1013,19,5,96,4,"-32.785,149.554"
1,2010-01-02,26,18,0.0,12.3,6,85,08:30 PM,06:11 AM,04:58 AM,...,11,62,89,0.7,1009,26,5,249,7,"-32.785,149.554"
2,2010-01-03,26,17,0.0,12.3,4,77,09:10 PM,07:25 AM,04:58 AM,...,13,69,84,0.3,1014,26,7,100,9,"-32.785,149.554"
3,2010-01-04,24,16,0.0,12.3,5,70,09:46 PM,08:36 AM,04:59 AM,...,16,53,79,0.2,1017,24,10,83,12,"-32.785,149.554"
4,2010-01-05,27,15,0.0,14.1,6,63,10:18 PM,09:46 AM,05:00 AM,...,11,37,81,1.7,1016,27,7,198,7,"-32.785,149.554"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452790,2020-05-28,21,10,0.0,16.7,5,39,10:03 AM,01:41 AM,04:53 AM,...,19,20,77,0.0,1034,21,10,145,12,ST55BJ
452791,2020-05-29,20,10,0.0,16.7,5,47,11:23 AM,02:10 AM,04:52 AM,...,21,10,70,0.0,1028,20,10,126,14,ST55BJ
452792,2020-05-30,20,8,0.0,16.6,4,54,12:46 PM,02:34 AM,04:51 AM,...,28,31,73,0.2,1025,20,10,109,18,ST55BJ
452793,2020-05-31,20,9,0.0,15.8,4,58,02:09 PM,02:54 AM,04:50 AM,...,31,24,71,0.5,1025,20,10,103,20,ST55BJ


In [4]:
def date_month_appender(df, date_key, date_format):
    """Parse a date column and add calendar helper columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    date_key : str
        Name of the column holding the dates to parse.
    date_format : str
        Format string passed to ``pd.to_datetime`` (for example '%Y-%m-%d').

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the columns 'date_time' (parsed
        timestamps), 'month' (month number), 'month_year' (monthly period
        rendered as a string) and 'year' (integer year).
    """
    # The parsed dates always land in 'date_time', whatever ``date_key`` is.
    df['date_time'] = pd.to_datetime(df[date_key], format=date_format)
    stamps = pd.to_datetime(df['date_time'])

    derived = {
        'month': pd.DatetimeIndex(stamps).month,
        'month_year': stamps.dt.to_period('M').astype(str),
        # The yearly period is rendered as text and then cast to an integer.
        'year': stamps.dt.to_period('Y').astype(str).astype(int),
    }
    for column, values in derived.items():
        df[column] = values

    return df

Function to average the weather data features over each month

In [5]:
def month_avger(data):
    """Average the daily weather features per location and calendar month.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily weather records. Must contain a 'date_time' column in
        '%Y-%m-%d' format, a 'location' column, and the nine feature columns
        listed in ``avg_feat`` below. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (location, month) pair that has data, with the nine
        averaged feature columns followed by 'zip' (the location label),
        'year', 'month' and 'month_year'. Missing feature values are skipped
        by the mean rather than propagated.

    Notes
    -----
    The distinct locations are printed as a side effect.
    """
    avg_feat = ['mintempC', 'maxtempC', 'sunHour', 'cloudcover', 'humidity',
                'precipMM', 'pressure', 'tempC', 'windspeedKmph']
    key_cols = ['location', 'year', 'month', 'month_year']
    out_cols = avg_feat + ['zip', 'year', 'month', 'month_year']

    # date_month_appender writes its columns in place, so hand it a copy to
    # leave the caller's frame untouched.
    daily = date_month_appender(data.copy(), 'date_time', '%Y-%m-%d')

    locs = daily['location'].unique()
    print(locs)

    # A single groupby yields exactly one averaged row per (location, month).
    # sort=False keeps groups in order of first appearance and avoids having
    # to order location labels that mix integers and strings.
    monthly = (
        daily
        .groupby(key_cols, sort=False)[avg_feat]
        .mean()
        .reset_index()
        .rename(columns={'location': 'zip'})
    )
    return monthly[out_cols]

Averaging over one month periods and saving dataset

In [6]:
# Compute the monthly averages for the combined frame; month_avger also prints the locations it found.
avgsub = month_avger(mybigdf)

['-32.785,149.554' '-33.754,151.068' '-35.319,149.127' '-37.796,144.961'
 '-37.802,145.031' '-37.857,145.068' '-41.568,146.053' '-41.755,147.069'
 10002 10006 10009 10011 10016 10025 10026 10027 10029 10030 10031 10032
 10034 10035 10036 10038 10039 10301 10310 10451 10452 10453 10454 10455
 10456 10457 10458 10459 10460 10462 10463 10467 10468 10469 10474 11101
 11102 11106 11203 11205 11206 11207 11208 11210 11211 11212 11213 11214
 11215 11216 11217 11218 11220 11221 11222 11223 11224 11225 11231 11233
 11234 11235 11237 11238 11355 11417 11433 11434 11691 12047 12054 12144
 12180 12182 12202 12203 12206 12208 12209 12210 12303 12306 12307 14213
 20001 28403 30058 33054 38105 38126 44906 50313 56560 '57.707,11.967'
 61104 63124 66044 68104 70085 70805 80214 86001 90804 90805 90806 90810
 90813 98115 98136 'K9H3A6' 'ST55BJ']


In [9]:
# Save the monthly averages. Create the output folder first so the cell also
# works on a fresh checkout where it does not exist yet.
out_path = '../data/avg_weather_data/avgsub4.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
avgsub.to_csv(out_path)