# Basic analysis of data.

In today's class we will do some basic analysis of weather data.

Our source of data will be the <a href="http://climate.weather.gc.ca/historical_data/search_historic_data_e.html">Canadian Weather Database</a>.

Let's start by looking for the data from Victoria. 

Next, we will download the data and check that we have a "full set". 

In [14]:
## Let's start by loading all the weather data, storing it in a list of dictionaries. An element of 
## this list will consist of the weather file headers, turned into a dict-object.  

import datetime as dt
import os as os

## Read the whole daily-weather CSV into memory: `content` becomes a list with
## one string per line of the file (line endings are kept).
weather_path = "eng-daily-01012017-12312017.csv"
with open(weather_path, encoding="utf-8") as f:
    content = list(f)


In [6]:
## Quick look at what was read: the container type, then two sample lines.
print(type(content))
for sample_index in (0, 10):
    print(content[sample_index])

<class 'list'>
﻿"Station Name","VICTORIA INTL A"

"A","Accumulated"



In [12]:
## Lists versus dictionaries.

## A list is indexed by the integers 0, 1, 2, ...
L = "a ab abc".split()
print(L)
print(L[1])

## A dictionary is indexed by its keys instead.
D = dict(name="ryan", age="old")
print(D)
D["name"]  ## the value is looked up here, but nothing is done with it
print(D.keys())


['a', 'ab', 'abc']
ab
{'name': 'ryan', 'age': 'old'}
dict_keys(['name', 'age'])


In [18]:
## Line 25 of the file is used as the row of column names; every later line is
## treated as one day's measurements.
## NOTE: the fields are split on "," only, so keys and values keep their
## surrounding double quotes, and the last field of each line keeps its "\n".
keys = content[25].split(",")

## Turn the eng* file into a list of dictionary objects, one for every day.
## Each line is split once (not once per column) before being paired with the keys.
fdat = []
for line in content[26:]:
    fields = line.split(",")
    fdat.append({key: fields[j] for j, key in enumerate(keys)})
    

"Month"


In [21]:
## Summarise the parsed data: container type, number of records, then the first record.
for summary in (type(fdat), len(fdat)):
    print(summary)
print(fdat[0])

<class 'list'>
365
{'"Year"': '"2017"', '"Cool Deg Days Flag"': '""', '"Data Quality"': '"‡"', '"Dir of Max Gust (10s deg)"': '"2"', '"Min Temp Flag"': '""', '"Total Snow (cm)"': '"0.0"', '"Mean Temp Flag"': '""', '"Max Temp (°C)"': '"4.9"', '"Spd of Max Gust (km/h)"': '"54"', '"Max Temp Flag"': '""', '"Total Rain Flag"': '""', '"Heat Deg Days Flag"': '""', '"Total Rain (mm)"': '"1.4"', '"Total Precip (mm)"': '"1.4"', '"Snow on Grnd Flag"': '""', '"Min Temp (°C)"': '"-1.6"', '"Total Snow Flag"': '""', '"Date/Time"': '"2017-01-01"', '"Day"': '"01"', '"Spd of Max Gust Flag"\n': '""\n', '"Snow on Grnd (cm)"': '""', '"Dir of Max Gust Flag"': '""', '"Heat Deg Days (°C)"': '"16.3"', '"Cool Deg Days (°C)"': '"0.0"', '"Mean Temp (°C)"': '"1.7"', '"Month"': '"01"', '"Total Precip Flag"': '""'}


In [None]:
## Each weather-station dict will also have an additional weather-table key ("Data") holding a list of dict objects, 
## each one containing the data from one line of the file. 

import csv
import datetime as dt
import fnmatch as fn
import os as os
from operator import itemgetter

## Weather data is looked for in entries of the current directory whose names match
## the glob pattern "w.*".  Only real directories are kept, so a plain file that
## happens to match the pattern cannot make os.listdir() fail further down.
wsubdir = [d for d in fn.filter(os.listdir('.'), "w.*") if os.path.isdir(d)]
print("Weather subdirectories:", wsubdir)

## Each file has the following layout:
##   1. several lines of the form "key","data", then a blank line
##   2. "Legend" followed by several (ignorable) lines, then a blank line
##   3. one line of column names ("keys") separated by commas, starting with "Date/Time"
##   4. the remaining lines hold the values of those keys, one line per measurement.


def parseWeatherFile(lines):
    """Parse the lines of one "eng-daily" CSV file into a station dict.

    Parameters:
        lines: an iterable of text lines (an open text file or a list of
            strings).  A byte-order mark anywhere in a line is removed.

    Returns:
        A dict holding every "key","data" pair found before the first blank
        line.  When a "Station Name" pair is seen, a "Data" entry is added:
        a list with one dict per measurement line, mapping column name to
        the value as a string.

    Raises:
        IndexError: a header line has fewer than two fields, or a measurement
            line has fewer fields than the column-name line.
        KeyError: measurement lines are present but no "Station Name" header
            line was seen.

    The lines are read with csv.reader, so quotes are removed and a comma
    inside a quoted field does not split the field.
    """
    station = {}
    columns = []
    blanks = 0  ## number of blank lines seen so far; tells us which section we are in
    for row in csv.reader(ln.replace('\ufeff', '') for ln in lines):
        if not row or row == ['']:
            blanks += 1
            continue
        if blanks == 0:
            ## header section: build the main dict
            station[row[0]] = row[1]
            if row[0] == "Station Name":
                station["Data"] = []
        elif blanks == 2:
            ## data section: the column names come first, then the measurements
            if row[0] == "Date/Time":
                columns = row
            else:
                station["Data"].append({name: row[i] for i, name in enumerate(columns)})
        ## any other section (the legend, or anything after a third blank line) is ignored
    return station


masterList = [] ## list of weather station dict objects

for wd in wsubdir:
    for wdf in fn.filter(os.listdir(wd), "eng-daily*.csv"):
        ## utf-8 matches how the file is opened earlier; newline="" is what csv.reader expects.
        with open(os.path.join(wd, wdf), encoding="utf-8", newline="") as f:
            masterList.append(parseWeatherFile(f))

## Collect the distinct "Station Name" values found in masterList.
## The names are what the merge of masterList into one (more useful) list is keyed on.
sNames = set(map(itemgetter('Station Name'), masterList))
print("Station names:", sNames)

def indIfExists(LOD, keyname, value):
    """Return the index of the first dict in LOD whose keyname entry equals value.

    Parameters:
        LOD: a list of dicts.
        keyname: the key to look up in each dict.
        value: the value to compare each dict's entry against.

    Returns:
        The index of the first matching dict, or -1 if no dict matches.

    Raises:
        KeyError: a dict examined before a match is found has no keyname entry.
    """
    for index, entry in enumerate(LOD):
        if entry[keyname] == value:
            return index
    return -1

## Drain masterList from the back.  A station that is already present in mList has its
## "Data" merged into the existing entry; a station seen for the first time is moved over.
mList = []
while masterList:
    station = masterList[-1]
    where = indIfExists(mList, 'Station Name', station['Station Name'])
    masterList.pop()
    if where >= 0:
        mList[where]["Data"].extend(station["Data"])
    else:
        mList.append(station)

## Run through the weather station data and convert each record from strings to
## appropriate objects: the "Date/Time" entry becomes a datetime object and the
## temperature / precipitation entries become floats.
## A record in which any of those numeric entries cannot be read as a float is dropped.
numericKeys = ('Min Temp (°C)', 'Total Precip (mm)',
               'Max Temp (°C)', 'Mean Temp (°C)')

for WS in mList:
    goodRecords = []
    for record in WS["Data"]:
        isGood = True
        ## only the values of existing keys are replaced, so iterating items() here is safe
        for key, value in record.items():
            if key == "Date/Time":
                record[key] = dt.datetime.strptime(value, "%Y-%m-%d")
            elif key in numericKeys:
                try:
                    record[key] = float(value)
                except ValueError:
                    ## e.g. an empty string where the measurement is missing
                    isGood = False
        if isGood:
            goodRecords.append(record)
    ## keep each station's records in chronological order
    goodRecords.sort(key=itemgetter("Date/Time"))
    WS["Data"] = goodRecords

## Now let's check for gaps.  For every station we walk through the (date-sorted) records
## and check that each day is the previous day plus one day.  If it is not, the current
## interval is printed and a new interval is started.
print("Data intervals: ")
for WS in mList:
    print(WS['Station Name'], end=" ")
    records = WS['Data']
    if not records:
        ## every record of this station was dropped, so there is no interval to report
        print("(no data)")
        continue
    i0 = 0 ## start of the current interval
    i1 = 0 ## end of the current interval
    ## records[i0..i1] always hold consecutive days; extend the interval while the
    ## next record is exactly one day later, otherwise report it and start again.
    while i1+1 != len(records):
        if records[i1+1]["Date/Time"] - records[i1]["Date/Time"] != dt.timedelta(days=1):
            ## not consecutive. Print the finished interval and move on.
            print(records[i0]["Date/Time"].date(), "--", records[i1]["Date/Time"].date(), "/ ", end='')
            i0 = i1+1
        i1 = i1+1
    print(records[i0]["Date/Time"].date(), "--", records[i1]["Date/Time"].date())
    
