## store specified station data in a database

In [37]:
import pandas as pd
import numpy as np

def append_row(data_frame, row_items):
    '''Add a new row to a data frame'''

    item_array = np.array(row_items).reshape(1,len(data_frame.columns))
    
    item_data = pd.DataFrame(item_array,columns=data_frame.columns) 
    
    return pd.concat([data_frame, item_data], axis=0)


def insert_line(line, data, station):
    ''' Read the data entries on a given line in the text file
        return: A list containing the entries on the line
    '''
    cols = line.strip().split(" ")
    items = [station]
    
    for item in cols:
        if item!='':
            items.append(item)
            if len(items)>=8:
                break
    
    # DEBUGGING
    if len(items)<8:
        print(station, items)
        
    # make sure the row item matches the dataframe size
    if len(items)==len(data.columns):
        data = append_row(data, items)   
        
    return data

In [38]:
def get_names(name_list):
    '''Retrieves and cleans list of station names'''
    names = open(name_list, "r").readlines()
    clean_names = []
    
    for name in names:
        clean_names.append(name.replace("\n", ""))
        
    return clean_names

In [39]:
def add_station(station, data):
    '''Adds the data of a specified station to a dataframe'''
    
    lines = open(station+".txt").readlines()
    
    for line in range(8, len(lines)):
        data = insert_line(lines[line], data, station)
        
    return data

## Load data from all stations into a dataframe

In [40]:
column_names = ["station_name", "year", "month", "tmax_degC", "tmin_degC", "af_days", "rain_mm", "sun_hours"]
data = pd.DataFrame(columns=column_names)

stations = get_names("stations.txt")
#count = 0

for station in stations:
    data = add_station(station, data)
    print("successfully added:", station)
    
    #count += 1
    #if count > 2:
    #    break
data.shape

successfully added: aberporth
successfully added: armagh
successfully added: ballypatrick
successfully added: bradford
successfully added: braemar
successfully added: camborne
successfully added: cambridge
successfully added: cardiff
successfully added: chivenor
cwmystwyth ['cwmystwyth', 'Site', 'closed']
successfully added: cwmystwyth
successfully added: dunstaffnage
successfully added: durham
successfully added: eastbourne
successfully added: eskdalemuir
successfully added: heathrow
successfully added: hurn
successfully added: lerwick
successfully added: leuchars
lowestoft ['lowestoft', '1945', '3', '11.8', '4.1', '1', '35.8']
successfully added: lowestoft
successfully added: manston
successfully added: nairn
successfully added: newtonrigg
successfully added: oxford
successfully added: paisley
ringway ['ringway', 'Site', 'Closed']
successfully added: ringway
successfully added: rossonwye
successfully added: shawbury
successfully added: sheffield
southampton ['southampton', 'Site', 'C

successfully added: whitby
successfully added: wickairport
successfully added: yeovilton


(37086, 8)

Issues observed:
- no sun_hours data entries for some months in whitby.
- no sun_hours data entry for lowesoft 1945 march.
- cwmystwyth, ringway and southampton sites closed at some point.

about 37,000 data points in total

Need to clean and add skipped data points to database seperately

In [49]:
data = data.reindex()
data.head(10)

Unnamed: 0,station_name,year,month,tmax_degC,tmin_degC,af_days,rain_mm,sun_hours
0,aberporth,1941,2,---,---,---,69.1,---
0,aberporth,1941,3,---,---,---,76.2,---
0,aberporth,1941,4,---,---,---,33.7,---
0,aberporth,1941,5,---,---,---,51.3,---
0,aberporth,1941,6,---,---,---,25.7,---
0,aberporth,1941,7,---,---,---,53.9,---
0,aberporth,1941,8,---,---,---,91.8,---
0,aberporth,1941,9,---,---,---,25.5,---
0,aberporth,1941,10,---,---,---,106.2,---
0,aberporth,1941,11,---,---,---,92.3,---


In [50]:
data.tail(10)

Unnamed: 0,station_name,year,month,tmax_degC,tmin_degC,af_days,rain_mm,sun_hours
0,yeovilton,2018,7,25.9,13.5,0,13.8,242.3#
0,yeovilton,2018,8,22.7,12.8,0,53.8,138.5#
0,yeovilton,2018,9,19.6,9.4,0,36.4,149.0#
0,yeovilton,2018,10,15.4,6.2,4,51.8,132.8#
0,yeovilton,2018,11,12.1,5.3,4,102.6,73.8#
0,yeovilton,2018,12,11.0,5.3,2,94.2,27.2#
0,yeovilton,2019,1,7.8,1.0,14,33.8,53.2#
0,yeovilton,2019,2,11.8,1.9,10,47.2,111.4#
0,yeovilton,2019,3,12.3,4.8,2,66.0,110.0#
0,yeovilton,2019,4,14.8,4.7,2,49.4,155.2#


## Store data in a mongodb database server

In [45]:
from pymongo import MongoClient

In [51]:
def make_lineObj(line, data):
    
    #TBD cast to correct data types before storing in database
    return {
        "station_name" : data.station_name.iloc[line],
        "year" : data.year.iloc[line],
        "month" : data.month.iloc[line],
        "tmax_degC" : data.tmax_degC.iloc[line],
        "tmin_degC" : data.tmin_degC.iloc[line],
        "af_days" : data.af_days.iloc[line],
        "rain_mm" : data.rain_mm.iloc[line],
        "sun_hours" : data.sun_hours.iloc[line] 
    }

In [58]:
def storeObj(obj):
    collection = "weather_data"
    
    client = MongoClient(host="localhost", port=27017)
    db = client.pitds_weather_data
    weather_data = db[collection]
    
    weather_data.insert_one(obj)
    

In [59]:
for row in range(0, len(data)):
    storeObj(make_lineObj(row, data))
    
    # report every 100 entries
    if row%1000 == 0:
        print("stored entries:", row)

stored entries: 0
stored entries: 100
stored entries: 200
stored entries: 300
stored entries: 400
stored entries: 500
stored entries: 600
stored entries: 700
stored entries: 800
stored entries: 900
stored entries: 1000
stored entries: 1100
stored entries: 1200
stored entries: 1300
stored entries: 1400
stored entries: 1500
stored entries: 1600
stored entries: 1700
stored entries: 1800
stored entries: 1900
stored entries: 2000
stored entries: 2100
stored entries: 2200
stored entries: 2300
stored entries: 2400
stored entries: 2500
stored entries: 2600
stored entries: 2700
stored entries: 2800
stored entries: 2900
stored entries: 3000
stored entries: 3100
stored entries: 3200
stored entries: 3300
stored entries: 3400
stored entries: 3500
stored entries: 3600
stored entries: 3700
stored entries: 3800
stored entries: 3900
stored entries: 4000
stored entries: 4100
stored entries: 4200
stored entries: 4300
stored entries: 4400
stored entries: 4500
stored entries: 4600
stored entries: 4700
stor