In [3]:
from pymongo import MongoClient
import pandas as pd

def query_to_dataframe(query, database):
    '''
    custom function to create a dataframe of query results
    from the weather_data

    param: query - filter document handed unchanged to ``find``
    param: database - name used to select a collection inside the
                      ``pitds_weather_data`` database (despite its name,
                      this argument picks a collection, not a database)

    return: DataFrame with one row per matching document and the ``_id``
            column removed, or None when the query has no hits
    '''

    # Using the client as a context manager closes its connections on exit;
    # previously every call left a fresh MongoClient open.
    with MongoClient(host="midgard09", port=27017) as client:
        weather_data = client.pitds_weather_data[database]
        # Drain the cursor while the connection is still open.
        entries = list(weather_data.find(query))

    if not entries:  # no hits for the query
        return None
    return pd.DataFrame(entries).drop(columns=["_id"])

In [5]:
# Happiness figures: the empty filter {} is passed as the query for the whole collection
happiness = query_to_dataframe(query={}, database="happiness_data")

# 2018 weather observations, read from the cleaned spreadsheet
weather_2018 = pd.read_excel(io="cleaned_2018_data.xlsx")

# Station locations
# NOTE(review): the "cordinates" spelling presumably matches the stored collection name — confirm
stations = query_to_dataframe(query={}, database="station_cordinates")

In [38]:
# Preview the first rows of the happiness table loaded above
happiness.head()

Unnamed: 0,area_codes,area_names,avg_rating,high_7_8,latitude,longitude,low_0_4,medium_5_6,sample_size,vhigh_9_10
0,E12000001,NORTH EAST,7.34,39.06,55.0,-1.9,10.77,17.05,11310,33.13
1,E12000002,NORTH WEST,7.39,39.1,54.0,-2.6,10.17,16.82,19460,33.91
2,E12000003,YORKSHIRE AND THE HUMBER,7.41,38.56,53.6,-1.2,10.21,16.6,13110,34.63
3,E12000004,EAST MIDLANDS,7.51,38.62,53.0,-0.8,8.66,16.74,7880,35.99
4,E12000005,WEST MIDLANDS,7.43,41.72,52.5,-2.3,8.61,17.59,11940,32.08


In [10]:
# Preview the first rows of the 2018 weather spreadsheet
weather_2018.head()

Unnamed: 0,af_days,month,rain_mm,station_name,sun_hours,tmax_degC,tmin_degC,year
0,0,1,117.2,aberporth,53.5,8.4,3.8,2018
1,10,2,48.6,aberporth,91.3,6.3,1.3,2018
2,6,3,97.0,aberporth,100.6,7.7,2.3,2018
3,0,4,70.6,aberporth,152.2,12.1,6.4,2018
4,0,5,33.4,aberporth,219.4,15.3,8.1,2018


In [8]:
# Index the station table by station name and discard the "label" column.
# set_index consumes the "station" column, so running this cell a second time
# on the already-indexed frame used to raise KeyError; the guard turns a
# re-run into a no-op.
if stations.index.name != "station":
    stations = stations.set_index("station").drop(columns=["label"])
stations.head()

Unnamed: 0_level_0,latitude,longitude
station,Unnamed: 1_level_1,Unnamed: 2_level_1
aberporth,52.139,-4.57
armagh,54.352,-6.649
ballypatrick,55.181,-6.153
bradford,53.813,-1.772
braemar,57.006,-3.396


### Join stations and happiness data

In [24]:
import numpy as np

def square(x):
    '''Return x multiplied by itself.'''
    squared = x * x
    return squared

def get_shortest_distance(a, b):
    '''
    custom function to find the entry of b that lies closest to a.

    The distance is the plain Euclidean norm of the longitude and latitude
    differences, i.e. the coordinate values are treated as planar.

    param: a - object exposing ``longitude`` and ``latitude`` values
               (e.g. one row of a DataFrame)
    param: b - object exposing ``longitude`` and ``latitude`` Series
               (e.g. a DataFrame of candidate locations)

    return: the index label (not the position) of the closest entry in b
    raises: ValueError if b has no entries
    '''
    dx = b.longitude - a.longitude
    dy = b.latitude - a.latitude

    # Vectorised arithmetic replaces the per-element apply() calls, and
    # idxmin finds the smallest distance without sorting every entry.
    return np.sqrt(dx * dx + dy * dy).idxmin()
     
    
def get_area_index(station, stations, areas):
    '''
    custom function to identify the area to which a given station belongs.
    The function chooses the area that has the least straight line distance from
    the station. The straight line distance between a station and an area is
    determined using the coordinates of the station and the coordinates of the area.

    param: station - the name of the station of interest (a label in stations.index)
    param: stations - the locations of all stations, indexed by station name
    param: areas - the locations of all areas

    return: the index label of the closest area in areas
    '''
    from_location = stations.loc[station]

    # get_shortest_distance already yields an index *label*. Feeding it back
    # through areas.index[...] treated it as a position, which is only correct
    # for a default 0..n-1 index and fails (or picks the wrong row) otherwise.
    return get_shortest_distance(a=from_location, b=areas)


def get_nearest_area(station, stations, happiness):
    '''
    custom function to look up the area code of the area closest to a station.

    param: station - the name of the station of interest
    param: stations - the locations of all stations
    param: happiness - table with ``area_codes``, ``latitude`` and
                       ``longitude`` columns, one row per area

    return: the ``area_codes`` value of the row selected by get_area_index
    '''
    # Selecting columns builds a new frame and nothing below writes to it, so
    # the deep copy of the whole table on every call was unnecessary.
    areas = happiness[["area_codes", "latitude", "longitude"]]

    return happiness.area_codes.loc[get_area_index(station, stations, areas)]

In [29]:
# Spot-check the lookup for a single station
get_nearest_area("ballypatrick", stations, happiness)

'S92000003'

In [34]:
# Nearest happiness area for every station, in the order of stations.index
areas = [get_nearest_area(name, stations, happiness) for name in stations.index]

# Two-column table: station names next to the matched area codes
station_area = pd.concat(
    [
        pd.Series(stations.index),
        pd.Series(areas, name="area"),
    ],
    axis=1,
)
station_area

Unnamed: 0,station,area
0,aberporth,W92000004
1,armagh,S92000003
2,ballypatrick,S92000003
3,bradford,E12000003
4,braemar,S92000003
5,camborne,E12000009
6,cambridge,E12000006
7,cardiff,W92000004
8,chivenor,E12000009
9,cwmystwyth,W92000004


In [36]:
# Distinct area codes that were matched to at least one station
station_area.area.unique()

array(['W92000004', 'S92000003', 'E12000003', 'E12000009', 'E12000006',
       'E12000001', 'E12000007', 'E12000008', 'E12000002', 'E12000005',
       'E12000004'], dtype=object)