In [21]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('input/data_en.csv',
                 parse_dates=['Submission Date'],
                 dtype={
                     'Price': np.float64,
                     'Quantity (grams)': np.float64
                 })

In [3]:
df.head()

Unnamed: 0,Submission Date,Price,Quantity (grams),Quality,City (Census subdivision),Province,Primary Reason,Usage,Average Monthly Consumption (grams)
0,2018-01-26,170.0,28.0,High,conception bay south,NL,Medicate without a medical document,Daily,28.0
1,2018-01-02,85.0,28.0,High,portugal cove-st. philip's,NL,Use recreationally,Daily,112.0
2,2018-01-02,20.0,3.5,High,portugal cove-st. philip's,NL,Use recreationally,Daily,0.0
3,2018-01-25,15.0,1.0,Medium,torbay,NL,Medicate with a medical document,A few times per week,28.0
4,2018-02-02,15.0,1.0,High,torbay,NL,Use recreationally,A few times per week,28.0


In [4]:
def price_per_gram(price, grams):
    if price == 0 or grams == 0:
        return np.nan
    
    return price / grams

df['Price Per Gram'] = df.apply(
    lambda row: price_per_gram(row['Price'], row['Quantity (grams)']), axis=1)

In [5]:
df.tail()

Unnamed: 0,Submission Date,Price,Quantity (grams),Quality,City (Census subdivision),Province,Primary Reason,Usage,Average Monthly Consumption (grams),Price Per Gram
15610,2018-01-31,150.0,14.0,High,yellowknife,NT,Use recreationally,A few times per week,0.5,10.714286
15611,2018-02-02,10.0,1.0,High,yellowknife,NT,Medicate without a medical document,,0.0,10.0
15612,2018-01-30,100.0,7.0,Medium,iqaluit,NU,Use recreationally,Daily,28.0,14.285714
15613,2018-01-30,320.0,28.0,Medium,iqaluit,NU,Use recreationally,Daily,14.0,11.428571
15614,2018-01-25,35.0,7.0,Medium,kugluktuk,NU,Use recreationally,,0.0,5.0


In [6]:
df_groupby_province = df.groupby(['Province'])['Price Per Gram'].mean()

df_groupby_province

Province
AB     7.258903
BC     6.955510
MB     7.235913
NB     6.381788
NL     7.896240
NS     7.117300
NT    11.886364
NU    10.238095
ON     7.324025
PE     7.029095
QC     5.877720
SK     7.440028
YT     8.054545
Name: Price Per Gram, dtype: float64

In [7]:
df_groupby_city = df.groupby(['Province', 'City (Census subdivision)'])['Price Per Gram'].mean()
df_groupby_city.sort_values()

Province  City (Census subdivision)
ON        brant                         2.000000
QC        sainte-anne-des-plaines       2.500000
BC        salmo                         2.857143
QC        kazabazua                     2.857143
NB        richibucto                    2.857143
QC        saint-michel-du-squatec       2.857143
          east angus                    2.857143
          rivi�re-rouge                 2.857143
          sainte-victoire-de-sorel      2.857143
          saint-andr�-d'argenteuil      2.857143
          havelock                      2.857143
NB        hillsborough                  2.966667
QC        val-alain                     3.000000
          saint-mathieu-de-beloeil      3.214286
BC        nakusp                        3.392857
ON        strathroy-caradoc             3.455357
QC        val-des-bois                  3.570000
          pontiac                       3.571429
BC        new hazelton                  3.571429
QC        saint-�phrem-de-beauce 

In [8]:
df_groupby_city.to_csv('output/data-group-by-city.csv')

In [9]:
from mapbox import Geocoder

from keys import MAPBOX_ACCESS_TOKEN

geocoder = Geocoder(access_token=MAPBOX_ACCESS_TOKEN)

In [10]:
r = geocoder.forward('moosonee, QC', limit=1, country=['ca'])

data = r.json()

In [11]:
lat, lng = data['features'][0]['geometry']['coordinates']

In [13]:
cache = {}

In [17]:
def city_to_coordinates(city, province):
    if cache.get('{0}, {1}'.format(city, province)):
        return cache.get('{0}, {1}'.format(city, province))
    
    resp = geocoder.forward('{0}, {1}'.format(city, province), limit=1, country=['ca'])
    
    if not resp.ok:
        return np.nan
    
    try:
        cache['{0}, {1}'.format(city, province)] = ['features'][0]['geometry']['coordinates']
        return ['features'][0]['geometry']['coordinates']
    except:
        return np.nan
    

df['Coordinates'] = df.apply(
    lambda row: city_to_coordinates(
            row['City (Census subdivision)'], row['Province']), axis=1)

In [18]:
df.head()

Unnamed: 0,Submission Date,Price,Quantity (grams),Quality,City (Census subdivision),Province,Primary Reason,Usage,Average Monthly Consumption (grams),Price Per Gram,Coordinates
0,2018-01-26,170.0,28.0,High,conception bay south,NL,Medicate without a medical document,Daily,28.0,6.071429,
1,2018-01-02,85.0,28.0,High,portugal cove-st. philip's,NL,Use recreationally,Daily,112.0,3.035714,
2,2018-01-02,20.0,3.5,High,portugal cove-st. philip's,NL,Use recreationally,Daily,0.0,5.714286,
3,2018-01-25,15.0,1.0,Medium,torbay,NL,Medicate with a medical document,A few times per week,28.0,15.0,
4,2018-02-02,15.0,1.0,High,torbay,NL,Use recreationally,A few times per week,28.0,15.0,


In [24]:
df[['Latitude', 'Longitude']] = df.apply(lambda row: row['Coordinates'])

KeyError: ('Coordinates', 'occurred at index Submission Date')

In [None]:
df.plot(kind='scatter', x='Longitude', y='Latitude', alpha=0.4)
plt.show()