##Supervised Learning

The purpose of this section is to build a KNN classifier using the clustering information we previously generated, then validate the classifier using similar data fom another country (France).

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import csv
from time import time

####Import US data

In [3]:
clusters = pd.read_csv('dfClusters_kMeans.csv')
us_year = pd.read_csv('YearDat.csv')
us_season = pd.read_csv('SeasonDat.csv')
us_month = pd.read_csv('MonthDat.csv')

####Import French data

In [4]:
france_year = pd.read_csv('FranceYearDat.csv')
france_season = pd.read_csv('FranceSeasonDat.csv')
france_month = pd.read_csv('FranceMonthDat.csv')

####Create dataframe with additional data (other than 6 pollutants) for clustered CBSAs

In [16]:
us_year.rename(columns={'CBSA Name': 'CBSA'}, inplace=True)
clustersonly = clusters[['CBSA','Cluster']]

us_yr_clusters = pd.concatenate(clustersonly, us_year, on='CBSA')

AttributeError: 'module' object has no attribute 'concatenate'

In [6]:
us_yr_clusters.head()

Unnamed: 0,CBSA,Cluster,Latitude,Longitude,PM25_FRMFEM,AQI_PM25_FRMFEM,PM25_NOFRMFEM,AQI_PM25_NOFRMFEM,PM10,AQI_PM10,...,pop_weight2010,alone,alone_frac,carpool,carpool_frac,other,other_frac,pt,pt_frac,total_trans
0,"Baton Rouge, LA",0,30.426074,-91.197545,10.523564,41.817916,10.600909,40.076649,27.801724,25.597701,...,1603.3,305593,0.83271,37259,0.101527,20226,0.055114,3908,0.010649,366986
1,"Chicago-Naperville-Joliet, IL-IN-WI",0,41.793944,-87.583018,12.822971,46.264507,12.289887,46.061675,23.560028,21.712074,...,8613.4,3130329,0.710939,379176,0.086116,395059,0.089723,498529,0.113222,4403093
2,"Davenport-Moline-Rock Island, IA-IL",0,41.525174,-90.600853,12.234181,45.526958,11.977756,44.955169,32.308116,28.661623,...,2218.6,154028,0.850861,14171,0.078282,11310,0.062477,1517,0.00838,181026
3,"Denver-Aurora, CO",0,39.70886,-104.968822,7.398333,30.070455,8.764516,34.722581,26.029324,24.059642,...,4803.7,977259,0.759138,118034,0.091689,133310,0.103556,58725,0.045618,1287328
4,"El Centro, CA",0,32.786175,-115.520073,9.472488,36.763158,10.295082,40.491803,32.501661,29.747508,...,3062.6,44451,0.792932,5815,0.10373,5090,0.090797,703,0.01254,56059


In [7]:
us_yr_clusters.columns

Index([u'CBSA', u'Cluster', u'Latitude', u'Longitude', u'PM25_FRMFEM',
       u'AQI_PM25_FRMFEM', u'PM25_NOFRMFEM', u'AQI_PM25_NOFRMFEM', u'PM10',
       u'AQI_PM10', u'TEMP', u'PRESS', u'RH', u'DP', u'WIND', u'CO', u'AQI_CO',
       u'NO2', u'AQI_NO2', u'OZONE', u'AQI_OZONE', u'SO2', u'AQI_SO2', u'id',
       u'MetArea', u'pop2010', u'landarea', u'popdense2010', u'pop_weight2010',
       u'alone', u'alone_frac', u'carpool', u'carpool_frac', u'other',
       u'other_frac', u'pt', u'pt_frac', u'total_trans'],
      dtype='object')

In [10]:
france_year.columns

Index([u'CO', u'NO2', u'OZONE', u'PM10', u'PM25', u'SO2', u'city', u'id',
       u'population', u'popdense', u'area', u'other_frac', u'car_frac',
       u'pt_frac', u'Car', u'PT', u'Other', u'Precip', u'Pressure', u'RH',
       u'Site', u'Temp', u'Wind'],
      dtype='object')

In [11]:
france_year.head()

Unnamed: 0,CO,NO2,OZONE,PM10,PM25,SO2,city,id,population,popdense,...,pt_frac,Car,PT,Other,Precip,Pressure,RH,Site,Temp,Wind
0,0.264047,13.533663,0.022437,24.65925,17.603,0.62799,Nancy,FR30036,434479,613.363951,...,0.12,221584.29,52137.48,160757.23,734.1,1014.182698,,071800-99999-2010,49.645806,5.602288
1,0.291703,13.180851,0.025806,22.641333,16.303,0.374618,Caen,FR21001,403633,667.231583,...,,,,,709.9,1014.60593,,070270-99999-2010,49.413654,7.613426
2,,10.551064,0.022737,25.435,,0.246565,Charleville-Mezieres,FR14051,106835,334.30571,...,,,,,796.0,1014.537853,,070750-99999-2010,48.359738,4.522086
3,,13.289894,0.025374,,19.359,1.301908,Valence,FR36002,175636,253.75791,...,,,,,899.9,1013.618678,,075770-99999-2010,55.34566,6.962982
4,,9.625,0.028092,21.006,,,Cholet,FR23078,104917,483.925584,...,,,,,,,,,,


In [12]:
clusters.columns

Index([u'CBSA', u'CO', u'Cluster', u'NO2', u'OZONE', u'PM10', u'PM25_FRMFEM',
       u'SO2'],
      dtype='object')

Some of the French columns have different names compared to the US columns. Rename these.

In [13]:
france_year.rename(columns={'population':'pop2010', 'popdense':'popdense2010', 'area':'landarea', 
                           'car_frac':'carpool_frac', 'Car':'carpool', 'PT':'pt', 'Pressure':'PRESS',  
                           'Wind':'WIND', 'Temp':'TEMP'}, inplace=True)

####Compute similarity using Pearson correlation within clusters

In [14]:
from scipy.stats.stats import pearsonr
def pearsonr(train, test):
    traincols = [train['TEMP'], train['PRESS'], train['RH'], train['WIND'], train['pop2010'], train['landarea'],
                 train['popdense2010'], train['carpool_frac'], train['other_frac'], train['pt_frac']]
    testcols = [test['TEMP'], test['PRESS'], test['RH'], test['WIND'], test['pop2010'], test['landarea'],
                test['popdense2010'], test['carpool_frac'], test['other_frac'], test['pt_frac']]
    sim = pearsonr(traincols, testcols)[0]
    return sim

####Get similarity measure for each combination of French and US cities

In [15]:
df = []
for ind, frenchcity in france_year.iterrows():
    for ind, uscity in us_yr_clusters.iterrows():
        sim = pearsonr(uscity, frenchcity)
        fname = frenchcity['city']
        usname = uscity['CBSA']
        a = {'French City':fname, 'US CBSA':usname, 'pearson sim':sim}
        df.append(a)

TypeError: list indices must be integers, not str

In [None]:
df_sims = pd.DataFrame(df, columns=('French City', 'US CBSA', 'pearson sim'))

In [None]:
df_sims.shape

####Get the k nearest US neighbors of the French cities

In [None]:
from operator import itemgetter
def knearest(frenchcity, uscities, dbase, k=5):
    neighbors=[]
    for uscity in uscities:
        sim = dbase.get(frenchcity, uscity)
        simdist=(1. - sim)/2. 
        neighbors.append((uscity, simdist))
    neighbors=sorted(neighbors, key = operator.itemgetter(1)) 
    return neighbors[0:k]

####Predict cluster based on k nearest neighbors

In [None]:
def frenchclust(neighbors):
    votes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in votes:
            votes[response] += 1
        else:
            votes[response] = 1
    sortvotes = sorted(votes.iteritems(), key = operator.itemgetter(1), reverse=True)
    return sortedvotes[0][0]

In [None]:
for frenchcity in franceyear.iterrows():
    f_neighbors = knearest(frenchcity, us_yr_clusters['CBSA'], df_sims, k=5)
    f_clusters = frenchclust(f_neighbors)
    french_clusts = dict(zip(frenchcity, f_clusters))

####Examine profiles of French clusters

In [None]:
french_clusts

In [None]:
for x in len(range(3))
if french_clusts[]