# Practicing with merge

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy "school" table: BEDS codes repeat across rows, and only the first six
# rows carry a value in 'Another Field' (the remaining four are NaN).
school_columns = {
    'BEDS Code': [0, 1, 2, 3, 4, 5, 1, 2, 3, 1],
    'Other Field': list('abcdefghij'),
    'Another Field': [10, 11, 12, 13, 14, 15] + [np.nan] * 4,
}
sample_school = pd.DataFrame(school_columns)

# Toy lat/lon lookup that only covers codes 1-3 and names its key column differently.
latlon_columns = {'SED CODE': [1, 2, 3], 'latlon': ['one', 'two', 'three']}
sample_latlon = pd.DataFrame(latlon_columns)

# Left join keeps every school row; codes absent from the lookup get NaN.
new = sample_school.merge(sample_latlon, how='left',
                          left_on=['BEDS Code'], right_on=['SED CODE'])
new

Unnamed: 0,Another Field,BEDS Code,Other Field,SED CODE,latlon
0,10.0,0,a,,
1,11.0,1,b,1.0,one
2,12.0,2,c,2.0,two
3,13.0,3,d,3.0,three
4,14.0,4,e,,
5,15.0,5,f,,
6,,1,g,1.0,one
7,,2,h,2.0,two
8,,3,i,3.0,three
9,,1,j,1.0,one


In [3]:
# Summarise the merge: how many distinct schools there are, and how many of
# them ended up without a lat/long match.
all_school_codes = new['BEDS Code'].unique()
print('... {} unique schools,'.format(len(all_school_codes)))

latlon_is_missing = new['latlon'].isnull()
schools_missing_latlon = new.loc[latlon_is_missing, 'BEDS Code'].unique()
print('... of which {} are missing lat/long'.format(len(schools_missing_latlon)))

... 6 unique schools,
... of which 3 are missing lat/long


# Practicing with filling na
GOAL: to use this to fill in missing boroughs

In [4]:
# Build a plain-dict view of the rows where 'Another Field' is populated;
# the result is {column name: {row label: value}} for the two columns kept.
has_another_field = new['Another Field'].notnull()
d = new.loc[has_another_field, ['BEDS Code', 'Another Field']].to_dict()
d

{'Another Field': {0: 10.0, 1: 11.0, 2: 12.0, 3: 13.0, 4: 14.0, 5: 15.0},
 'BEDS Code': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}}

In [5]:
# Pair each BEDS code with the 'Another Field' value found on the same row label.
map_vals = {}
for idx, field_value in d['Another Field'].items():
    map_vals[d['BEDS Code'][idx]] = field_value
map_vals

{0: 10.0, 1: 11.0, 2: 12.0, 3: 13.0, 4: 14.0, 5: 15.0}

In [6]:
# Look each row's BEDS code up in map_vals to populate a new column.
beds_codes = new['BEDS Code']
new['A Third Field'] = beds_codes.map(map_vals)

In [7]:
# What if the map dict had been missing values?
# Rebuild the code -> value mapping from `d`, then remove the entry for code 1.
other_map = {}
for idx in d['Another Field']:
    other_map[d['BEDS Code'][idx]] = d['Another Field'][idx]
other_map.pop(1)
other_map

{0: 10.0, 2: 12.0, 3: 13.0, 4: 14.0, 5: 15.0}

In [8]:
# I see ... it will leave NaNs
# (codes that have no entry in other_map come back as NaN in the new column).
code_column = new['BEDS Code']
new['A Fourth Field'] = code_column.map(other_map)

In [9]:
# Can I map without the intermediate dictionary? Yes -- but with care.
# Series.map(other_series) looks each value up in other_series' *index*, so the
# lookup has to be indexed by BEDS Code. Mapping straight through
# new['Another Field'] only appears to work because its row labels 0-5 happen
# to coincide with the BEDS codes in this toy data.
beds_to_another_field = (
    new.dropna(subset=['Another Field'])      # keep rows that actually carry a value
       .drop_duplicates(subset='BEDS Code')   # map() needs a uniquely indexed lookup
       .set_index('BEDS Code')['Another Field']
)
new['A Fifth Field'] = new['BEDS Code'].map(beds_to_another_field)
new

Unnamed: 0,Another Field,BEDS Code,Other Field,SED CODE,latlon,A Third Field,A Fourth Field,A Fifth Field
0,10.0,0,a,,,10,10.0,10
1,11.0,1,b,1.0,one,11,,11
2,12.0,2,c,2.0,two,12,12.0,12
3,13.0,3,d,3.0,three,13,13.0,13
4,14.0,4,e,,,14,14.0,14
5,15.0,5,f,,,15,15.0,15
6,,1,g,1.0,one,11,,11
7,,2,h,2.0,two,12,12.0,12
8,,3,i,3.0,three,13,13.0,13
9,,1,j,1.0,one,11,,11


## Practicing for distance join

In [10]:
# Creating mock data frames
# Schools: five BEDS codes, each with its own location, listed for 2010 and again for 2011.
school_mdf = pd.DataFrame({'BEDS': list(range(1, 6)) * 2,
                           'Year': [year for year in (2010, 2011) for _ in range(5)],
                           'Loc': list(range(10, 15)) * 2})

# Felonies: sixteen crimes with ids 'a1'..'d4', alternating years, one tally each.
crime_ids = [letter + digit for letter in 'abcd' for digit in '1234']
felony_mdf = pd.DataFrame({'CrimeID': crime_ids,
                           'Year': [2010, 2011] * 8,
                           'Loc': [11, 11, 12, 13, 13, 14, 10, 14, 12, 11, 11, 10, 14, 12, 13, 12],
                           'Tally': [1] * len(crime_ids)})

In [11]:
# to minimize searching we'll identify location by year
# (one group per BEDS code / year pair).
school_year_keys = ['BEDS', 'Year']
grouped = school_mdf.groupby(school_year_keys)

In [12]:
# Attach to every school row the felonies recorded for the same year and location.
#
# Rather than pre-creating empty columns with a dtype-less pd.Series() and then
# re-filtering the whole felony table once per (BEDS, Year) group, summarise the
# felonies once per (Year, Loc) and left-join that summary onto the schools.
# This uses each row's own Year/Loc, which matches the per-group lookup as long
# as a school's location is constant within a year (true for this mock data).
felonies_by_year_loc = felony_mdf.groupby(['Year', 'Loc'])
felony_summary = pd.DataFrame({
    # stringified list of the distinct crime ids, in order of first appearance
    'CrimeIDS': felonies_by_year_loc['CrimeID'].apply(
        lambda ids: str(ids.unique().tolist())),
    'CrimeTally': felonies_by_year_loc['Tally'].sum(),
}).reset_index()

school_mdf = (
    school_mdf
    # drop results from a previous run so re-executing the cell stays idempotent
    .drop(columns=['CrimeIDS', 'CrimeTally'], errors='ignore')
    .merge(felony_summary, on=['Year', 'Loc'], how='left')
)

# Schools with no matching felony get an empty id list and a zero tally.
school_mdf['CrimeIDS'] = school_mdf['CrimeIDS'].fillna('[]')
school_mdf['CrimeTally'] = school_mdf['CrimeTally'].fillna(0).astype(int)

school_mdf

Unnamed: 0,BEDS,Loc,Year,CrimeIDS,CrimeTally
0,1,10,2010,['b3'],1
1,2,11,2010,"['a1', 'c3']",2
2,3,12,2010,"['a3', 'c1']",2
3,4,13,2010,"['b1', 'd3']",2
4,5,14,2010,['d1'],1
5,1,10,2011,['c4'],1
6,2,11,2011,"['a2', 'c2']",2
7,3,12,2011,"['d2', 'd4']",2
8,4,13,2011,['a4'],1
9,5,14,2011,"['b2', 'b4']",2


### Practicing the update method (on a Series)

In [15]:
# Start every felony category at a zero count.
felony_categories = ['GRAND LARCENY', 'ROBBERY', 'BURGLARY', 'FELONY ASSAULT',
                     'RAPE', 'GRAND LARCENY OF MOTOR VEHICLE',
                     'MURDER & NON-NEGL. MANSLAUGHTE']
counts = pd.Series(dict.fromkeys(felony_categories, 0))

In [18]:
# Check what kind of object value_counts() hands back for the BEDS column.
beds_value_counts = school_mdf['BEDS'].value_counts()
type(beds_value_counts)

pandas.core.series.Series

In [19]:
# A partial set of categories, each carrying a count of 2.
updated_categories = ['GRAND LARCENY', 'ROBBERY', 'BURGLARY']
other_counts = pd.Series(dict.fromkeys(updated_categories, 2))

In [20]:
# Overwrite the matching categories in `counts` with the values from `other_counts`.
# The call's result is not assigned: `counts` itself is changed, as the next
# cell's display shows.
counts.update(other_counts)

In [21]:
# Display `counts` to inspect the effect of the preceding update() call.
counts

BURGLARY                          2
FELONY ASSAULT                    0
GRAND LARCENY                     2
GRAND LARCENY OF MOTOR VEHICLE    0
MURDER & NON-NEGL. MANSLAUGHTE    0
RAPE                              0
ROBBERY                           2
dtype: int64

In [22]:
# Is it OK to update with an empty Series? With nothing to copy over,
# `counts` should come through unchanged.
# The empty Series is given an explicit dtype: a bare pd.Series() relies on a
# default dtype that differs between pandas versions and warns on some of them.
counts.update(pd.Series(dtype=counts.dtype))

In [23]:
# How about running value_counts on an empty Series?
# An explicit dtype avoids depending on the version-specific default (and the
# accompanying warning) of a bare pd.Series().
empty_value_counts = pd.Series(dtype='float64').value_counts()
empty_value_counts

Series([], dtype: int64)