<a href="https://colab.research.google.com/github/maxrgnt/pythdc2-project2/blob/master/cleanBorderData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Panel Data
import pandas as pd
# # System folders
import os
from pathlib import Path

In [3]:
def abbreviate(stateName):
    abrvDict = {'Alaska':'AL',
               'Arizona':'AZ',
               'California':'CA',
               'Idaho':'ID',
               'Maine':'ME',
               'Michigan':'MI',
               'Minnesota':'MN',
               'Montana':'MT',
               'New Mexico':'NM',
               'New York':'NY',
               'North Dakota':'ND',
               'Ohio':'OH',
               'Texas':'TX',
               'Vermont':'VT',
               'Washington':'WA'}
    abrv = ''
    if stateName in abrvDict:
        abrv = abrvDict[stateName]
    return abrv

def safeDrop(df, cols):
    print(df.shape)
    for col in df.columns:
        if col in cols:
            print(f'removing: {col}')
            df.drop([col], axis=1, inplace=True)
    print(df.shape)
    return df

## Border Data

In [217]:
dataPath = Path.joinpath(Path.cwd(),'data','borderCrossing.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,Abrv,State,Longitude,Latitude,Border,Date,Measure,Value
63294,MN,Minnesota,-94.59,48.72,US-Canada Border,4/1/2006 0:00,Train Passengers,313
51520,TX,Texas,-99.51,27.5,US-Mexico Border,6/1/2008 0:00,Personal Vehicle Passengers,1207569
48163,AZ,Arizona,-112.82,31.88,US-Mexico Border,2/1/2009 0:00,Personal Vehicle Passengers,74478


In [208]:
df['Measure'].value_counts()

Personal Vehicle Passengers    30196
Bus Passengers                 28820
Pedestrians                    28697
Train Passengers               27623
Name: Measure, dtype: int64

In [209]:
# Only interested in Passenger / Pedestrian crossings
people = df['Measure'].str.contains('Passengers|Pedestrians', case = False)

In [210]:
# Check to see how much data frame shrinks after filtering down
print(f'All measures: {df.shape}')
#df = df[people] # This gave index error when re-running, changed to df.loc
df = df.loc[people]
print(f'Just people: {df.shape}')

All measures: (115336, 9)
Just people: (115336, 9)


In [211]:
# Break out Location into latitude and longitude
                # substring from 'POINT( ' to ')' and split on ' ' grabbing first then second element set as float
if 'Location' in df.columns:
    df['Latitude'] = df['Location'].str[len('POINT ('):-1].str.split(' ').str[1].astype(float)
    df['Longitude'] = df['Location'].str[len('POINT ('):-1].str.split(' ').str[0].astype(float)

In [212]:
# Drop unnecessary columns 
df = safeDrop(df, ['Port Code','Port Name','Location','Unnamed: 0','index'])

(115336, 9)
removing: Unnamed: 0
(115336, 8)


In [213]:
# Get state abrv
df['Abrv'] = df['State'].apply(abbreviate)

In [214]:
# reorganize columns
df = df[['Abrv','State','Longitude','Latitude','Border','Date','Measure','Value']]

In [215]:
df.head()

Unnamed: 0,Abrv,State,Longitude,Latitude,Border,Date,Measure,Value
0,NY,New York,-73.44253,44.99001,US-Canada Border,3/1/2019 0:00,Personal Vehicle Passengers,16377
1,ND,North Dakota,-97.24333,48.96639,US-Canada Border,3/1/2019 0:00,Bus Passengers,1054
2,ND,North Dakota,-98.99457,48.94105,US-Canada Border,3/1/2019 0:00,Personal Vehicle Passengers,509
3,WA,Washington,-122.44316,48.94802,US-Canada Border,3/1/2019 0:00,Pedestrians,79
4,ME,Maine,-67.42955,45.55984,US-Canada Border,3/1/2019 0:00,Pedestrians,3


In [216]:
# Remove non-pedestrian values to shrink file
df.to_csv(Path.joinpath(Path.cwd(),'data','borderCrossing.csv'), index = False)

## GDP Data

In [234]:
dataPath = Path.joinpath(Path.cwd(),'data','pctChangeGDP.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,State,Abrv,Year,Value
39,New York,NY,1966,7.7
223,Vermont,VT,1978,17.8
87,Texas,TX,1969,9.2


In [220]:
# rename GeoName to State
df.rename(columns={'GeoName':'State'}, inplace=True)

In [221]:
# Get state abrv
df['Abrv'] = df['State'].apply(abbreviate)

In [225]:
# drop unneeded rows
# index of all rows where df['Abrv'] == ''
dropIndex = df.loc[df['Abrv']==''].index
df.drop(dropIndex, inplace=True)

In [226]:
# SAFE DROP
df = safeDrop(df, ['GeoFips'])

(15, 57)
(15, 57)


In [235]:
if 'Year' not in df.columns:
    df = pd.melt(df, id_vars=['State','Abrv'], var_name='Year', value_name = 'Value')

In [236]:
df.head()

Unnamed: 0,State,Abrv,Year,Value
0,Alaska,AL,1964,13.6
1,Arizona,AZ,1964,7.5
2,California,CA,1964,8.4
3,Idaho,ID,1964,5.7
4,Maine,ME,1964,7.6


In [233]:
# Un-pivoting
df.to_csv(Path.joinpath(Path.cwd(),'data','pctChangeGDP.csv'), index = False)

## Unemployment Data

In [4]:
dataPath = Path.joinpath(Path.cwd(),'data','unemployment.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,FIPS,Stata,Year,Population,LaborForce,PercentOfPopulation,Employed,PercentOfLaborEmp,Unemployed,PercentOfLaborUnemp
1998,39,Ohio,2013,9057552.0,5716730.0,63.1,5290609.0,58.4,426121.0,7.5
2119,56,Wyoming,2015,450698.0,304403.0,67.5,291295.0,64.6,13108.0,4.3
545,18,Indiana,1986,4120000.0,2748410.0,66.7,2560101.0,62.1,188309.0,6.9


In [5]:
# rename GeoName to State
df.rename(columns={'Stata':'State'}, inplace=True)

In [11]:
# drop unneeded columns
df = safeDrop(df, ['FIPS','PercentOfPopulation','PercentOfLaborEmp','PercentOfLaborUnemp'])

(2279, 6)
(2279, 6)


In [17]:
# get abbreviations
df['Abrv'] = df['State'].apply(abbreviate)
# drop unneeded rows
# index of all rows where df['Abrv'] == ''
dropIndex = df.loc[df['Abrv']==''].index
df.drop(dropIndex, inplace=True)

In [18]:
df.sample(3)

Unnamed: 0,State,Year,Population,LaborForce,Employed,Unemployed,Abrv
2152,New Mexico,2016,1618355.0,936348.0,874424.0,61924.0,NM
1003,Washington,1994,4036254.0,2739472.0,2559932.0,179540.0,WA
172,Idaho,1979,647917.0,429329.0,404370.0,24959.0,ID


In [19]:
# Un-pivoting
df.to_csv(Path.joinpath(Path.cwd(),'data','unemployment.csv'), index = False)