<a href="https://colab.research.google.com/github/maxrgnt/pythdc2-project2/blob/master/Clean2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Panel Data
import pandas as pd
# System folders
import os
from pathlib import Path

In [2]:
def abbreviate(stateName):
    abrvDict = {'Alaska':'AL',
                'Arizona':'AZ',
                'California':'CA',
                'Idaho':'ID',
                'Maine':'ME',
                'Michigan':'MI',
                'Minnesota':'MN',
                'Montana':'MT',
                'New Mexico':'NM',
                'New York':'NY',
                'North Dakota':'ND',
                'Ohio':'OH',
                'Texas':'TX',
                'Vermont':'VT',
                'Washington':'WA'}
    abrv = ''
    if stateName in abrvDict:
        abrv = abrvDict[stateName]
    return abrv

def safeDrop(df, cols):
    print(df.shape)
    for col in df.columns:
        if col in cols:
            print(f'removing: {col}')
            df.drop([col], axis=1, inplace=True)
    print(df.shape)
    return df

## Border Data

In [26]:
dataPath = Path.joinpath(Path.cwd(),'data','borderCrossing.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,Abrv,State,Longitude,Latitude,Border,Date,Measure,Value
68230,WA,Washington,-118.22,49.0,US-Canada Border,5/1/2005 0:00,Bus Passengers,88
83296,MN,Minnesota,-93.4,48.61,US-Canada Border,6/1/2002 0:00,Pedestrians,3143
6628,ND,North Dakota,-100.56,49.0,US-Canada Border,2/1/2017 0:00,Personal Vehicle Passengers,2012


In [27]:
df['Measure'].value_counts()

Personal Vehicle Passengers    30196
Bus Passengers                 28820
Pedestrians                    28697
Train Passengers               27623
Name: Measure, dtype: int64

In [28]:
# Only interested in Passenger / Pedestrian crossings
people = df['Measure'].str.contains('Passengers|Pedestrians', case = False)

In [29]:
# Check to see how much data frame shrinks after filtering down
print(f'All measures: {df.shape}')
#df = df[people] # This gave index error when re-running, changed to df.loc
df = df.loc[people]
print(f'Just people: {df.shape}')

All measures: (115336, 8)
Just people: (115336, 8)


In [30]:
# Break out Location into latitude and longitude,
                # substring from 'POINT( ' to ')' and split on ' ' grabbing first then second element set as float,
if 'Location' in df.columns:
    df['Latitude'] = df['Location'].str[len('POINT ('):-1].str.split(' ').str[1].astype(float)
    df['Longitude'] = df['Location'].str[len('POINT ('):-1].str.split(' ').str[0].astype(float)

In [31]:
# Drop unnecessary columns
df = safeDrop(df, ['Port Code','Port Name','Location','Unnamed: 0','index'])

(115336, 8)
(115336, 8)


In [32]:
# Get state abrv
df['Abrv'] = df['State'].apply(abbreviate)

In [33]:
# Handle the Date column
df['newDate'] = pd.to_datetime(df['Date'])
df['Year'] = df['newDate'].dt.year.astype(int)

In [34]:
# reorganize columns
df = df[['Abrv','State','Longitude','Latitude','Border','Year','Measure','Value']]

In [35]:
df.head(1)

Unnamed: 0,Abrv,State,Longitude,Latitude,Border,Year,Measure,Value
0,NY,New York,-73.44253,44.99001,US-Canada Border,2019,Personal Vehicle Passengers,16377


In [36]:
# Remove non-pedestrian values to shrink file
df.to_csv(Path.joinpath(Path.cwd(),'data','borderCrossing.csv'), index = False)

## GDP Data

In [13]:
dataPath = Path.joinpath(Path.cwd(),'data','pctChangeGDP.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,State,Abrv,Year,Value
120,Alaska,AL,1972,8.0
94,Maine,ME,1970,7.2
568,Vermont,VT,2001,5.3


In [14]:
# rename GeoName to State
df.rename(columns={'GeoName':'State'}, inplace=True)

In [15]:
# Get state abrv
df['Abrv'] = df['State'].apply(abbreviate)

In [16]:
# drop unneeded rows\n",
# index of all rows where df['Abrv'] == ''
dropIndex = df.loc[df['Abrv']==''].index
df.drop(dropIndex, inplace=True)

In [17]:
# SAFE DROP
df = safeDrop(df, ['GeoFips'])

(825, 4)
(825, 4)


In [18]:
if 'Year' not in df.columns:
  df = pd.melt(df, id_vars=['State','Abrv'], var_name='Year', value_name = 'Value')

In [19]:
df.head()

Unnamed: 0,State,Abrv,Year,Value
0,Alaska,AL,1964,13.6
1,Arizona,AZ,1964,7.5
2,California,CA,1964,8.4
3,Idaho,ID,1964,5.7
4,Maine,ME,1964,7.6


In [0]:
# Un-pivoting
df.to_csv(Path.joinpath(Path.cwd(),'data','pctChangeGDP.csv'), index = False)

## Unemployment Data

In [27]:
dataPath = Path.joinpath(Path.cwd(),'data','unemployment.csv')
df = pd.read_csv(dataPath)
df.sample(3)

Unnamed: 0,State,Year,Population,LaborForce,Employed,Unemployed,Abrv
54,New York,1979,13278917.0,7997062.0,7424194.0,572868.0,NY
231,Minnesota,1991,3308424.0,2427606.0,2302626.0,124980.0,MN
90,Alaska,1982,292417.0,212806.0,191699.0,21107.0,AL


In [28]:
# rename GeoName to State
df.rename(columns={'Stata':'State'}, inplace=True)

In [29]:
# get abbreviations
df['Abrv'] = df['State'].apply(abbreviate)
# drop unneeded rows
# index of all rows where df['Abrv'] == ''
dropIndex = df.loc[df['Abrv']==''].index
df.drop(dropIndex, inplace=True)

In [30]:
for col in ['Unemployed','Employed','LaborForce','Population']:
    if col in list(df.columns):
        df['UnemploymentRate'] = df['Unemployed'].div(df['Employed'])
        df['LaborRate'] = df['LaborForce'].div(df['Population'])

In [33]:
# drop unneeded columns
df = safeDrop(df, ['FIPS','PercentOfPopulation','PercentOfLaborEmp','PercentOfLaborUnemp','Population','LaborForce','Employed','Unemployed'])

(645, 7)
removing: Population
removing: LaborForce
(645, 5)


In [34]:
df.sample(3)

Unnamed: 0,State,Year,Abrv,UnemploymentRate,LaborRate
327,Texas,1997,TX,0.05606,0.691369
271,Arizona,1994,AZ,0.065499,0.654597
354,New York,1999,NY,0.054535,0.630585


In [35]:
# Un-pivoting
df.to_csv(Path.joinpath(Path.cwd(),'data','unemployment.csv'), index = False)