<a href="https://colab.research.google.com/github/maxrgnt/pythdc2-project2/blob/master/Clean2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Panel Data
import pandas as pd
# System folders
import os
from pathlib import Path

In [0]:
# State name -> two-letter postal abbreviation for the states this notebook
# keeps. Built once at import time rather than on every call, because
# abbreviate() is applied row by row to whole columns.
_STATE_ABBREVIATIONS = {
    'Alaska': 'AK',  # was 'AL', which is Alabama's code
    'Arizona': 'AZ',
    'California': 'CA',
    'Idaho': 'ID',
    'Maine': 'ME',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Montana': 'MT',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Texas': 'TX',
    'Vermont': 'VT',
    'Washington': 'WA',
}


def abbreviate(stateName):
    """Return the two-letter abbreviation for ``stateName``.

    Parameters:
        stateName: full state name, matched exactly (case-sensitive).

    Returns:
        The abbreviation string, or ``''`` when the name is not one of the
        states in the lookup table. Callers use the empty string to detect
        rows that should be discarded.
    """
    return _STATE_ABBREVIATIONS.get(stateName, '')

def safeDrop(df, cols):
    """Drop whichever of ``cols`` exist in ``df``, skipping names that are absent.

    The frame is modified in place and also returned. The frame's shape is
    printed before and after, and each removed column is reported.

    Parameters:
        df: DataFrame to prune (mutated in place).
        cols: collection of column names to remove if present.

    Returns:
        The same ``df`` object that was passed in.
    """
    print(df.shape)
    # Snapshot the matching names up front, in the frame's own column order.
    present = [name for name in df.columns if name in cols]
    for name in present:
        print(f'removing: {name}')
        df.drop(columns=[name], inplace=True)
    print(df.shape)
    return df

## Border Data

In [0]:
# Load the border-crossing CSV from the data folder under the working directory.
# (The stray trailing commas that turned dataPath and df into one-element
# tuples have been removed; pd.read_csv cannot read from a tuple.)
dataPath = Path.cwd().joinpath('data', 'borderCrossing.csv')
df = pd.read_csv(dataPath)
# Show three randomly chosen rows as a quick sanity check
df.sample(3)

In [0]:
# Tally how many rows carry each distinct value of the Measure column
df.loc[:, 'Measure'].value_counts()

In [0]:
# Only interested in Passenger / Pedestrian crossings.
# Boolean mask: True where Measure contains 'Passengers' or 'Pedestrians'
# (case-insensitive). na=False makes rows with a missing Measure count as
# "no match" instead of leaving NaN in the mask, which cannot be used as a
# boolean indexer.
people = df['Measure'].str.contains('Passengers|Pedestrians', case=False, na=False)

In [0]:
# Check to see how much the data frame shrinks after filtering down
print(f'All measures: {df.shape}')
# df[people] gave an index error when this cell was re-run, so df.loc is used.
# (A stray trailing comma here previously turned df into a one-element tuple.)
df = df.loc[people]
print(f'Just people: {df.shape}')

In [0]:
# Break out Location, formatted like 'POINT (<longitude> <latitude>)', into
# numeric Latitude and Longitude columns. The guard lets the cell run when the
# Location column has already been removed.
if 'Location' in df.columns:
    # Strip the 'POINT (' prefix and the trailing ')', then split the
    # remaining pair on the space: element 0 is longitude, element 1 latitude.
    coords = df['Location'].str[len('POINT ('):-1].str.split(' ')
    df['Latitude'] = coords.str[1].astype(float)
    df['Longitude'] = coords.str[0].astype(float)

In [0]:
# Remove columns that are not needed any more; safeDrop skips names that are absent
df = safeDrop(
    df=df,
    cols=['Port Code', 'Port Name', 'Location', 'Unnamed: 0', 'index'],
)

In [0]:
# Add an Abrv column holding each row's state abbreviation ('' when unmatched)
df['Abrv'] = df['State'].map(abbreviate)

In [0]:
# Keep exactly these columns, in this order
df = df.loc[:, [
    'Abrv',
    'State',
    'Longitude',
    'Latitude',
    'Border',
    'Date',
    'Measure',
    'Value',
]]

In [0]:
# Preview the first row of the reshaped frame
df.head(n=1)

In [0]:
# Overwrite the source CSV with the filtered frame (non-pedestrian values
# removed) so the file is smaller; the row index is not written.
df.to_csv(Path.cwd().joinpath('data', 'borderCrossing.csv'), index=False)

## GDP Data

In [0]:
# Load the GDP percent-change CSV from the data folder under the working directory
dataPath = Path.cwd().joinpath('data', 'pctChangeGDP.csv')
df = pd.read_csv(filepath_or_buffer=dataPath)
# Show three randomly chosen rows as a quick sanity check
df.sample(n=3)

In [0]:
# Relabel the GeoName column as State, modifying the frame in place
df.rename({'GeoName': 'State'}, axis='columns', inplace=True)

In [0]:
# Add an Abrv column holding each row's state abbreviation ('' when unmatched)
df['Abrv'] = df['State'].map(abbreviate)

In [0]:
# Discard rows whose Abrv is the empty string: collect their index labels,
# then remove them from the frame in place.
dropIndex = df.index[df['Abrv'] == '']
df.drop(index=dropIndex, inplace=True)

In [0]:
# Remove the GeoFips column if it is present
df = safeDrop(df=df, cols=['GeoFips'])

In [0]:
# Un-pivot to long format: State and Abrv stay as identifiers, every other
# column name goes into Year and its cell into Value. The check on Year skips
# the melt when the frame already has that column.
if 'Year' not in df.columns:
    df = pd.melt(
        frame=df,
        id_vars=['State', 'Abrv'],
        var_name='Year',
        value_name='Value',
    )

In [0]:
# Preview the first five rows
df.head(n=5)

In [0]:
# Save the un-pivoted frame back over the source CSV, without the row index
df.to_csv(Path.cwd().joinpath('data', 'pctChangeGDP.csv'), index=False)

## Unemployment Data

In [0]:
# Load the unemployment CSV from the data folder under the working directory
dataPath = Path.cwd().joinpath('data', 'unemployment.csv')
df = pd.read_csv(filepath_or_buffer=dataPath)
# Show three randomly chosen rows as a quick sanity check
df.sample(n=3)

In [0]:
# Rename the 'Stata' column to 'State', modifying the frame in place.
# NOTE(review): 'Stata' presumably matches a misspelled header in
# unemployment.csv - confirm against the file before changing the key.
df.rename(columns={'Stata':'State'}, inplace=True)

In [0]:
# Remove columns that are not needed; safeDrop skips names that are absent
df = safeDrop(
    df=df,
    cols=['FIPS', 'PercentOfPopulation', 'PercentOfLaborEmp', 'PercentOfLaborUnemp'],
)

In [0]:
# Tag each row with its state abbreviation ('' when abbreviate() has no match)
df['Abrv'] = df['State'].map(abbreviate)
# Discard every row left with an empty abbreviation: collect the index
# labels of those rows, then remove them in place.
dropIndex = df.index[df['Abrv'] == '']
df.drop(index=dropIndex, inplace=True)

In [0]:
# Show three randomly chosen rows of the cleaned frame
df.sample(n=3)

In [0]:
# Save the cleaned frame back over the source CSV, without the row index
df.to_csv(Path.cwd().joinpath('data', 'unemployment.csv'), index=False)