### This script contains the following:
#### 1. Importing data and libraries
#### 2. Data wrangling
#### 3. Data cleaning
#### 4. Plotting a choropleth

### 1. Importing data and libraries

In [3]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [4]:
# This command prompts matplotlib visuals to appear in the notebook 

%matplotlib inline

In [5]:
# Importing the  ".json" file for the U.S. 

states_geo = r'/Users/peterguan/Desktop/CareerFoundry/Data Immersion Achievement 6/JSON file U.S. States.json'

In [6]:
# Creating a path for importing/exporting
path = r'/Users/peterguan/Citi Bike Analysis'

In [7]:
# Importing the 'bike_cleaned' dataset
bike_cleaned = pd.read_csv(r'/Users/peterguan/Citi Bike Analysis/02 Data/Prepared Data/bike.csv', index_col = False)

In [8]:
# Checking the first 5 rows in the dataset
bike_cleaned.head()

Unnamed: 0.1,Unnamed: 0,trip_id,bike_id,weekday,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender,age
0,0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968,Female,45
1,1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983,Male,30
2,2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989,Male,24
3,3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988,Female,25
4,4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,1978,Unknown,35


In [9]:
# Checking the dimensions of the dataset
bike_cleaned.shape

(49972, 20)

### 2. Data wrangling 

In [103]:
# Selecting only the necessary columns and put them in a list called columns
columns = ['start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude']

In [105]:
# Creating a subset with the columns list
state_rec = bike_cleaned[columns]

In [107]:
state_rec.head(5)

Unnamed: 0,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude
0,40.754666,-73.991382,40.742388,-73.997262
1,40.719392,-74.002472,40.728419,-73.98714
2,40.760193,-73.991255,40.768254,-73.988639
3,40.743156,-73.974347,40.756014,-73.967416
4,40.75045,-73.994811,40.743943,-73.979661


In [109]:
s2 = state_rec.idxmax(axis=1)

In [111]:
s2

0        start_station_latitude
1          end_station_latitude
2          end_station_latitude
3          end_station_latitude
4        start_station_latitude
                  ...          
49967      end_station_latitude
49968      end_station_latitude
49969      end_station_latitude
49970      end_station_latitude
49971    start_station_latitude
Length: 49972, dtype: object

In [113]:
state_rec['STATE_NAME'] = s2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_rec['STATE_NAME'] = s2


In [115]:
state_rec.columns

Index(['start_station_latitude', 'start_station_longitude',
       'end_station_latitude', 'end_station_longitude', 'STATE_NAME'],
      dtype='object')

### 3. Data Cleaning

In [117]:
# Checking for missing values

state_rec.isnull().sum()

start_station_latitude     0
start_station_longitude    0
end_station_latitude       0
end_station_longitude      0
STATE_NAME                 0
dtype: int64

In [119]:
# Checking for duplicates
dups = state_rec.duplicated()

In [121]:
# No duplicates found
dups.shape

(49972,)

In [123]:
# Checking the datatypes of the columns
state_rec.dtypes

start_station_latitude     float64
start_station_longitude    float64
end_station_latitude       float64
end_station_longitude      float64
STATE_NAME                  object
dtype: object

### 4. Plotting a choropleth map

In [125]:
state_rec.head()

Unnamed: 0,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,STATE_NAME
0,40.754666,-73.991382,40.742388,-73.997262,start_station_latitude
1,40.719392,-74.002472,40.728419,-73.98714,end_station_latitude
2,40.760193,-73.991255,40.768254,-73.988639,end_station_latitude
3,40.743156,-73.974347,40.756014,-73.967416,end_station_latitude
4,40.75045,-73.994811,40.743943,-73.979661,start_station_latitude


In [131]:
# Setup a folium map at a high-level zoom
map = folium.Map(location=[100, 0], zoom_start=1.5)

# Choropleth maps bind Pandas DataFrames and JSON geometries
folium.Choropleth(
    geo_data=states_geo, 
    data=state_rec,
    columns=['start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude', 'STATE_NAME'],
    key_on='feature.id',  # Correct way to specify the key reference
    fill_color='YlOrBr', 
    fill_opacity=0.6, 
    line_opacity=0.1,
    legend_name="STATE_NAME"
).add_to(map)

folium.LayerControl().add_to(map)

map

PermissionError: [Errno 1] Operation not permitted: '/Users/peterguan/Desktop/CareerFoundry/Data Immersion Achievement 6/JSON file U.S. States.json'