### Data import

The purpose of this notebook is to import and consolidate the processed monthly CSV files into a single dataframe, and to add the neighborhood for each station.

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [2]:
# Read in each monthly CSV (./processed/YYYYMM.csv) and collect the dataframes

# One period per monthly file: 2013-06 through 2021-10 inclusive (101 months).
# Spelling out the date range replaces the manual month/year roll-over counter.
months = pd.period_range(start='2013-06', end='2021-10', freq='M')

# strftime('%Y%m') zero-pads the month, so no separate branch is needed for
# single-digit months.
df_list = [
    pd.read_csv(f"./processed/{period.strftime('%Y%m')}.csv", index_col=0)
    for period in months
]

In [3]:
# Stack the monthly dataframes row-wise into one consolidated dataframe;
# each frame's own index is kept as-is.

df_final = pd.concat(df_list, axis=0)

In [4]:
# Make sure the index is a DatetimeIndex, then derive the calendar year from it

df_final.index = pd.to_datetime(df_final.index)
# Use the vectorized .year accessor instead of a per-element lambda; the cast
# keeps the column int64 regardless of the pandas version's accessor dtype.
df_final['year'] = df_final.index.year.astype('int64')

In [5]:
# Shorten the verbose "start station ..." column names

column_renames = {
    'start station id': 'station_id',
    'start station latitude': 'lat',
    'start station longitude': 'long',
}
df_final = df_final.rename(columns=column_renames)

In [6]:
# Preview the first rows of the consolidated dataframe
df_final.head()

Unnamed: 0_level_0,station_id,lat,long,ride_count,year
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-06-01,72,40.767272,-73.993929,40,2013
2013-06-01,79,40.719116,-74.006667,61,2013
2013-06-01,82,40.711174,-74.000165,6,2013
2013-06-01,83,40.683826,-73.976323,32,2013
2013-06-01,116,40.741776,-74.001497,53,2013


In [7]:
# Summarize the index range, column dtypes and memory usage
df_final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2144526 entries, 2013-06-01 to 2021-10-31
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   station_id  object 
 1   lat         float64
 2   long        float64
 3   ride_count  int64  
 4   year        int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 98.2+ MB


### Add in neighborhood

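- This uses Geopy's reverse geocoding (Nominatim) to look up an address from coordinates
- Given a station's lat / long, the returned location object may include the NYC neighborhood; when it does not, the neighborhood is recorded as NaN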

In [8]:
# Create the Nominatim geolocator object used for the reverse-geocoding lookups.
# The user_agent string identifies this application to the service.
# NOTE(review): the public Nominatim service is rate limited -- confirm the
# request volume made below complies with its usage policy.

geolocator = Nominatim(user_agent='marvel_app')

In [17]:
# Create function to return the neighborhood for a lat / long pair

def get_neighborhood(lat, long):
    """Reverse-geocode a coordinate pair and return its neighbourhood name.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude; they are formatted as the query string
        "<lat>, <long>" and passed to the module-level ``geolocator``.

    Returns
    -------
    str or float
        The ``['address']['neighbourhood']`` entry of the geocoder result, or
        ``np.nan`` when the geocoder returns no result or the result carries
        no neighbourhood.
    """
    loc_string = str(lat) + ', ' + str(long)
    location = geolocator.reverse(loc_string)

    # reverse() can come back empty (None); without this guard the attribute
    # access below would raise AttributeError and abort the whole run.
    if location is None:
        return np.nan

    # A missing 'address' or 'neighbourhood' key falls back to NaN.
    return location.raw.get('address', {}).get('neighbourhood', np.nan)

In [None]:
# Apply neighborhood to each station
#
# Every row for the same coordinates gets the same answer, so geocode each
# distinct (lat, long) pair once and broadcast the result, instead of issuing
# one network request per row of the dataframe.

unique_coords = df_final[['lat', 'long']].dropna().drop_duplicates()
neighborhood_by_coord = {
    (lat, long): get_neighborhood(lat, long)
    for lat, long in unique_coords.itertuples(index=False, name=None)
}

# Rows with missing coordinates were not geocoded and fall back to NaN.
df_final['neighborhood'] = [
    neighborhood_by_coord.get((lat, long), np.nan)
    for lat, long in zip(df_final['lat'], df_final['long'])
]