### This script contains the below:
1. Import libraries
2. Importing bike trips data to merge it
3. Get weather data using NOAA's API
4. Merge bike trip and weather data

# 01 Importing libraries

In [113]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

# 02 Importing data

In [114]:
# Create a list with all files in the folder using a list comprehension

folderpath = r"../02 Data Chicago" 
filepath = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]

In [115]:
os.getcwd()

'/Users/woodoooo/New-York-s-CitiBike-trips-in-2022/03 Scripts'

In [116]:
filepath

['../02 Data Chicago/Divvy_Trips_2018_Q1.csv',
 '../02 Data Chicago/Divvy_Trips_2018_Q2.csv',
 '../02 Data Chicago/Divvy_Trips_2018_Q3.csv',
 '../02 Data Chicago/Divvy_Trips_2018_Q4.csv']

In [117]:
# merging the data
df = pd.concat((pd.read_csv(f) for f in filepath), ignore_index = True)

In [118]:
df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,17536702,2018-01-01 00:12:00,2018-01-01 00:17:23,3304,323.0,69,Damen Ave & Pierce Ave,159,Claremont Ave & Hirsch St,Subscriber,Male,1988.0
1,17536703,2018-01-01 00:41:35,2018-01-01 00:47:52,5367,377.0,253,Winthrop Ave & Lawrence Ave,325,Clark St & Winnemac Ave (Temp),Subscriber,Male,1984.0
2,17536704,2018-01-01 00:44:46,2018-01-01 01:33:10,4599,2904.0,98,LaSalle St & Washington St,509,Troy St & North Ave,Subscriber,Male,1989.0
3,17536705,2018-01-01 00:53:10,2018-01-01 01:05:37,2302,747.0,125,Rush St & Hubbard St,364,Larrabee St & Oak St,Subscriber,Male,1983.0
4,17536706,2018-01-01 00:53:37,2018-01-01 00:56:40,3696,183.0,129,Blue Island Ave & 18th St,205,Paulina St & 18th St,Subscriber,Male,1989.0


In [119]:
df.shape

(3603082, 12)

In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603082 entries, 0 to 3603081
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   trip_id            int64  
 1   start_time         object 
 2   end_time           object 
 3   bikeid             int64  
 4   tripduration       object 
 5   from_station_id    int64  
 6   from_station_name  object 
 7   to_station_id      int64  
 8   to_station_name    object 
 9   usertype           object 
 10  gender             object 
 11  birthyear          float64
dtypes: float64(1), int64(4), object(7)
memory usage: 329.9+ MB


# 03 Gathering Weather Data

In [121]:
# Define your NOAA token

Token = 'HLPKfjkSRPFNvFLMQBdRxcFpoojZoEwc'

In [122]:
# URL for O’Hare
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00094846&startdate=2018-01-01&enddate=2018-12-31"

r = requests.get(url, headers={'token': Token})

In [123]:
# Store the Data in JSON Format
d = json.loads(r.text)  

In [124]:
d.keys()

dict_keys(['metadata', 'results'])

In [125]:
d['results']

[{'date': '2018-01-01T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -189},
 {'date': '2018-01-02T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -191},
 {'date': '2018-01-03T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -114},
 {'date': '2018-01-04T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -139},
 {'date': '2018-01-05T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -152},
 {'date': '2018-01-06T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -148},
 {'date': '2018-01-07T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:USW00094846',
  'attributes': 'H,,S,',
  'value': -84},
 {'date': '2018-01-08T00:00:00',
  'datatype': 'TAVG',
  'station': 'GHCND:US

In [126]:
# Secure all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']
# Get only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]
# Get the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [127]:
df_temp = pd.DataFrame()

In [128]:
# Get only date and cast it to date time; convert temperature from tenths of Celsius to normal Celsius

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0 for v in temps]

In [129]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2018-01-01,-18.9
1,2018-01-02,-19.1
2,2018-01-03,-11.4
3,2018-01-04,-13.9
4,2018-01-05,-15.2


In [143]:
df_temp.to_csv('chicago_weather.cvs')

# 04 Merge bike trip and weather data

In [130]:
df.dtypes

trip_id                int64
start_time            object
end_time              object
bikeid                 int64
tripduration          object
from_station_id        int64
from_station_name     object
to_station_id          int64
to_station_name       object
usertype              object
gender                object
birthyear            float64
dtype: object

In [131]:
df_temp.dtypes

date       datetime64[ns]
avgTemp           float64
dtype: object

In [132]:
# Convert 'date' from string to datetime

df['start_time'] = pd.to_datetime(df['start_time'], dayfirst = True)

In [133]:
# Extract only the date part (YYYY-MM-DD) from 'started_at' into a new column

df['date'] = pd.to_datetime(df['start_time'], format='%Y-%m-%d').dt.date

In [134]:
# Convert 'date' from string to datetime

df['date'] = pd.to_datetime(df['date'])

In [135]:
df.dtypes

trip_id                       int64
start_time           datetime64[ns]
end_time                     object
bikeid                        int64
tripduration                 object
from_station_id               int64
from_station_name            object
to_station_id                 int64
to_station_name              object
usertype                     object
gender                       object
birthyear                   float64
date                 datetime64[ns]
dtype: object

In [136]:
# Merge dataframes

df_merged = df.merge(df_temp, how = 'left', on = 'date', indicator = True)

In [137]:
df_merged['_merge'].value_counts(dropna = False)

both          3603082
left_only           0
right_only          0
Name: _merge, dtype: int64

In [138]:
df_merged.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,date,avgTemp,_merge
0,17536702,2018-01-01 00:12:00,2018-01-01 00:17:23,3304,323.0,69,Damen Ave & Pierce Ave,159,Claremont Ave & Hirsch St,Subscriber,Male,1988.0,2018-01-01,-18.9,both
1,17536703,2018-01-01 00:41:35,2018-01-01 00:47:52,5367,377.0,253,Winthrop Ave & Lawrence Ave,325,Clark St & Winnemac Ave (Temp),Subscriber,Male,1984.0,2018-01-01,-18.9,both
2,17536704,2018-01-01 00:44:46,2018-01-01 01:33:10,4599,2904.0,98,LaSalle St & Washington St,509,Troy St & North Ave,Subscriber,Male,1989.0,2018-01-01,-18.9,both
3,17536705,2018-01-01 00:53:10,2018-01-01 01:05:37,2302,747.0,125,Rush St & Hubbard St,364,Larrabee St & Oak St,Subscriber,Male,1983.0,2018-01-01,-18.9,both
4,17536706,2018-01-01 00:53:37,2018-01-01 00:56:40,3696,183.0,129,Blue Island Ave & 18th St,205,Paulina St & 18th St,Subscriber,Male,1989.0,2018-01-01,-18.9,both


In [140]:
df_merged['date'].isna().sum()

0

In [142]:
df_merged.to_csv('chicago_merged.cvs')