## STEP 1: COLLECT DATA

### Importing packages and CSV files into corresponding dataframes

In [3]:
import pandas as pd

In [4]:
# Read the four quarterly Divvy trip exports (Q2 2019 through Q1 2020),
# one DataFrame per quarter. Paths are relative to the working directory.
q2_2019, q3_2019, q4_2019, q1_2020 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Divvy_Trips_2019_Q2.csv",
        "Divvy_Trips_2019_Q3.csv",
        "Divvy_Trips_2019_Q4.csv",
        "Divvy_Trips_2020_Q1.csv",
    )
)

## STEP 2: PREPARE

### Rename columns to create uniformity and for merging later on

In [33]:
# Old Q4 2019 header -> unified header. The same target names are applied to
# the other 2019 quarters so the frames share columns when they are stacked.
q4_2019_renames = {
    'trip_id': 'ride_id',
    'bikeid': 'rideable_type',
    'start_time': 'started_at',
    'end_time': 'ended_at',
    'from_station_name': 'start_station_name',
    'from_station_id': 'start_station_id',
    'to_station_name': 'end_station_name',
    'to_station_id': 'end_station_id',
    'usertype': 'member_casual',
}
# rename() skips keys that are not present, so re-running this cell is harmless.
q4_2019.rename(columns=q4_2019_renames, inplace=True)

In [36]:
# Old Q3 2019 header -> unified header (same target names as the other quarters).
q3_2019_renames = {
    'trip_id': 'ride_id',
    'bikeid': 'rideable_type',
    'start_time': 'started_at',
    'end_time': 'ended_at',
    'from_station_name': 'start_station_name',
    'from_station_id': 'start_station_id',
    'to_station_name': 'end_station_name',
    'to_station_id': 'end_station_id',
    'usertype': 'member_casual',
}
# rename() skips keys that are not present, so re-running this cell is harmless.
q3_2019.rename(columns=q3_2019_renames, inplace=True)

In [76]:
# The Q2 2019 export uses long, numbered headers; map them onto the same
# unified names used for the other quarters.
q2_2019_renames = {
    '01 - Rental Details Rental ID': 'ride_id',
    '01 - Rental Details Bike ID': 'rideable_type',
    '01 - Rental Details Local Start Time': 'started_at',
    '01 - Rental Details Local End Time': 'ended_at',
    '03 - Rental Start Station Name': 'start_station_name',
    '03 - Rental Start Station ID': 'start_station_id',
    '02 - Rental End Station Name': 'end_station_name',
    '02 - Rental End Station ID': 'end_station_id',
    'User Type': 'member_casual',
}
# rename() skips keys that are not present, so a misspelled header here would
# go unnoticed rather than raise.
q2_2019.rename(columns=q2_2019_renames, inplace=True)

### Convert datatypes

In [77]:
# Cast ride_id and rideable_type to str so both columns have a uniform type
# across quarters when the frames are stacked.
for id_col in ('ride_id', 'rideable_type'):
    q4_2019[id_col] = q4_2019[id_col].astype(str)

In [78]:
# Same str cast for the Q3 2019 identifier columns.
for id_col in ('ride_id', 'rideable_type'):
    q3_2019[id_col] = q3_2019[id_col].astype(str)

In [79]:
# Same str cast for the Q2 2019 identifier columns.
for id_col in ('ride_id', 'rideable_type'):
    q2_2019[id_col] = q2_2019[id_col].astype(str)

### Combining 4 smaller datasets into one big data frame

In [80]:
# Stack the quarters in chronological order. ignore_index=True discards each
# frame's own row labels and numbers the combined rows 0..n-1.
quarterly_frames = [q2_2019, q3_2019, q4_2019, q1_2020]
alltrips = pd.concat(quarterly_frames, ignore_index=True)

### Removing unwanted columns (collection of some fields was discontinued in 2020)


In [81]:
# Columns that are not kept for the analysis, one per line for readability.
unwanted_columns = [
    '01 - Rental Details Duration In Seconds Uncapped',
    '05 - Member Details Member Birthday Year',
    'gender',
    'birthyear',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
    'Member Gender',
    'tripduration',
]
# drop() raises KeyError if a listed column is absent, which includes
# re-running this cell after the columns have already been removed.
alltrips = alltrips.drop(columns=unwanted_columns)

## STEP 2: PROCESS 
### Clean up and manipulate the data

In [86]:
# 'member_casual' mixes two labelling schemes: Subscriber means member and
# Customer means casual. Replace the older labels so only two distinct values
# remain instead of four.
rider_labels = {'Subscriber': 'member', 'Customer': 'casual'}
alltrips['member_casual'] = alltrips['member_casual'].replace(rider_labels)

In [95]:
# Parse 'started_at' once and derive the calendar fields used for grouping:
#   date        -> full start timestamp (datetime64)
#   day         -> calendar date only
#   month, year -> numeric month and year
#   day_of_week -> weekday name as text (via dt.day_name())
started = pd.to_datetime(alltrips['started_at'])

alltrips['date'] = started
alltrips['day'] = started.dt.date
alltrips['month'] = started.dt.month
alltrips['year'] = started.dt.year
alltrips['day_of_week'] = started.dt.day_name()

In [120]:
# Add 'ride_length': the duration of each ride in seconds, computed as
# ended_at minus started_at after parsing both columns as timestamps.
start_time, end_time = (
    pd.to_datetime(alltrips[timestamp_col])
    for timestamp_col in ('started_at', 'ended_at')
)
ride_duration = end_time - start_time
alltrips['ride_length'] = ride_duration.dt.total_seconds()

In [131]:
# Store 'ride_length' as whole-second integers for the calculations that follow.
# (total_seconds() already produced a numeric float column; this cast only
# changes the dtype. astype(int) raises if the column holds NaN.)
ride_seconds = alltrips['ride_length']
alltrips['ride_length'] = ride_seconds.astype(int)

In [145]:
# Remove rows that do not represent real rides:
#   - rows whose 'ride_length' is negative, and
#   - rows whose start station is "HQ QR", which are bikes taken out of the
#     docks and checked for quality.
# The station name is written with a space ("HQ QR"); comparing against
# "HQ_QR" does not match that name, so those rows would be left in.
conditions = (alltrips['ride_length'] < 0) | (alltrips['start_station_name'] == "HQ QR")

# Drop by row label. Labels are unique here because the frames were
# concatenated with ignore_index=True.
alltrips = alltrips.drop(index=alltrips[conditions].index)

## STEP 3: ANALYZE 
