In [1]:
# import dependencies
import datetime as dt
import pandas as pd
import numpy as np
import requests, io, zipfile, os, time
import dask.dataframe as dd

In [2]:
# read in full csv and scale for analysis

# for curiosity, time taken to read data
## dask documentation from https://www.geeksforgeeks.org/working-with-large-csv-files-in-python/
s_time = time.time()

# Read CSV files from List
ddf = dd.read_csv("full_raw_data.csv",
            low_memory=False,
            dtype={
                    'end_station_id': 'object',
                    'start_station_id': 'object'
                  },
                 )

# end of data read, print time report
e_time = time.time()
print("Processed in ", ((e_time-s_time)/60), "minutes")

ddf.dtypes

Processed in  0.00026700496673583987 minutes


Unnamed: 0              int64
ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [3]:
# dask compute the DataFrame values
##  to create the pandas DataFrame

s_time = time.time()
df = ddf.compute()
e_time = time.time()
print("Processed in ", ((e_time-s_time)/60), "minutes")

Processed in  1.5661867380142211 minutes


In [4]:
# # create pandas DataFrame

# s_time = time.time()
# df = pd.DataFrame(ddf)
# e_time = time.time()
# print("Processed in ", ((e_time-s_time)/60), "minutes")

In [5]:
print(f'df contains {df.shape[0]} rows and {df.shape[1]} columns.')
print('~' * 50)
df.dtypes

df contains 57256026 rows and 14 columns.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


Unnamed: 0              int64
ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [6]:
# set up date formats and rename columns

##  start date/time
df["started_at"] = df["started_at"].astype('datetime64[ns]')
df['ride_year'] = df['started_at'].dt.year
df['ride_month'] = df['started_at'].dt.month_name
df['ride_weekday'] = df['started_at'].dt.day_name
df['ride_hour'] = df['started_at'].dt.hour


##  end date/time
df["ended_at"] = df["ended_at"].astype('datetime64[ns]')

## add column for ride time
df['ride_minutes'] = df["ended_at"] - df["started_at"]

# rename columns
df = df.rename(columns={
                        'start_station_name' : 'start_station',
                        'end_station_name': 'end_station',
                        'rideable_type': 'ride_type',
                        'member_casual' : 'member_type'
                        })

In [7]:
df.dtypes

Unnamed: 0                    int64
ride_id                      object
ride_type                    object
started_at           datetime64[ns]
ended_at             datetime64[ns]
start_station                object
start_station_id             object
end_station                  object
end_station_id               object
start_lat                   float64
start_lng                   float64
end_lat                     float64
end_lng                     float64
member_type                  object
ride_year                     int64
ride_month                   object
ride_weekday                 object
ride_hour                     int64
ride_minutes        timedelta64[ns]
dtype: object

In [8]:
# select columns for final csv file

df_cleaned = pd.DataFrame(df, columns={
                                    'start_station',    
                                    'end_station',
                                    'ride_year',
                                    'ride_month',
                                    'ride_weekday',
                                    'ride_hour',
                                    'ride_minutes',
                                    'ride_type',
                                    'member_type'    
                                    })

In [11]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57256026 entries, 0 to 172027
Data columns (total 9 columns):
 #   Column         Dtype          
---  ------         -----          
 0   ride_month     object         
 1   ride_type      object         
 2   ride_weekday   object         
 3   ride_minutes   timedelta64[ns]
 4   ride_hour      int64          
 5   end_station    object         
 6   ride_year      int64          
 7   member_type    object         
 8   start_station  object         
dtypes: int64(2), object(6), timedelta64[ns](1)
memory usage: 4.3+ GB


In [None]:
# send  cleaned df to csv file
path = f'clean_data.csv'
df_cleaned.to_csv(path)