# Data Cleaning Process 

Source file: *cyclistic_data_cleaning.py*


##### Loading pandas library:

In [45]:
from pathlib import Path

import pandas as pd

##### Importing raw data:

In [46]:
# Load the twelve monthly trip files (October 2021 - September 2022) in
# chronological order; the first CSV column is used as the index of each frame.
(okt2021, nov2021, dec2021, jan2022, feb2022, mar2022,
 apr2022, may2022, jun2022, jul2022, aug2022, sep2022) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'workfiles/202110-divvy-tripdata.csv',
        'workfiles/202111-divvy-tripdata.csv',
        'workfiles/202112-divvy-tripdata.csv',
        'workfiles/202201-divvy-tripdata.csv',
        'workfiles/202202-divvy-tripdata.csv',
        'workfiles/202203-divvy-tripdata.csv',
        'workfiles/202204-divvy-tripdata.csv',
        'workfiles/202205-divvy-tripdata.csv',
        'workfiles/202206-divvy-tripdata.csv',
        'workfiles/202207-divvy-tripdata.csv',
        'workfiles/202208-divvy-tripdata.csv',
        'workfiles/202209-divvy-tripdata.csv',
    )
)

##### Merging all datasets:

In [47]:
# Collect the monthly frames in chronological order and stack them row-wise
# into a single table:
dataframe = [
    okt2021, nov2021, dec2021,
    jan2022, feb2022, mar2022,
    apr2022, may2022, jun2022,
    jul2022, aug2022, sep2022,
]
df_complete = pd.concat(dataframe, axis=0)

##### Checking the structure of all dataset files:
All tables should have the same structure to complete the merging step successfully. The check
printed the number of rows of each dataset and showed that all 12 datasets have 12 columns
with the same column names.

In [None]:
# Checking the number of rows/columns and the column names of every monthly dataset.
# A descriptive loop variable is used instead of `_`, which IPython reserves for
# the output of the previously executed cell.
for monthly_df in dataframe:
    print(monthly_df.shape)
    print(monthly_df.columns)
# Raw datasets contain the same 12 columns. Tables are consistent.

##### Checking the structure of the merged dataset:
Merged table keeps the structure of the single raw dataset. It has 5.828.235 rows and 12
columns.

In [49]:
# Dimensions of the merged table as (rows, columns):
df_complete.shape

(5828235, 12)

In [50]:
# Column labels of the merged table (the index is not listed here):
df_complete.columns

Index(['rideable_type', 'started_at', 'ended_at', 'start_station_name',
       'start_station_id', 'end_station_name', 'end_station_id', 'start_lat',
       'start_lng', 'end_lat', 'end_lng', 'member_casual'],
      dtype='object')

##### Pushing “ride_id” index at the beginning of the table:

In [51]:
# Move the "ride_id" index into a regular first column and replace it with a
# default integer index. The guard keeps the cell idempotent: without it, running
# the cell a second time would add a spurious "index" column to the table.
if df_complete.index.name == 'ride_id':
    df_complete = df_complete.reset_index(level=0)

##### Checking data types:

In [52]:
# Data type of every column of the merged table:
df_complete.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

##### Changing data types to datetime format:

In [53]:
# Columns "started_at" and "ended_at" should hold datetime values instead of text.
# Parse the "started_at" strings:
df_complete['started_at'] = pd.to_datetime(df_complete['started_at'])
# Show the value stored in the row labelled 0 as a sanity check:
df_complete.loc[0, 'started_at']

Timestamp('2021-10-22 12:46:42')

In [54]:
# Parse the "ended_at" strings into datetime values:
df_complete['ended_at'] = pd.to_datetime(df_complete['ended_at'])
# Show the value stored in the row labelled 0 as a sanity check:
df_complete.loc[0, 'ended_at']

Timestamp('2021-10-22 12:49:50')

##### Looking for “typos” or “misspellings” in the whole set:

In [55]:
# Distinct values stored in the "rideable_type" column:
df_complete.rideable_type.unique()
# There are 3 types of bikes: "electric_bike", "docked_bike" and "classic_bike". No errors.

array(['electric_bike', 'docked_bike', 'classic_bike'], dtype=object)

In [56]:
# Distinct values stored in the "member_casual" column:
df_complete.member_casual.unique()
# There are 2 types of users: "casual", "member". No errors.

array(['member', 'casual'], dtype=object)

In [57]:
# Number of rides recorded for each start station id:
df_complete.start_station_id.value_counts()

13022           75985
13300           42035
LF-005          40592
13042           40119
TA1308000050    39352
                ...  
945                 1
925                 1
868                 1
886                 1
959                 1
Name: start_station_id, Length: 1302, dtype: int64

In [58]:
# Number of rides recorded for each start station name:
df_complete.start_station_name.value_counts()

Streeter Dr & Grand Ave                  75985
DuSable Lake Shore Dr & Monroe St        42035
DuSable Lake Shore Dr & North Blvd       40592
Michigan Ave & Oak St                    40119
Wells St & Concord Ln                    39352
                                         ...  
Public Rack - Chappel Ave & 71st St          1
Public Rack - Maplewood Ave & 47th St        1
Lawndale Ave & Polk St                       1
Public Rack - Halsted St & 63rd St           1
Troy & 111th St                              1
Name: start_station_name, Length: 1591, dtype: int64

In [59]:
# Number of rides recorded for each end station id:
df_complete.end_station_id.value_counts()

13022           76510
LF-005          42621
13042           40643
13300           40633
TA1308000050    39196
                ...  
973                 1
978                 1
959                 1
924                 1
1063                1
Name: end_station_id, Length: 1309, dtype: int64

In [60]:
# Number of rides recorded for each end station name:
df_complete.end_station_name.value_counts()
# There are many names and id's that occur just once.
# They could be input errors or new/deleted stations. Left for further investigation.

Streeter Dr & Grand Ave                 76510
DuSable Lake Shore Dr & North Blvd      42621
Michigan Ave & Oak St                   40643
DuSable Lake Shore Dr & Monroe St       40633
Wells St & Concord Ln                   39196
                                        ...  
Public Rack - Homan Ave & Roosevelt         1
Public Rack - East End Ave & 75th St        1
Public Rack - 53rd St & Indiana Ave         1
Public Rack - Yates Ave & 100th St          1
Public Rack - Kedzie & 103rd St - W         1
Name: end_station_name, Length: 1609, dtype: int64

No such errors were found.
There are many station names and id's that occur just once. They could be input errors or
new/deleted stations. This issue has been left at this point for possible further
investigation.

#### Checking length of the ride id’s and looking for its duplicates:

In [61]:
# Collect the distinct character lengths found among the ride id's:
df_complete['ride_id'].apply(len).unique()
# All id's have 16 characters

array([16], dtype=int64)

In [62]:
# Flag every repeated ride id (the first occurrence stays unflagged) and tally the flags:
df_complete.duplicated(subset='ride_id', keep='first').value_counts()
# There are no duplicates of ride ids

False    5828235
dtype: int64

#### Checking for empty cells, null values in single columns:

In [63]:
# Checking for empty cells, null values:
# for every column, print how many cells are null (True) and how many are not (False).
for column in df_complete.columns:
    print(df_complete[column].isnull().value_counts())

# 895032 rows with empty cells or null values in start station description columns
# start_station_id inconsistent length and format

# 958227 rows with empty cells or null values in end station description columns
# end_station_id inconsistent length and format

# 821264 rows where both start and end station is not given
# NOTE(review): this figure is not computed in this cell and looks inconsistent with
# the counts printed here and with the row count left after dropna() further below
# (which together allow at most 505009 such rows) - verify.

# 5844 rows with empty cells or null values in end coordinates columns

False    5828235
Name: ride_id, dtype: int64
False    5828235
Name: rideable_type, dtype: int64
False    5828235
Name: started_at, dtype: int64
False    5828235
Name: ended_at, dtype: int64
False    4933203
True      895032
Name: start_station_name, dtype: int64
False    4933203
True      895032
Name: start_station_id, dtype: int64
False    4870008
True      958227
Name: end_station_name, dtype: int64
False    4870008
True      958227
Name: end_station_id, dtype: int64
False    5828235
Name: start_lat, dtype: int64
False    5828235
Name: start_lng, dtype: int64
False    5822391
True        5844
Name: end_lat, dtype: int64
False    5822391
True        5844
Name: end_lng, dtype: int64
False    5828235
Name: member_casual, dtype: int64


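Some inputs had no recorded station description (names or id’s) or end coordinates: about 15% of the rows lack a start station, about 16% lack an end station and 5844 rows lack end coordinates.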

#### Removing incomplete instances and checking the number of deleted rows:

In [64]:
# Keep only the rows in which every column holds a value:
no_nan_data = df_complete.dropna(axis=0, how='any')
no_nan_data.shape

(4474141, 13)

The dataset without NaN values contains 4474141 rows, which is about 77% of the whole set.

#### Looking for unwanted data – test data:
There were a few instances with “TEST” in the start and end station id columns. None of them should be considered. After deletion there were 4472680 rows left (about 77% of the whole
dataset).

In [65]:
# Searching for the unwanted data - test data.
# Context: TEST found in one of the start stations id's "Hubbard Bike-checking (LBS-WH-TEST)"
# Count how many start station id's contain the text 'TEST' (True) and how many do not (False):
no_nan_data['start_station_id'].str.contains('TEST').value_counts()
# 1207 test rides found in start station id's. None of them should be considered.

False    4472934
True        1207
Name: start_station_id, dtype: int64

In [66]:
# Same check on the start station names:
no_nan_data['start_station_name'].str.contains('TEST').value_counts()
# No test rides found in start station names.

False    4474141
Name: start_station_name, dtype: int64

In [67]:
# Deleting test rides: keep only the rows whose start station id does not contain
# the literal text 'TEST'. `na=False` counts a missing id as "no match", so such a
# row would be kept - the same outcome the former `!= True` comparison produced.
no_test_start_data = no_nan_data[
    ~no_nan_data['start_station_id'].str.contains('TEST', na=False, regex=False)
]

In [68]:
# Count how many of the remaining end station id's contain the text 'TEST':
no_test_start_data['end_station_id'].str.contains('TEST').value_counts()
# 254 test rides found in the remaining end station id's. None of them should be considered.

False    4472680
True         254
Name: end_station_id, dtype: int64

In [69]:
# Same check on the end station names:
no_test_start_data['end_station_name'].str.contains('TEST').value_counts()
# No test rides found in end station names.

False    4472934
Name: end_station_name, dtype: int64

In [70]:
# Deleting test rides: keep only the rows whose end station id does not contain
# the literal text 'TEST'. `na=False` counts a missing id as "no match", so such a
# row would be kept - the same outcome the former `!= True` comparison produced.
no_nan_no_test_data = no_test_start_data[
    ~no_test_start_data['end_station_id'].str.contains('TEST', na=False, regex=False)
]
# 4472680 rows left

## Preparing the data:
#### Adding new column to calculate ride time:

In [71]:
# Dataframe with no test rides and no NaN values; show its first five rows:
no_nan_no_test_data.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
25,614B15BC42810184,docked_bike,2021-10-05 10:56:05,2021-10-05 11:38:48,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
69,ADCC6E3CF9C04688,classic_bike,2021-10-06 13:55:33,2021-10-06 13:58:16,Desplaines St & Kinzie St,TA1306000003,Kingsbury St & Kinzie St,KA1503000043,41.888716,-87.644448,41.889177,-87.638506,member
76,6184CC57243AEF3C,docked_bike,2021-10-16 10:19:43,2021-10-16 12:01:20,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
84,DE02D027BAC5C820,docked_bike,2021-10-24 11:03:34,2021-10-24 13:10:01,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
94,E7C9BADDF2308D0D,classic_bike,2021-10-23 23:33:22,2021-10-23 23:35:27,Kingsbury St & Kinzie St,KA1503000043,Desplaines St & Kinzie St,TA1306000003,41.889177,-87.638506,41.888716,-87.644448,member


In [72]:
# Creating copy of the dataframe to add new column:
df_ridetime = no_nan_no_test_data.copy(deep=True)

# Adding new column with the ride time in seconds (float), placed right after
# "ended_at". `.dt.total_seconds()` is used instead of `.astype('timedelta64[s]')`:
# on pandas >= 2.0 that cast only changes the timedelta resolution and no longer
# returns a number, which breaks the numeric comparisons made on this column later.
df_ridetime.insert(
    loc=4,
    column='ride_time[s]',
    value=(df_ridetime['ended_at'] - df_ridetime['started_at']).dt.total_seconds(),
)

df_ridetime.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,ride_time[s],start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
25,614B15BC42810184,docked_bike,2021-10-05 10:56:05,2021-10-05 11:38:48,2563.0,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
69,ADCC6E3CF9C04688,classic_bike,2021-10-06 13:55:33,2021-10-06 13:58:16,163.0,Desplaines St & Kinzie St,TA1306000003,Kingsbury St & Kinzie St,KA1503000043,41.888716,-87.644448,41.889177,-87.638506,member
76,6184CC57243AEF3C,docked_bike,2021-10-16 10:19:43,2021-10-16 12:01:20,6097.0,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
84,DE02D027BAC5C820,docked_bike,2021-10-24 11:03:34,2021-10-24 13:10:01,7587.0,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,casual
94,E7C9BADDF2308D0D,classic_bike,2021-10-23 23:33:22,2021-10-23 23:35:27,125.0,Kingsbury St & Kinzie St,KA1503000043,Desplaines St & Kinzie St,TA1306000003,41.889177,-87.638506,41.888716,-87.644448,member


All time values were calculated into seconds for further analysis.

#### Checking for negative and irrelevant ride times:
Negative ride time value means an input error which should not be considered. For the
analysis purpose also the ride time below 60 seconds won’t be taken into account.

In [73]:
# Order the rides from the shortest to the longest ride time:
df_rt_sorted = df_ridetime.sort_values('ride_time[s]', ascending=True)

df_rt_sorted.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,ride_time[s],start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
2874804,072E947E156D142D,electric_bike,2022-06-07 19:14:46,2022-06-07 17:07:45,-7621.0,W Armitage Ave & N Sheffield Ave,20254.0,W Armitage Ave & N Sheffield Ave,20254.0,41.92,-87.65,41.92,-87.65,casual
773317,FD8AF7324ABAE9DA,electric_bike,2021-11-07 01:56:51,2021-11-07 01:00:57,-3354.0,Clark St & North Ave,13128,Larrabee St & Webster Ave,13193,41.911738,-87.632145,41.921762,-87.644034,casual
757081,508B09A5FB0737DC,classic_bike,2021-11-07 01:54:50,2021-11-07 01:00:45,-3245.0,Sedgwick St & Webster Ave,13191,Sedgwick St & North Ave,TA1307000038,41.922167,-87.638888,41.911386,-87.638677,member
892046,6F9E76F5EDAAC1B8,electric_bike,2021-11-07 01:55:42,2021-11-07 01:01:55,-3227.0,Milwaukee Ave & Wabansia Ave,13243,Western Ave & Division St,13241,41.91258,-87.681424,41.902906,-87.687367,member
913899,7AECC76D1562B51C,classic_bike,2021-11-07 01:54:58,2021-11-07 01:01:29,-3209.0,Sheffield Ave & Wrightwood Ave,TA1309000023,Southport Ave & Wellington Ave,TA1307000006,41.928712,-87.653833,41.935733,-87.663576,casual


In [74]:
# Removing rides whose ride time is not strictly positive.
# The original check found 71 wrong inputs with negative ride times (rows where
# "started_at" is later than "ended_at"). Because the comparison is `> 0`, rides
# lasting exactly 0 seconds are removed as well.
# The two filtered frames that used to be built here were neither displayed nor
# stored, so they have been dropped.
df_rt_no_neg = df_rt_sorted[df_rt_sorted['ride_time[s]'] > 0]

df_rt_no_neg.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,ride_time[s],start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
3375090,518842A851B2AB09,classic_bike,2022-06-14 22:50:40,2022-06-14 22:50:41,1.0,Aberdeen St & Randolph St,18062,Aberdeen St & Randolph St,18062,41.884114,-87.654264,41.884114,-87.654264,casual
4072471,16F2CC8569385923,classic_bike,2022-07-28 06:51:28,2022-07-28 06:51:29,1.0,Michigan Ave & 18th St,13150,Michigan Ave & 18th St,13150,41.857813,-87.62455,41.857813,-87.62455,casual
999601,D70F197C19431615,classic_bike,2021-12-08 10:32:53,2021-12-08 10:32:54,1.0,Ellis Ave & 60th St,KA1503000014,Ellis Ave & 60th St,KA1503000014,41.785097,-87.601073,41.785097,-87.601073,member
5610384,9A85764878A36E0D,electric_bike,2022-09-09 18:12:20,2022-09-09 18:12:21,1.0,Halsted St & Roosevelt Rd,TA1305000017,Halsted St & Roosevelt Rd,TA1305000017,41.867478,-87.648629,41.867324,-87.648625,member
4925780,B388D7C8C289BCD1,classic_bike,2022-08-03 18:06:42,2022-08-03 18:06:43,1.0,DuSable Lake Shore Dr & North Blvd,LF-005,DuSable Lake Shore Dr & North Blvd,LF-005,41.911722,-87.626804,41.911722,-87.626804,casual


In [75]:
# Rides shorter than 60 s are assumed to be incorrect or irrelevant records
# (73439 such inputs according to the original check) and are removed.
# The filter starts from `df_rt_no_neg` so that the cleaning steps form one chain;
# the selected rows are the same as when filtering `df_rt_sorted` directly, because
# every ride of at least 60 s also has a positive ride time.
MIN_RIDE_TIME_S = 60
df_rt_cleaned = df_rt_no_neg[df_rt_no_neg['ride_time[s]'] >= MIN_RIDE_TIME_S]
df_rt_cleaned.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,ride_time[s],start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
285875,AC270A9376DB67C9,classic_bike,2021-10-10 19:56:44,2021-10-10 19:57:44,60.0,Western Ave & Congress Pkwy,15668,Western Ave & Congress Pkwy,15668,41.874749,-87.686445,41.874749,-87.686445,member
570739,F3E6BBE3606C65C9,classic_bike,2021-10-31 22:14:32,2021-10-31 22:15:32,60.0,LaSalle St & Jackson Blvd,TA1309000004,Dearborn St & Adams St,TA1305000005,41.878166,-87.631929,41.879356,-87.629791,member
2667569,33C3E85CCE041276,classic_bike,2022-05-13 03:44:15,2022-05-13 03:45:15,60.0,Michigan Ave & Oak St,13042,Michigan Ave & Oak St,13042,41.90096,-87.623777,41.90096,-87.623777,member
5156557,0F44DD9C4BFC3E16,electric_bike,2022-09-12 15:12:06,2022-09-12 15:13:06,60.0,Western Ave & Roscoe St,15634,Western Ave & Roscoe St,15634,41.943056,-87.687301,41.943034,-87.687288,member
392319,D083FC64B85F2234,classic_bike,2021-10-13 11:36:32,2021-10-13 11:37:32,60.0,Orleans St & Hubbard St,636,Wells St & Hubbard St,TA1307000151,41.890028,-87.636618,41.889906,-87.634266,member


In [76]:
# Order the cleaned rides chronologically by their start timestamp:
df_sort_date = df_rt_cleaned.sort_values('started_at', ascending=True)
df_sort_date.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,ride_time[s],start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
271143,ABA2BDC3595FC3E9,classic_bike,2021-10-01 00:00:09,2021-10-01 00:10:12,603.0,Morgan St & Lake St,TA1306000015,Noble St & Milwaukee Ave,13290,41.885483,-87.652305,41.90068,-87.6626,casual
117528,0BE9C131A5705D92,classic_bike,2021-10-01 00:00:16,2021-10-01 00:05:29,313.0,Damen Ave & Cortland St,13133,Winchester Ave & Elston Ave,KA1504000140,41.915983,-87.677335,41.924091,-87.67646,casual
341788,74483AC18C8C6B90,classic_bike,2021-10-01 00:00:18,2021-10-01 00:08:52,514.0,Halsted St & Roscoe St,TA1309000025,Greenview Ave & Diversey Pkwy,13294,41.94367,-87.64895,41.93259,-87.665936,casual
200504,F8DBF095F01E1B68,classic_bike,2021-10-01 00:02:22,2021-10-01 00:06:40,258.0,Pine Grove Ave & Irving Park Rd,TA1308000022,Clarendon Ave & Junior Ter,13389,41.954383,-87.648043,41.961004,-87.649603,member
21249,B0B9EB7622461EF4,classic_bike,2021-10-01 00:02:27,2021-10-01 00:17:56,929.0,MLK Jr Dr & 29th St,TA1307000139,Clinton St & Roosevelt Rd,WL-008,41.842052,-87.617,41.867118,-87.641088,member


#### Deleting irrelevant columns such as ride id’s, station id’s and station coordinates:

In [77]:
# For the analysis purposes ride id's, station id's and coordinates are irrelevant.
# List the current column labels before dropping them:
df_sort_date.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at', 'ride_time[s]',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [78]:
# Columns that are not needed for the analysis (ride id, station id's, coordinates):
columns_to_be_dropped = [
    'ride_id',
    'start_station_id', 'end_station_id',
    'start_lat', 'start_lng',
    'end_lat', 'end_lng',
]
df_drop_columns = df_sort_date.drop(labels=columns_to_be_dropped, axis='columns')

df_drop_columns.head(5)

Unnamed: 0,rideable_type,started_at,ended_at,ride_time[s],start_station_name,end_station_name,member_casual
271143,classic_bike,2021-10-01 00:00:09,2021-10-01 00:10:12,603.0,Morgan St & Lake St,Noble St & Milwaukee Ave,casual
117528,classic_bike,2021-10-01 00:00:16,2021-10-01 00:05:29,313.0,Damen Ave & Cortland St,Winchester Ave & Elston Ave,casual
341788,classic_bike,2021-10-01 00:00:18,2021-10-01 00:08:52,514.0,Halsted St & Roscoe St,Greenview Ave & Diversey Pkwy,casual
200504,classic_bike,2021-10-01 00:02:22,2021-10-01 00:06:40,258.0,Pine Grove Ave & Irving Park Rd,Clarendon Ave & Junior Ter,member
21249,classic_bike,2021-10-01 00:02:27,2021-10-01 00:17:56,929.0,MLK Jr Dr & 29th St,Clinton St & Roosevelt Rd,member


#### Adding a column with the day of the week for each ride (0 – Monday, 6 – Sunday):

In [79]:
## Adding a column with the day of the week on which each ride started
# 0 - Monday, 6 - Sunday.
# The weekday is derived from this frame's own "started_at" column rather than from
# `df_ridetime`, so the result no longer depends on index alignment with another
# frame. The guard makes the cell safe to re-run: inserting a column that already
# exists raises a ValueError.
if 'weekday' not in df_drop_columns.columns:
    df_drop_columns.insert(
        loc=4, column='weekday', value=df_drop_columns['started_at'].dt.weekday)

all_rides = df_drop_columns.copy(deep=True)
all_rides.head(5)

Unnamed: 0,rideable_type,started_at,ended_at,ride_time[s],weekday,start_station_name,end_station_name,member_casual
271143,classic_bike,2021-10-01 00:00:09,2021-10-01 00:10:12,603.0,4,Morgan St & Lake St,Noble St & Milwaukee Ave,casual
117528,classic_bike,2021-10-01 00:00:16,2021-10-01 00:05:29,313.0,4,Damen Ave & Cortland St,Winchester Ave & Elston Ave,casual
341788,classic_bike,2021-10-01 00:00:18,2021-10-01 00:08:52,514.0,4,Halsted St & Roscoe St,Greenview Ave & Diversey Pkwy,casual
200504,classic_bike,2021-10-01 00:02:22,2021-10-01 00:06:40,258.0,4,Pine Grove Ave & Irving Park Rd,Clarendon Ave & Junior Ter,member
21249,classic_bike,2021-10-01 00:02:27,2021-10-01 00:17:56,929.0,4,MLK Jr Dr & 29th St,Clinton St & Roosevelt Rd,member


#### Creating new column names for cleaned dataset:

In [80]:
# Mapping from the raw column labels to the labels used in the cleaned dataset:
new_columns = dict([
    ('rideable_type', 'BikeType'),
    ('started_at', 'RideStart'),
    ('ended_at', 'RideEnd'),
    ('ride_time[s]', 'RideTime[s]'),
    ('weekday', 'Weekday'),
    ('start_station_name', 'StartStation'),
    ('end_station_name', 'EndStation'),
    ('member_casual', 'UserType'),
])

# Apply the new labels to the columns:
all_rides = all_rides.rename(mapper=new_columns, axis='columns')

all_rides.head(5)

Unnamed: 0,BikeType,RideStart,RideEnd,RideTime[s],Weekday,StartStation,EndStation,UserType
271143,classic_bike,2021-10-01 00:00:09,2021-10-01 00:10:12,603.0,4,Morgan St & Lake St,Noble St & Milwaukee Ave,casual
117528,classic_bike,2021-10-01 00:00:16,2021-10-01 00:05:29,313.0,4,Damen Ave & Cortland St,Winchester Ave & Elston Ave,casual
341788,classic_bike,2021-10-01 00:00:18,2021-10-01 00:08:52,514.0,4,Halsted St & Roscoe St,Greenview Ave & Diversey Pkwy,casual
200504,classic_bike,2021-10-01 00:02:22,2021-10-01 00:06:40,258.0,4,Pine Grove Ave & Irving Park Rd,Clarendon Ave & Junior Ter,member
21249,classic_bike,2021-10-01 00:02:27,2021-10-01 00:17:56,929.0,4,MLK Jr Dr & 29th St,Clinton St & Roosevelt Rd,member


#### Exporting clean dataset, ready for analysis:

In [81]:
# Make sure the output folder exists (to_csv does not create missing directories),
# then write the cleaned rides to CSV without the row index.
Path('cleaned_data').mkdir(parents=True, exist_ok=True)
all_rides.to_csv('cleaned_data/cyclistic_202110-202209_cleaned.csv', index=False)