## Data cleaning

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
import missingno as msno
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

In [2]:
# Load the csv file
csv = pd.read_csv("bicycle_sharing.csv", low_memory=False)
csv.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Year
0,A847FADBBC638E45,docked_bike,2020-04-26 17:45:14,2020-04-26 18:12:03,Eckhart Park,86,Lincoln Ave & Diversey Pkwy,152.0,41.9,-87.66,41.93,-87.66,member,2020
1,5405B80E996FF60D,docked_bike,2020-04-17 17:08:54,2020-04-17 17:17:03,Drake Ave & Fullerton Ave,503,Kosciuszko Park,499.0,41.92,-87.72,41.93,-87.72,member,2020
2,5DD24A79A4E006F4,docked_bike,2020-04-01 17:54:13,2020-04-01 18:08:36,McClurg Ct & Erie St,142,Indiana Ave & Roosevelt Rd,255.0,41.89,-87.62,41.87,-87.62,member,2020
3,2A59BBDF5CDBA725,docked_bike,2020-04-07 12:50:19,2020-04-07 13:02:31,California Ave & Division St,216,Wood St & Augusta Blvd,657.0,41.9,-87.7,41.9,-87.67,member,2020
4,27AD306C119C6158,docked_bike,2020-04-18 10:22:59,2020-04-18 11:15:54,Rush St & Hubbard St,125,Sheridan Rd & Lawrence Ave,323.0,41.89,-87.63,41.97,-87.65,casual,2020


In [3]:
csv.info(memory_usage="deep")
### Memory usage is very high

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14758322 entries, 0 to 14758321
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
 13  Year                int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 9.2 GB


### Missing data
- why is there missing data for station start and end data?
- is station id important? Probably not

In [4]:
# Percent of missing data, sorted descending order
(csv.apply(pd.isnull).sum()/csv.shape[0] * 100).sort_values(ascending=False)

end_station_id       12.18
end_station_name     12.18
start_station_id     11.33
start_station_name   11.32
end_lat               0.10
end_lng               0.10
ride_id               0.00
rideable_type         0.00
started_at            0.00
ended_at              0.00
start_lat             0.00
start_lng             0.00
member_casual         0.00
Year                  0.00
dtype: float64

In [5]:
null_data = csv[csv.isnull().any(axis=1)]

In [7]:
null_data.sample(20)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Year
2149591,9B5C45833ADE1797,electric_bike,2020-09-15 19:15:05,2020-09-15 19:15:37,,,,,41.97,-87.7,41.97,-87.7,member,2020
10806637,FED1648B5261BA34,electric_bike,2022-06-15 18:50:15,2022-06-15 18:58:41,,,,,41.97,-87.7,41.99,-87.71,member,2022
6542850,9CD07964506C5F50,electric_bike,2021-08-17 17:15:23,2021-08-17 17:38:38,Halsted St & Roscoe St,TA1309000025,,,41.94,-87.65,41.95,-87.65,casual,2021
13666675,97F2BB3D2349EDA1,electric_bike,2022-10-28 05:13:16,2022-10-28 05:20:38,Greenwood Ave & 47th St,TA1308000002,,,41.81,-87.6,41.8,-87.62,member,2022
14618564,41B85289EFC5B91F,electric_bike,2023-02-27 13:12:53,2023-02-27 13:15:44,Campbell Ave & North Ave,13257,,,41.91,-87.69,41.91,-87.7,member,2023
10829741,3720B126A888F9B2,electric_bike,2022-06-21 13:06:42,2022-06-21 13:49:39,Streeter Dr & Grand Ave,13022,,,41.89,-87.61,41.89,-87.6,casual,2022
8662767,101B2CC55EFE7C55,electric_bike,2021-12-15 20:57:49,2021-12-15 21:04:14,Clinton St & Lake St,13021,,,41.89,-87.64,41.89,-87.63,casual,2021
13030973,9519A38B0CE83A40,electric_bike,2022-09-14 19:12:18,2022-09-14 19:27:42,Halsted St & 18th St,13099,,,41.86,-87.65,41.88,-87.67,member,2022
8217527,F82FFC1E85FF924E,electric_bike,2021-11-24 05:57:03,2021-11-24 06:02:31,,,Fairbanks St & Superior St,18003,41.89,-87.62,41.9,-87.62,member,2021
7965731,9864273649FA12CD,electric_bike,2021-10-01 17:48:55,2021-10-01 18:14:48,Milwaukee Ave & Rockwell St,13242,,,41.92,-87.69,41.95,-87.66,casual,2021


### Ride IDs
- Ride ID column has a few duplicate rows

In [12]:
# Compare the length of the entire dataset with the number of unique ride IDs 
# This value will return false
# Indicates duplicate? 
len(csv["ride_id"]) == csv["ride_id"].nunique()

False

In [10]:
csv["ride_id"].nunique()

14758113

In [15]:
csv[csv.duplicated("ride_id", keep=False) == True]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Year
2753661,8758E473B457691C,docked_bike,2020-11-25 08:54:55,2020-11-25 09:00:16,Wells St & Polk St,175.0,LaSalle St & Washington St,98.0,41.87,-87.63,41.88,-87.63,member,2020
2755411,47E0C01E8F7BD830,docked_bike,2020-11-25 17:38:30,2020-11-25 18:06:03,Ellis Ave & 60th St,426.0,Lake Park Ave & 56th St,345.0,41.79,-87.60,41.79,-87.59,member,2020
2771017,4AE7C88494448250,docked_bike,2020-11-25 09:19:07,2020-11-25 09:27:26,Paulina St & 18th St,205.0,Wolcott Ave & Polk St,342.0,41.86,-87.67,41.87,-87.67,member,2020
2771185,461DB5E36ADBF190,docked_bike,2020-11-25 15:33:21,2020-11-25 15:35:07,Clinton St & Lake St,66.0,Clinton St & Lake St,66.0,41.89,-87.64,41.89,-87.64,casual,2020
2772563,203E4F607EB792FC,docked_bike,2020-11-25 19:26:30,2020-11-25 19:32:58,Clark St & Chicago Ave,337.0,Larrabee St & Kingsbury St,48.0,41.90,-87.63,41.90,-87.64,member,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3111833,6C1279812BD9D0C2,docked_bike,2020-12-15 12:08:00,2020-11-26 08:25:29,Woodlawn Ave & 55th St,TA1307000164,Blackstone Ave & Hyde Park Blvd,13398,41.80,-87.60,41.80,-87.59,casual,2020
3112006,5EE680AA8D46F8B8,docked_bike,2020-12-15 12:14:03,2020-11-25 09:23:46,Calumet Ave & 18th St,13102,Wabash Ave & 9th St,TA1309000010,41.86,-87.62,41.87,-87.63,member,2020
3112867,286E58C4B46877DE,docked_bike,2020-12-15 11:55:29,2020-11-25 16:16:51,Honore St & Division St,TA1305000034,Damen Ave & Clybourn Ave,13271,41.90,-87.67,41.93,-87.68,member,2020
3113600,7074FF42B83EC6AD,docked_bike,2020-12-15 11:50:14,2020-11-25 13:42:44,Peoria St & Jackson Blvd,13158,Clinton St & Madison St,TA1305000032,41.88,-87.65,41.88,-87.64,member,2020


In [17]:
csv[csv["ride_id"].duplicated(keep=False)]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Year
2753661,8758E473B457691C,docked_bike,2020-11-25 08:54:55,2020-11-25 09:00:16,Wells St & Polk St,175.0,LaSalle St & Washington St,98.0,41.87,-87.63,41.88,-87.63,member,2020
2755411,47E0C01E8F7BD830,docked_bike,2020-11-25 17:38:30,2020-11-25 18:06:03,Ellis Ave & 60th St,426.0,Lake Park Ave & 56th St,345.0,41.79,-87.60,41.79,-87.59,member,2020
2771017,4AE7C88494448250,docked_bike,2020-11-25 09:19:07,2020-11-25 09:27:26,Paulina St & 18th St,205.0,Wolcott Ave & Polk St,342.0,41.86,-87.67,41.87,-87.67,member,2020
2771185,461DB5E36ADBF190,docked_bike,2020-11-25 15:33:21,2020-11-25 15:35:07,Clinton St & Lake St,66.0,Clinton St & Lake St,66.0,41.89,-87.64,41.89,-87.64,casual,2020
2772563,203E4F607EB792FC,docked_bike,2020-11-25 19:26:30,2020-11-25 19:32:58,Clark St & Chicago Ave,337.0,Larrabee St & Kingsbury St,48.0,41.90,-87.63,41.90,-87.64,member,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3111833,6C1279812BD9D0C2,docked_bike,2020-12-15 12:08:00,2020-11-26 08:25:29,Woodlawn Ave & 55th St,TA1307000164,Blackstone Ave & Hyde Park Blvd,13398,41.80,-87.60,41.80,-87.59,casual,2020
3112006,5EE680AA8D46F8B8,docked_bike,2020-12-15 12:14:03,2020-11-25 09:23:46,Calumet Ave & 18th St,13102,Wabash Ave & 9th St,TA1309000010,41.86,-87.62,41.87,-87.63,member,2020
3112867,286E58C4B46877DE,docked_bike,2020-12-15 11:55:29,2020-11-25 16:16:51,Honore St & Division St,TA1305000034,Damen Ave & Clybourn Ave,13271,41.90,-87.67,41.93,-87.68,member,2020
3113600,7074FF42B83EC6AD,docked_bike,2020-12-15 11:50:14,2020-11-25 13:42:44,Peoria St & Jackson Blvd,13158,Clinton St & Madison St,TA1305000032,41.88,-87.65,41.88,-87.64,member,2020


For some reason all the duplicated rows have a second "ride_id" that has a start date of *December 15, 2020* and an end date of *November 25, 2020*.
I've checked multiple "ride_id"s and it is the same way for all of them. As the "ride_id" is supposed to be a unique transaction ID I can only assume there's some kind of error in the data itself.
I am confident that the best solution is to get rid of the duplicated rows for a more accurate analysis. 

In [22]:
csv[csv["ride_id"] == "47E0C01E8F7BD830"]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Year
2755411,47E0C01E8F7BD830,docked_bike,2020-11-25 17:38:30,2020-11-25 18:06:03,Ellis Ave & 60th St,426.0,Lake Park Ave & 56th St,345.0,41.79,-87.6,41.79,-87.59,member,2020
3110497,47E0C01E8F7BD830,docked_bike,2020-12-15 12:00:02,2020-11-25 18:06:03,Ellis Ave & 60th St,KA1503000014,Lake Park Ave & 56th St,TA1309000063,41.79,-87.6,41.79,-87.59,member,2020


In [24]:
# Drop duplicate entries on the "ride_id" column
csv = csv.drop_duplicates(subset="ride_id",keep="first")

### Drop unneccesary columns
Now that the duplicated rows based on the "ride_id" have been dealt with I'm going to remove the following columns:

-"ride_id": provides no relevant information

-start_station_id: station name is enough information

-end_station_id: station name is enough information

In [26]:
cols_to_drop =[
    "ride_id",
    "start_station_id",
    "end_station_id"
]

new_df = csv.drop(cols_to_drop, axis = 1)
new_df

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,Year
0,docked_bike,2020-04-26 17:45:14,2020-04-26 18:12:03,Eckhart Park,Lincoln Ave & Diversey Pkwy,41.90,-87.66,41.93,-87.66,member,2020
1,docked_bike,2020-04-17 17:08:54,2020-04-17 17:17:03,Drake Ave & Fullerton Ave,Kosciuszko Park,41.92,-87.72,41.93,-87.72,member,2020
2,docked_bike,2020-04-01 17:54:13,2020-04-01 18:08:36,McClurg Ct & Erie St,Indiana Ave & Roosevelt Rd,41.89,-87.62,41.87,-87.62,member,2020
3,docked_bike,2020-04-07 12:50:19,2020-04-07 13:02:31,California Ave & Division St,Wood St & Augusta Blvd,41.90,-87.70,41.90,-87.67,member,2020
4,docked_bike,2020-04-18 10:22:59,2020-04-18 11:15:54,Rush St & Hubbard St,Sheridan Rd & Lawrence Ave,41.89,-87.63,41.97,-87.65,casual,2020
...,...,...,...,...,...,...,...,...,...,...,...
14758317,classic_bike,2023-02-08 21:57:22,2023-02-08 22:08:06,Clark St & Wrightwood Ave,Sheffield Ave & Waveland Ave,41.93,-87.64,41.95,-87.65,member,2023
14758318,electric_bike,2023-02-19 11:29:09,2023-02-19 11:39:11,Ogden Ave & Roosevelt Rd,Delano Ct & Roosevelt Rd,41.87,-87.68,41.87,-87.63,member,2023
14758319,electric_bike,2023-02-07 09:01:33,2023-02-07 09:16:53,Clark St & Wrightwood Ave,Canal St & Madison St,41.93,-87.64,41.88,-87.64,casual,2023
14758320,electric_bike,2023-02-22 08:33:22,2023-02-22 08:50:11,Clark St & Wrightwood Ave,Canal St & Madison St,41.93,-87.64,41.88,-87.64,casual,2023


In [27]:
new_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14758113 entries, 0 to 14758321
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   rideable_type       object 
 1   started_at          object 
 2   ended_at            object 
 3   start_station_name  object 
 4   end_station_name    object 
 5   start_lat           float64
 6   start_lng           float64
 7   end_lat             float64
 8   end_lng             float64
 9   member_casual       object 
 10  Year                int64  
dtypes: float64(4), int64(1), object(6)
memory usage: 6.6 GB


### Dealing with time data
Pandas has set the "started_at" and "ended_at" columns to an object type, rather than a date time object.
I'll need to change some of these columns to a date time object for lower memory usage and usability.

In [38]:
new_df["started_at"] = pd.to_datetime(new_df["started_at"],format = "%Y-%m-%d %H:%M:%S")
new_df["ended_at"] = pd.to_datetime(new_df["ended_at"],format = "%Y-%m-%d %H:%M:%S")

In [40]:
new_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14758113 entries, 0 to 14758321
Data columns (total 11 columns):
 #   Column              Dtype         
---  ------              -----         
 0   rideable_type       object        
 1   started_at          datetime64[ns]
 2   ended_at            datetime64[ns]
 3   start_station_name  object        
 4   end_station_name    object        
 5   start_lat           float64       
 6   start_lng           float64       
 7   end_lat             float64       
 8   end_lng             float64       
 9   member_casual       object        
 10  Year                int64         
dtypes: datetime64[ns](2), float64(4), int64(1), object(4)
memory usage: 4.8 GB
