In [66]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [67]:
df = pd.read_csv("./cycling data.csv")
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,A847FADBBC638E45,docked_bike,2020-04-26 17:45,2020-04-26 18:12,Eckhart Park,86,Lincoln Ave & Diversey Pkwy,152.0,41.8964,-87.661,41.9322,-87.6586,member
1,5405B80E996FF60D,docked_bike,2020-04-17 17:08,2020-04-17 17:17,Drake Ave & Fullerton Ave,503,Kosciuszko Park,499.0,41.9244,-87.7154,41.9306,-87.7238,member
2,5DD24A79A4E006F4,docked_bike,2020-04-01 17:54,2020-04-01 18:08,McClurg Ct & Erie St,142,Indiana Ave & Roosevelt Rd,255.0,41.8945,-87.6179,41.8679,-87.623,member
3,2A59BBDF5CDBA725,docked_bike,2020-04-07 12:50,2020-04-07 13:02,California Ave & Division St,216,Wood St & Augusta Blvd,657.0,41.903,-87.6975,41.8992,-87.6722,member
4,27AD306C119C6158,docked_bike,2020-04-18 10:22,2020-04-18 11:15,Rush St & Hubbard St,125,Sheridan Rd & Lawrence Ave,323.0,41.8902,-87.6262,41.9695,-87.6547,casual


### Data Cleaning

#### Handling Missing Values

In [68]:
# Size of dataframe
number_of_rows = df.shape[0]

# Check for null values column-wise
column_null_counts = df.isnull().sum()

# Display the result
print("Number of Rows in DataFrame:", number_of_rows)
print('--------------------')
print(column_null_counts)

Number of Rows in DataFrame: 84776
--------------------
ride_id                0
rideable_type          0
started_at             0
ended_at               0
start_station_name     0
start_station_id       0
end_station_name      99
end_station_id        99
start_lat              0
start_lng              0
end_lat               99
end_lng               99
member_casual          0
dtype: int64


In [69]:
# Action: Removed rows with missing values to ensure data completeness.
# Justification: The quantity of missing values is negligible in comparison to the total number of rows in the dataframe
df_cleaned = df.dropna()

In [70]:
# Size of dataframe
number_of_rows = df_cleaned.shape[0]

# Check for null values column-wise
column_null_counts = df_cleaned.isnull().sum()

# Display the result
print("Number of Rows in DataFrame:", number_of_rows)
print('--------------------')
print(column_null_counts)

Number of Rows in DataFrame: 84677
--------------------
ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64


#### Handling Duplicate Rows

In [71]:
# Check for duplicates across all columns
duplicates = df_cleaned.duplicated()
duplicate_rows = df_cleaned[df_cleaned.duplicated()]

# Display the result
print(duplicates)
print('--------------------')
print(duplicate_rows)

0        False
1        False
2        False
3        False
4        False
         ...  
84771    False
84772    False
84773    False
84774    False
84775    False
Length: 84677, dtype: bool
--------------------
Empty DataFrame
Columns: [ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual]
Index: []


#### Handling Data Types

In [72]:
# Data types of columns
data_types = df_cleaned.dtypes

# Display the result
print(data_types)

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id        int64
end_station_name       object
end_station_id        float64
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object


In [73]:
# Converting 'started_at' column to datetime
df_cleaned['started_at'] = pd.to_datetime(df_cleaned['started_at'])

# Converting 'ended_at' column to datetime
df_cleaned['ended_at'] = pd.to_datetime(df_cleaned['ended_at'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['started_at'] = pd.to_datetime(df_cleaned['started_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ended_at'] = pd.to_datetime(df_cleaned['ended_at'])


In [74]:
# Checking conversion
data_types = df_cleaned.dtypes

# Display the result
print(data_types)

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id               int64
end_station_name              object
end_station_id               float64
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object


In [75]:
# Exporting cleaning data for exploratory data analysis
df_cleaned.to_csv('cleaned_data.csv', index=False)
