# Load Data

In this project, we will be working with Uber and Lyft data. The dataset can be found here:

https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma

# Data Cleaning

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing 

In [127]:
uber_data = pd.read_csv(r"rideshare_kaggle.csv")

In [128]:
uber_data['name'].value_counts()

name
UberXL          55096
WAV             55096
Black SUV       55096
Black           55095
Taxi            55095
UberX           55094
UberPool        55091
Lux             51235
Lyft            51235
Lux Black XL    51235
Lyft XL         51235
Lux Black       51235
Shared          51233
Name: count, dtype: int64

In [45]:
uber_data['name'].nunique()

13

In [47]:
uber_data.columns

Index(['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'timezone',
       'source', 'destination', 'cab_type', 'product_id', 'name', 'price',
       'distance', 'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperat

In [75]:
object_df = uber_data.select_dtypes("object")
object_df

Unnamed: 0,id,datetime,timezone,source,destination,cab_type,product_id,name,short_summary,long_summary,icon
0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,2018-12-16 09:30:07,America/New_York,Haymarket Square,North Station,Lyft,lyft_line,Shared,Mostly Cloudy,Rain throughout the day.,partly-cloudy-night
1,4bd23055-6827-41c6-b23b-3c491f24e74d,2018-11-27 02:00:23,America/New_York,Haymarket Square,North Station,Lyft,lyft_premier,Lux,Rain,"Rain until morning, starting again in the eve...",rain
2,981a3613-77af-4620-a42a-0c0866077d1e,2018-11-28 01:00:22,America/New_York,Haymarket Square,North Station,Lyft,lyft,Lyft,Clear,Light rain in the morning.,clear-night
3,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,2018-11-30 04:53:02,America/New_York,Haymarket Square,North Station,Lyft,lyft_luxsuv,Lux Black XL,Clear,Partly cloudy throughout the day.,clear-night
4,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,2018-11-29 03:49:20,America/New_York,Haymarket Square,North Station,Lyft,lyft_plus,Lyft XL,Partly Cloudy,Mostly cloudy throughout the day.,partly-cloudy-night
...,...,...,...,...,...,...,...,...,...,...,...
693066,616d3611-1820-450a-9845-a9ff304a4842,2018-12-01 23:53:05,America/New_York,West End,North End,Uber,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,Partly Cloudy,Light rain in the morning and overnight.,partly-cloudy-night
693067,633a3fc3-1f86-4b9e-9d48-2b7132112341,2018-12-01 23:53:05,America/New_York,West End,North End,Uber,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,Partly Cloudy,Light rain in the morning and overnight.,partly-cloudy-night
693068,64d451d0-639f-47a4-9b7c-6fd92fbd264f,2018-12-01 23:53:05,America/New_York,West End,North End,Uber,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi,Partly Cloudy,Light rain in the morning and overnight.,partly-cloudy-night
693069,727e5f07-a96b-4ad1-a2c7-9abc3ad55b4e,2018-12-01 23:53:05,America/New_York,West End,North End,Uber,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,Partly Cloudy,Light rain in the morning and overnight.,partly-cloudy-night


In [79]:
num_df = uber_data.select_dtypes("number")
num_df

Unnamed: 0,timestamp,hour,day,month,price,distance,surge_multiplier,latitude,longitude,temperature,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
0,1.544953e+09,9,16,12,5.0,0.44,1.0,42.2148,-71.0330,42.34,...,0.1276,1544979600,39.89,1545012000,43.68,1544968800,33.73,1545012000,38.07,1544958000
1,1.543284e+09,2,27,11,11.0,0.44,1.0,42.2148,-71.0330,43.58,...,0.1300,1543251600,40.49,1543233600,47.30,1543251600,36.20,1543291200,43.92,1543251600
2,1.543367e+09,1,28,11,7.0,0.44,1.0,42.2148,-71.0330,38.33,...,0.1064,1543338000,35.36,1543377600,47.55,1543320000,31.04,1543377600,44.12,1543320000
3,1.543554e+09,4,30,11,26.0,0.44,1.0,42.2148,-71.0330,34.38,...,0.0000,1543507200,34.67,1543550400,45.03,1543510800,30.30,1543550400,38.53,1543510800
4,1.543463e+09,3,29,11,9.0,0.44,1.0,42.2148,-71.0330,37.44,...,0.0001,1543420800,33.10,1543402800,42.18,1543420800,29.11,1543392000,35.75,1543420800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693066,1.543708e+09,23,1,12,13.0,1.00,1.0,42.3519,-71.0643,37.05,...,0.0000,1543683600,31.42,1543658400,44.76,1543690800,27.77,1543658400,44.09,1543690800
693067,1.543708e+09,23,1,12,9.5,1.00,1.0,42.3519,-71.0643,37.05,...,0.0000,1543683600,31.42,1543658400,44.76,1543690800,27.77,1543658400,44.09,1543690800
693068,1.543708e+09,23,1,12,,1.00,1.0,42.3519,-71.0643,37.05,...,0.0000,1543683600,31.42,1543658400,44.76,1543690800,27.77,1543658400,44.09,1543690800
693069,1.543708e+09,23,1,12,27.0,1.00,1.0,42.3519,-71.0643,37.05,...,0.0000,1543683600,31.42,1543658400,44.76,1543690800,27.77,1543658400,44.09,1543690800


In [81]:
extra_cols = [ 'product_id','timezone','latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability','apparentTemperature', 'precipIntensity' ,
       'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax', 'apparentTemperatureMaxTime'
]

uber_data = uber_data.drop(extra_cols, axis=1)              

In [83]:
uber_data.shape

(693071, 13)

In [85]:
uber_data.columns

Index(['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'source',
       'destination', 'cab_type', 'name', 'price', 'distance',
       'surge_multiplier'],
      dtype='object')

In [87]:
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                693071 non-null  object 
 1   timestamp         693071 non-null  float64
 2   hour              693071 non-null  int64  
 3   day               693071 non-null  int64  
 4   month             693071 non-null  int64  
 5   datetime          693071 non-null  object 
 6   source            693071 non-null  object 
 7   destination       693071 non-null  object 
 8   cab_type          693071 non-null  object 
 9   name              693071 non-null  object 
 10  price             637976 non-null  float64
 11  distance          693071 non-null  float64
 12  surge_multiplier  693071 non-null  float64
dtypes: float64(4), int64(3), object(6)
memory usage: 68.7+ MB


In [61]:
uber_data.describe()

Unnamed: 0,timestamp,hour,day,month,price,distance,surge_multiplier
count,693071.0,693071.0,693071.0,693071.0,637976.0,693071.0,693071.0
mean,1544046000.0,11.619137,17.794365,11.586684,16.545125,2.18943,1.01387
std,689192.5,6.948114,9.982286,0.492429,9.324359,1.138937,0.091641
min,1543204000.0,0.0,1.0,11.0,2.5,0.02,1.0
25%,1543444000.0,6.0,13.0,11.0,9.0,1.28,1.0
50%,1543737000.0,12.0,17.0,12.0,13.5,2.16,1.0
75%,1544828000.0,18.0,28.0,12.0,22.5,2.92,1.0
max,1545161000.0,23.0,30.0,12.0,97.5,7.86,3.0


In [89]:
print(uber_data.isnull().sum())
print(uber_data.isnull().sum().sum())
uber_dataset=uber_data.dropna()

id                      0
timestamp               0
hour                    0
day                     0
month                   0
datetime                0
source                  0
destination             0
cab_type                0
name                    0
price               55095
distance                0
surge_multiplier        0
dtype: int64
55095


In [95]:
uber_dataset.shape

(637976, 14)

# Converting dataset into CSV format

In [117]:
uber_dataset.to_csv(r'data.csv')

In [118]:
uber_dataset = pd.read_csv(r"data.csv")
uber_dataset

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,timestamp,hour,day,month,datetime,source,destination,cab_type,name,price,distance,surge_multiplier
0,0,0,0,0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,1.544953e+09,9,16,12,2018-12-16 09:30:07,Haymarket Square,North Station,Lyft,Shared,5.0,0.44,1.0
1,1,1,1,1,4bd23055-6827-41c6-b23b-3c491f24e74d,1.543284e+09,2,27,11,2018-11-27 02:00:23,Haymarket Square,North Station,Lyft,Lux,11.0,0.44,1.0
2,2,2,2,2,981a3613-77af-4620-a42a-0c0866077d1e,1.543367e+09,1,28,11,2018-11-28 01:00:22,Haymarket Square,North Station,Lyft,Lyft,7.0,0.44,1.0
3,3,3,3,3,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,1.543554e+09,4,30,11,2018-11-30 04:53:02,Haymarket Square,North Station,Lyft,Lux Black XL,26.0,0.44,1.0
4,4,4,4,4,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,1.543463e+09,3,29,11,2018-11-29 03:49:20,Haymarket Square,North Station,Lyft,Lyft XL,9.0,0.44,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637971,637971,637971,637971,693065,353e6566-b272-479e-a9c6-98bd6cb23f25,1.543708e+09,23,1,12,2018-12-01 23:53:05,West End,North End,Uber,WAV,9.5,1.00,1.0
637972,637972,637972,637972,693066,616d3611-1820-450a-9845-a9ff304a4842,1.543708e+09,23,1,12,2018-12-01 23:53:05,West End,North End,Uber,UberXL,13.0,1.00,1.0
637973,637973,637973,637973,693067,633a3fc3-1f86-4b9e-9d48-2b7132112341,1.543708e+09,23,1,12,2018-12-01 23:53:05,West End,North End,Uber,UberX,9.5,1.00,1.0
637974,637974,637974,637974,693069,727e5f07-a96b-4ad1-a2c7-9abc3ad55b4e,1.543708e+09,23,1,12,2018-12-01 23:53:05,West End,North End,Uber,Black SUV,27.0,1.00,1.0
