In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uber-lyft-cab-prices/cab_rides.csv
/kaggle/input/uber-lyft-cab-prices/weather.csv
/kaggle/input/uber-lyft-cab-prices/Cab-Weather Data/Cab-Weather Data/cab_rides.txt
/kaggle/input/uber-lyft-cab-prices/Cab-Weather Data/Cab-Weather Data/weather.txt
/kaggle/input/uber-lyft-cab-prices/cab-weather data/Cab-Weather Data/cab_rides.txt
/kaggle/input/uber-lyft-cab-prices/cab-weather data/Cab-Weather Data/weather.txt


In [2]:
cab_rides = pd.read_csv("/kaggle/input/uber-lyft-cab-prices/cab_rides.csv")
weather = pd.read_csv("/kaggle/input/uber-lyft-cab-prices/weather.csv")

In [3]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6276 entries, 0 to 6275
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   temp        6276 non-null   float64
 1   location    6276 non-null   object 
 2   clouds      6276 non-null   float64
 3   pressure    6276 non-null   float64
 4   rain        894 non-null    float64
 5   time_stamp  6276 non-null   int64  
 6   humidity    6276 non-null   float64
 7   wind        6276 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 392.4+ KB


In [4]:
# convert time features
cab_rides["datetime"] = pd.to_datetime(cab_rides["time_stamp"], unit='ms')
weather["datetime"] = pd.to_datetime(weather["time_stamp"], unit='s')

In [5]:
cab_rides.sample(3)

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,datetime
55406,2.15,Lyft,1543517284610,South Station,North Station,27.5,1.0,7766d810-e959-44fc-a932-35c87e773d5f,lyft_luxsuv,Lux Black XL,2018-11-29 18:48:04.610
156546,2.12,Uber,1544723110642,South Station,North Station,8.5,1.0,d89cfc7e-f44d-4acd-aa6a-f2f281208fc2,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,2018-12-13 17:45:10.642
158083,1.16,Uber,1543586581243,Theatre District,Haymarket Square,17.0,1.0,e7877dfa-3eed-4ae8-b641-019ba56baf36,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-11-30 14:03:01.243


In [6]:
# create date features
def extract_date_features(data):
    year = data.dt.year
    month = data.dt.month
    day = data.dt.day
    day_name = data.dt.day_name()
    hour = data.dt.hour
    
    return year, month, day, day_name, hour

In [7]:
year, month, day, day_name, hour = extract_date_features(cab_rides["datetime"])
cab_rides["date"] = weather["datetime"].dt.date
cab_rides["year"] = year
cab_rides["month"] = month
cab_rides["day_name"] = day_name
cab_rides["day"] = day
cab_rides["hour"] = hour

In [8]:
rides_per_type = cab_rides.groupby("cab_type").count()["distance"].reset_index()
fig = px.pie(rides_per_type,
            values="distance")

fig.show(renderer='iframe_connected')

# Pick-up Points

In [9]:
pick_up = cab_rides.groupby(["source", "cab_type"]).count()["distance"].reset_index()
pick_up.rename(columns={'distance':'pick-up count'}, inplace=True)
fig = px.sunburst(pick_up, path=['cab_type', 'source'], values='pick-up count',
                  color='cab_type')
fig.show(renderer='iframe_connected')

In [10]:
dest = cab_rides.groupby(["destination", "cab_type"]).count()["distance"].reset_index()
dest.rename(columns={'distance':'des count'}, inplace=True)
fig = px.sunburst(dest, path=['cab_type', 'destination'], values='des count',
                  color='cab_type')
fig.show(renderer='iframe_connected')

In [11]:
cab_rides["routes"] = cab_rides["source"]+" - " +cab_rides["destination"]
routes = cab_rides.groupby(["routes", "cab_type" ]).agg({
    "price":"mean",
    "id":"count",
}).reset_index()
routes.rename(columns={"id":"ride_count"}, inplace=True)

In [12]:
fig = px.scatter(routes,
                x="ride_count",
                y="price",
                marginal_x='histogram',
                marginal_y='box',
                color="cab_type").update_layout(template="plotly_white")
fig.show(renderer='iframe_connected')

In [13]:
cab_rides.sample(3)

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,datetime,date,year,month,day_name,day,hour,routes
465521,2.61,Uber,1544855406783,Beacon Hill,Northeastern University,15.0,1.0,8a7a04da-5706-4aa6-b1a9-9ac269ebf218,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-12-15 06:30:06.783,,2018,12,Saturday,15,6,Northeastern University - Beacon Hill
471921,3.01,Uber,1543721585458,Boston University,North Station,11.0,1.0,57c63ce4-b64a-49a6-9f6c-7392359bbba7,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,2018-12-02 03:33:05.458,,2018,12,Sunday,2,3,North Station - Boston University
194880,4.53,Lyft,1545059403753,Fenway,Financial District,32.5,1.0,b83e54b6-154d-450f-81c7-ccffc888cb99,lyft_lux,Lux Black,2018-12-17 15:10:03.753,,2018,12,Monday,17,15,Financial District - Fenway


In [14]:
monthly_avg_rides = cab_rides.groupby(["day", "day_name", "hour","surge_multiplier"]).agg({
    "id":"count",
    "distance":"mean",
    "price":"mean"
}).reset_index()
monthly_avg_rides

Unnamed: 0,day,day_name,hour,surge_multiplier,id,distance,price
0,1,Saturday,0,1.00,1818,2.153174,16.129177
1,1,Saturday,0,1.25,34,2.210294,22.955882
2,1,Saturday,0,1.50,6,1.803333,22.500000
3,1,Saturday,0,1.75,14,1.780000,28.392857
4,1,Saturday,1,1.00,1835,2.174005,16.413834
...,...,...,...,...,...,...,...
1726,30,Friday,23,1.00,1825,2.103244,15.921241
1727,30,Friday,23,1.25,27,2.764074,23.370370
1728,30,Friday,23,1.50,10,2.310000,29.150000
1729,30,Friday,23,1.75,6,2.150000,27.416667


In [15]:
daily_average_rides = monthly_avg_rides.groupby(["day_name", "hour","surge_multiplier"]).agg({
    "id":"mean",
    "distance":"mean",
    "price":"mean"
}).reset_index()
daily_average_rides.rename(columns={"id":"ride_count"}, inplace=True)
daily_average_rides["hour"] = daily_average_rides["hour"].astype(str)
# daily_average_rides = daily_average_rides.round(2)
daily_average_rides.head()

Unnamed: 0,day_name,hour,surge_multiplier,ride_count,distance,price
0,Friday,0,1.0,1813.5,2.147132,15.926382
1,Friday,0,1.25,28.5,2.333426,24.241667
2,Friday,0,1.5,12.0,2.703333,26.354167
3,Friday,0,1.75,6.0,2.381111,32.416667
4,Friday,0,2.0,11.5,2.650437,42.28373


In [16]:
surge_pivot = pd.pivot_table(daily_average_rides,
                            values="surge_multiplier",
                            index="day_name",
                            columns="hour")
fig = px.imshow(surge_pivot, color_continuous_scale="RdBu_r",text_auto=True)
fig.show(renderer='iframe_connected')

In [17]:
price_pivot = pd.pivot_table(daily_average_rides,
                            values="price",
                            index="day_name",
                            columns="hour")
fig = px.imshow(price_pivot, color_continuous_scale="viridis",text_auto=True).update_layout(template="plotly_white")
fig.show(renderer='iframe_connected')

In [18]:
distance_pivot = pd.pivot_table(daily_average_rides,
                            values="distance",
                            index="day_name",
                            columns="hour")
fig = px.imshow(distance_pivot, color_continuous_scale="viridis",text_auto=True).update_layout(template="plotly_white")
fig.show(renderer='iframe_connected')

In [19]:
ride_pivot = pd.pivot_table(daily_average_rides,
                            values="ride_count",
                            index="day_name",
                            columns="hour")
# lineplot version
# fig = px.line(daily_average_rides,
#              x="hour",
#              y="ride_count",
#              color="day_name").update_layout(template="plotly_white")
fig = px.imshow(ride_pivot, color_continuous_scale="viridis",text_auto=True).update_layout(template="plotly_white")
fig.show(renderer='iframe_connected')