In [1]:
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.legend import Legend

# File to Load (Remember to change these)
city_data_to_load = "data/city_data.csv"
ride_data_to_load = "data/ride_data.csv"

# Read the City and Ride Data
city_data = pd.read_csv(city_data_to_load)
ride_data = pd.read_csv(ride_data_to_load)

In [2]:
city_data.head() 

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban


In [3]:
city_data.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [4]:
ride_data.head() 

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344


In [5]:
ride_data.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [6]:
city_data.count()

city            120
driver_count    120
type            120
dtype: int64

In [7]:
ride_data.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [8]:
# Combine the data into a single dataset
merged_data = pd.merge( ride_data,city_data,on = "city", how = "left")

# Display the data table for preview
merged_data.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344,46,Urban


In [9]:
# check if data number is correct (should be 2375)

merged_data.count()

city            2375
date            2375
fare            2375
ride_id         2375
driver_count    2375
type            2375
dtype: int64

In [10]:
merged_data.dtypes

city             object
date             object
fare            float64
ride_id           int64
driver_count      int64
type             object
dtype: object

# Bubble Plot of Ride Sharing Data

In [31]:
merged_data

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344,46,Urban
5,South Latoya,2018-03-11 12:26:48,9.52,1994999424437,10,Urban
6,New Paulville,2018-02-27 11:17:56,43.25,793208410091,44,Urban
7,Simpsonburgh,2018-04-26 00:43:24,35.98,111953927754,21,Urban
8,South Karenland,2018-01-08 03:28:48,35.09,7995623208694,4,Urban
9,North Jasmine,2018-03-09 06:26:29,42.81,5327642267789,33,Urban


In [30]:
# group by city
groupby_type_city = merged_data.groupby(["type","city"]).agg({"fare":['sum','mean'],"ride_id":['count']})
#groupby_type_city = groupby_type_city()
groupby_type_city.head()
groupby_type_city

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,ride_id
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,count
type,city,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Rural,Bradshawfurt,400.64,40.064000,10
Rural,Garzaport,72.37,24.123333,3
Rural,Harringtonfort,200.82,33.470000,6
Rural,Jessicaport,216.08,36.013333,6
Rural,Lake Jamie,206.15,34.358333,6
Rural,Lake Latoyabury,286.68,26.061818,11
Rural,Michaelberg,419.97,34.997500,12
Rural,New Ryantown,259.67,43.278333,6
Rural,Newtonview,146.98,36.745000,4
Rural,North Holly,262.17,29.130000,9


In [22]:
 # calculate average fare per city
groupby_type_city["average_fare_per_city"] = merged_data.groupby(["type","city"])["fare"].mean()
groupby_type_city.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,ride_id,driver_count,driver_count,average_fare_per_city
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,count,count,sum,Unnamed: 7_level_1
type,city,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Rural,Bradshawfurt,400.64,40.064,10,10,70,40.064
Rural,Garzaport,72.37,24.123333,3,3,21,24.123333
Rural,Harringtonfort,200.82,33.47,6,6,24,33.47
Rural,Jessicaport,216.08,36.013333,6,6,6,36.013333
Rural,Lake Jamie,206.15,34.358333,6,6,24,34.358333


In [23]:
# Calculate Total number of ride per city
groupby_type_city["ride_count_per_city"] = merged_data.groupby(["type","city"])["ride_id"].count()
groupby_type_city.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,ride_id,driver_count,driver_count,average_fare_per_city,ride_count_per_city
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,count,count,sum,Unnamed: 7_level_1,Unnamed: 8_level_1
type,city,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Rural,Bradshawfurt,400.64,40.064,10,10,70,40.064,10
Rural,Garzaport,72.37,24.123333,3,3,21,24.123333,3
Rural,Harringtonfort,200.82,33.47,6,6,24,33.47,6
Rural,Jessicaport,216.08,36.013333,6,6,6,36.013333,6
Rural,Lake Jamie,206.15,34.358333,6,6,24,34.358333,6


In [None]:
# Calculate Total number of ride per city
groupby_type_city["driver_count_per_city"] = merged_data.groupby(["type","city"])["ride_id"].count()
groupby_type_city.head()

In [18]:
groupby_type_city.reset_index()
groupby_type_city.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,ride_id,average_fare_per_city
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,count,Unnamed: 5_level_1
type,city,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Rural,Bradshawfurt,400.64,40.064,10,40.064
Rural,Garzaport,72.37,24.123333,3,24.123333
Rural,Harringtonfort,200.82,33.47,6,33.47
Rural,Jessicaport,216.08,36.013333,6,36.013333
Rural,Lake Jamie,206.15,34.358333,6,34.358333


In [34]:
final_data = pd.merge(groupby_type_city,city_data, on ='city',how='left' )
final_data.head()

Unnamed: 0,city,"(fare, sum)","(fare, mean)","(ride_id, count)",driver_count,type
0,Bradshawfurt,400.64,40.064,10,7,Rural
1,Garzaport,72.37,24.123333,3,7,Rural
2,Harringtonfort,200.82,33.47,6,4,Rural
3,Jessicaport,216.08,36.013333,6,1,Rural
4,Lake Jamie,206.15,34.358333,6,4,Rural


In [19]:
# reset index to be able to access 'type'
#groupby_type_city = groupby_type_city.reset_index()


In [None]:
# extract urban data
urban = groupby_type_city.loc[groupby_type_city['type']=='Urban',['driver_count','average_fare_per_city','ride_count_per_city']]
urban.head()

In [None]:
# extract suburban data
suburban = groupby_type_city.loc[groupby_type_city['type']=='Suburban',['driver_count','average_fare_per_city','ride_count_per_city']]
suburban.head()

In [None]:
# extract rural data
rural = groupby_type_city.loc[groupby_type_city['type']=='Rural',['driver_count','average_fare_per_city','ride_count_per_city']]
rural

In [None]:
# Build the scatter plots for each city types
plt.grid()


plt.scatter(urban["ride_count_per_city"], urban["average_fare_per_city"],  \
            marker="o", facecolors="salmon", edgecolors="black", s=urban["driver_count"], alpha=0.5, label=None)

plt.scatter(suburban["ride_count_per_city"], suburban["average_fare_per_city"], \
            marker="o", facecolors="skyblue", edgecolors="black", s=suburban["driver_count"], alpha=0.5 , label=None
          )

plt.scatter(rural["ride_count_per_city"], rural["average_fare_per_city"], \
            marker="o",facecolors="orange", edgecolors="black",s=rural["driver_count"], alpha=0.5 , label=None)

# create fake data for the legend
plt.scatter(0, 0,  \
            marker="o", facecolors="salmon", edgecolors="black", s=30, alpha=0.5, label = "Urban") 

plt.scatter(0, 0,  \
            marker="o", facecolors="skyblue", edgecolors="black", s=30, alpha=0.5, label = "Suburban") 

plt.scatter(0, 0, marker="o", facecolors="orange", edgecolors="black", s=30, alpha=0.5, label = "Rural") 

# add legend
plt.legend(numpoints=1, loc=1, borderpad=1, \
            frameon=True, framealpha=0.9, title="City type",scatterpoints = 1)

# set xlim and ylim
plt.ylim(19, 45)
plt.xlim(0, 41)

# add text outside the graph
# Incorporate a text label regarding circle size
plt.text(42,35,"test", fontsize=12)

# Incorporate the other graph properties
plt.title("Pyber ride Sharing Data (2016)")
plt.xlabel("Total number of rides (per city)")
plt.ylabel("Average fare ($)")
plt.tight_layout()
plt.show()

In [None]:
# Save Figure


## Total Fares by City Type

In [None]:
# Calculate Type Percents

# Build Pie Chart

# Save Figure


In [None]:
# Show Figure
plt.show()

## Total Rides by City Type

In [None]:
# Calculate Ride Percents

# Build Pie Chart

# Save Figure


In [None]:
# Show Figure
plt.show()

## Total Drivers by City Type

In [None]:
# Calculate Driver Percents

# Build Pie Charts

# Save Figure


In [None]:
# Show Figure
plt.show()