In [1]:
#Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress

%matplotlib inline
import seaborn as sns

In [2]:
#Refrence to Csv files 
cab_ride_df = pd.read_csv("Cab-Weather-Data/cab_rides.csv")
weather_df = pd.read_csv("Cab-Weather-Data/weather.csv")
 

In [3]:
# Cleaning Weather Data
# The timestamps were listed in epoch formatting, and needs to be converted to a readable date format

weather_df["date"] = pd.to_datetime(weather_df["time_stamp"], unit="s")

# Reorder the columns for readability
clean_weather_df = weather_df[["date","location","temp","clouds","pressure","rain","humidity","wind"]]


# Reset the index and change the location header to source. This is so our merge will align on common time point & weather conditions at pick up location
new_weather_df = clean_weather_df.reset_index(drop=True)
new_weather_df.rename(columns={"location":"source"}, inplace=True)
new_weather_df.head()

In [4]:
# Cleaning Car Ride Data
# The epoch timestamp in this table is represented in milliseconds, so we divide by 1000 to find the time in value of seconds
cab_ride_df["date"]=pd.to_datetime(cab_ride_df["time_stamp"]/1000, unit="s")

# Removing unwanted columns like id and product_id
clean_cab_df = cab_ride_df[["date","source","destination","distance","price","surge_multiplier","cab_type", "name"]]

# Resetting the index
new_cab_df=clean_cab_df.reset_index(drop=True)
new_cab_df.head()


In [5]:
#Merge the datesets into a single one based on time_stamp and location
cleaned_merged_df = pd.merge_asof(new_cab_df.sort_values("date"), new_weather_df.sort_values("date"), 
                            left_by=  ['source' ],right_by= ['source' ] ,
                            left_on=  ['date']   ,right_on= ['date'], tolerance=pd.Timedelta(minutes = 15 ) )
cleaned_merged_df.head()



In [6]:
#dataframe info 
cleaned_merged_df.describe


In [7]:
# To ensure we have all rows that have weather conditions present, we want to drop all null values in the Temperature column
# Null values for rain or wind is ok because these are logical weather conditions
final_df=cleaned_merged_df.dropna(subset=["temp"])
final_df=final_df.reset_index(drop=True)

# With the null values in other rows, we can fill them with "0"
final_df.fillna(0,inplace=True)
final_df.head()

In [8]:
#Count the numbers of temp 
final_df["temp"].count

In [9]:
# Adding the day of the week to our table to see if that may have any correlation to weather
final_df["day_of_week"]=final_df["date"].dt.day_name()
final_df.head()

In [10]:
#Adding Month to the table 
final_df["month"] = final_df['date'].dt.month
final_df.head()

In [11]:
# Formatting some columns to show accurate values
final_df["clouds"]=final_df["clouds"]*100
final_df["humidity"]=final_df["humidity"]*100


In [12]:
# Pretty-fying our final dataframe by including units of measurement for easy analysis

final_df = final_df.rename(columns={"date":"Date","day_of_week":"Weekday","source":"Pick_Up_Location","destination":"Destination",
                                    "distance":"Distance(m)","price":"Price($)","surge_multiplier":"Surge_Multiplier",
                                    "cab_type":"Rideshare_App","name":"Ride_Type","temp":"Temp(C)",
                                    "clouds":"Cloudiness(%)","pressure":"Pressure(MB)","rain":"Rainfall(in)",
                                    "humidity":"Humidity(%)","wind":"Wind_Speed(mph)", "month":"Month"})

final_df = final_df[["Date","Month","Weekday","Pick_Up_Location","Destination","Distance(m)","Price($)","Surge_Multiplier","Rideshare_App","Ride_Type",
                     "Temp(C)","Cloudiness(%)","Pressure(MB)","Rainfall(in)","Humidity(%)","Wind_Speed(mph)"]]
final_df.head()

In [13]:
#count of Rides with price 0 
final_df [ final_df["Price($)"]==0]["Price($)"].count()

In [14]:
# Save the dataframe into a csv file to analyze further
final_df.to_csv("rideshare_vs_weather.csv", index=False)

In [15]:
#Pie chart created to see the percentage of lyft rides to Uber Rides
ride_counts = final_df["Rideshare_App"].value_counts()

percentages = (ride_counts / ride_counts.sum()) * 100

plt.pie(percentages, labels=percentages.index, autopct='%1.1f%%')
plt.title('Percentage of Uber Rides vs Lyft Rides') 
plt.axis('equal')
plt.show()

In [16]:
#find the min and max distance of the trips to find the ranges 

maximum_distance = final_df["Distance(m)"].max()
minimum_distance = final_df["Distance(m)"].min()

print("Maximum Distance:", maximum_distance)
print("Minimum Distance:", minimum_distance)

In [17]:
# Filter the DataFrame to include only Uber rides
uber_rides = final_df[final_df["Rideshare_App"] == "Uber"]

# Define the distance ranges
distance_ranges = ["0-2 miles", "2-4 miles", "4-6 miles", "6-8 miles"]

# Count the number of Uber rides within each distance range
ride_counts = [
    ((uber_rides["Distance(m)"] >= 0) & (uber_rides["Distance(m)"] <= 2)).sum(),
    ((uber_rides["Distance(m)"] > 2) & (uber_rides["Distance(m)"] <= 4)).sum(),
    ((uber_rides["Distance(m)"] > 4) & (uber_rides["Distance(m)"] <= 6)).sum(),
    ((uber_rides["Distance(m)"] > 6) & (uber_rides["Distance(m)"] <= 8)).sum()
]

# Plot the bar graph
plt.bar(distance_ranges, ride_counts)
plt.xlabel("Distance Range")
plt.ylabel("Number of Uber Rides")
plt.title("Number of Uber Rides by Distance Range")
plt.show()

In [18]:
# Filter the DataFrame to include only Lyft rides
lyft_rides = final_df[final_df["Rideshare_App"] == "Lyft"]

# Define the distance ranges
distance_ranges = ["0-2 miles", "2-4 miles", "4-6 miles", "6-8 miles"]

# Count the number of Lyft rides within each distance range
ride_counts = [
    ((lyft_rides["Distance(m)"] >= 0) & (lyft_rides["Distance(m)"] <= 2)).sum(),
    ((lyft_rides["Distance(m)"] > 2) & (lyft_rides["Distance(m)"] <= 4)).sum(),
    ((lyft_rides["Distance(m)"] > 4) & (lyft_rides["Distance(m)"] <= 6)).sum(),
    ((lyft_rides["Distance(m)"] > 6) & (lyft_rides["Distance(m)"] <= 8)).sum()
]

# Plot the bar graph
plt.bar(distance_ranges, ride_counts)
plt.xlabel("Distance Range")
plt.ylabel("Number of Lyft Rides")
plt.title("Number of Lyft Rides by Distance Range")
plt.show()

In [19]:
# Total revenue for both lyft and uber
revenue_df=final_df.groupby("Rideshare_App")["Price($)"].sum()
revenue_df


In [20]:
#Bar plot showing the revenue base on both rideshare apps
revenue_df.plot(kind='bar')
plt.title("Revenue Comparison for Lyft and Uber")
plt.xlabel("Rideshare App")
plt.ylabel("Revenue in USD")
plt.show()

In [21]:
#Lyft Monthly Revenue 
lyft=final_df[final_df["Rideshare_App"]=="Lyft"]
lyft_monthly_rev=lyft.groupby("Month")["Price($)"].sum()
lyft_monthly_rev

In [22]:
#Uber Monthly Revenue 
uber=final_df[final_df["Rideshare_App"]=="Uber"]
uber_monthly_rev=uber.groupby(["Month"])["Price($)"].sum()
uber_monthly_rev

In [23]:
#Created Dataframe to show revenue based on each rideshare app
monthly_rev_df=pd.DataFrame({"Lyft":lyft_monthly_rev,"Uber":uber_monthly_rev})
monthly_rev_df

In [24]:
#Bar chart to show revenue based on months
ax=monthly_rev_df.plot.bar()
labels=['Nov','Dec']
ax.set_xticklabels(labels)
plt.title("Monthly Revenue Comparison for Lyft and Uber")
plt.xlabel("Rideshare App")
plt.ylabel("Revenue in USD")
plt.show()

# Statistical analysis on Price based on Each App

In [25]:
#Generate a summary statistics of mean, median, variance, standard deviation, and SEM of the price for Lyft and Uber 

ride_app = final_df.groupby("Rideshare_App")

count_rides = ride_app["Price($)"].count()
mean_price = ride_app["Price($)"].mean()
median_price = ride_app["Price($)"].median()
var_price = ride_app["Price($)"].var()
std_price = ride_app["Price($)"].std()
sem_price = ride_app["Price($)"].sem()
min_price = ride_app["Price($)"].min()
max_price = ride_app["Price($)"].max()
price_rideApp_df = pd.DataFrame({"Price Mean":mean_price , 
                            "Price Median ":median_price,
                             "Price Variance":var_price,
                             "Price Std. Dev.":std_price,
                             "Price Std. Err.":sem_price,
                                "Total rides" :count_rides,
                                "Max Price" : max_price,
                                "Min Price": min_price})
price_rideApp_df

### Maryam's Analysis:
Lyft has higher price on avaerage (17.34 vs.13.55  )and the Standard deviation is higher (10.02 vs. 9.66). 
Lyft max price is 92.0 and the min price is 2.5.
Uber max price is 89.5 and the min price is 0.0.

# Determin any pattern change of the average price on each day of the week base on each App
###  Plot Line bar to show the comparison 

In [26]:
# Create a sorted value list to sort day of the week result
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


In [27]:
# higehst price of the Uber rides / indexed based on cats variable
uber_rides = final_df[ final_df["Rideshare_App"]=="Uber"]
uber_highestday_price=uber_rides.groupby(["Weekday"])["Price($)"].mean().reindex(cats)
uber_highestday_price

In [28]:
# higehst price of the Lyft rides / indexed based on cats variable
lyft_rides = final_df[ final_df["Rideshare_App"]=="Lyft"]
lyft_highestday_price=lyft_rides.groupby(["Weekday"])["Price($)"].mean().reindex(cats)
lyft_highestday_price


In [29]:
#created the line chart to show price difference based on most expensive day of the week for each ride app 
plt.figure(figsize=(16, 8), dpi=150)
uber_highestday_price.plot(color ="r", label ="uber highest price perday")
lyft_highestday_price.plot(color ="b", label ="lyft highest price perday")
plt.legend()


### Maryam's Analysis:
Avaerage of the price for each app on different days of the week is relatively constant and does not change a lot. 

# Determin Price outliers for each App and plot Boxplot 

In [30]:
# create rides into list for loop
ride_list = ["Lyft", "Uber"]

#Create empty list to fill with price data (for plotting)
price_list =[]

for ride in ride_list:
    
    price = final_df[ final_df["Rideshare_App"]==ride]["Price($)"] 
    
    price_list.append(price)
    
    # Determine outliers using upper and lower bounds
    quartiles = price.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    # print if the counter is empty and add a space between each regimen
    
   
    print(f"The IQR data for {ride} is:")
    print(f"The lower quartile of {ride} is: {lowerq}")
    print(f"The upper quartile of {ride} is: {upperq}")
    print(f"The interquartile range of {ride} is: {iqr}")
    print(f"The the median of {ride} is: {quartiles[0.5]}")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    print(f"")
   


In [31]:
# Generate a box plot that shows the distrubution of the price based on each ride app.
flierprops = dict(marker='o', markerfacecolor='r', markersize=10, markeredgecolor='black')

fig1, ax1 = plt.subplots()
ax1.set_ylabel(" Price $")
ax1.set_xlabel("Ride")
ax1.boxplot(price_list, flierprops=flierprops)
ax1.set_xticklabels(ride_list)

plt.show()

# Plot Heatmap 
#### 1- Correlation among All the Data in the  DataFrame
#### 2- Correlation among Weather condition and Price 
#### 3- Correlation among Daily weather conditions and Revenue 

In [32]:
# Plot the correlation heat map among numeric columns
# calculate the correlation matrix on the numeric columns
corr = final_df.select_dtypes('number').corr()

# plot the heatmap
sns.heatmap(corr)

In [33]:
# calculate the correlation matrix on the wehater conditions and price
conditions_list = ["Temp(C)","Cloudiness(%)","Pressure(MB)","Rainfall(in)","Humidity(%)","Wind_Speed(mph)","Price($)"]

corr = final_df.loc[:,conditions_list].select_dtypes('number').corr()

# plot the heatmap
sns.heatmap(corr)

In [34]:
#aggregate the data to calculate the revenue and get the mean for the rest for each specific day 

#create a day field 
final_df["day"] = final_df["Date"].dt.date
by_date_df = final_df.groupby("day")
#create revenue and add a column called revenue to by_date dataframe

#aggregate the price column to get the revenue 
revenue = by_date_df["Price($)"].sum()
by_date_df = by_date_df.mean()
by_date_df["revenue"] = revenue


by_date_df


In [35]:
# calculate the correlation matrix on revenue based on the wehater conditions on specific days
conditions_list = ["Temp(C)","Cloudiness(%)","Pressure(MB)","Rainfall(in)","Humidity(%)","Wind_Speed(mph)","Distance(m)","revenue"]

corr = by_date_df.loc[:,conditions_list].select_dtypes('number').corr()


# plot the heatmap
sns.heatmap(corr)

### Maryam's Analysis:
    Revenue has the highest correlation with Rainfall. The Rainfall, Cloudiness, Humidity, Wind_Speed have the positive effect on revenue. The distance has the negative effect, because the bulk of the price comes from the initial charge on each ride per day.  
    So, The best strategy to increase revenue is to focuse on more short distance rides. 


# Linear Regression of Distance Vs. Average of Price (Lyft): 

In [36]:
#Bining the distance
bins = [0,1,2,3,4,5,6,7,8]
final_df["dist"] =  pd.cut(final_df["Distance(m)"].astype(float),bins)
final_df[["Distance(m)","dist"]].head(20)


In [37]:
#Study the relationship between price and the distance
# Calculate the correlation coefficient and a linear regression model 
#  for price and distance for the entire rides based on different app


uber_rides=final_df.loc[final_df["Rideshare_App"] =="Uber"]
uber_rides = uber_rides.groupby("dist").mean()

# Count the number of Lyft rides within each distance range

uber_price = uber_rides["Price($)"]
uber_distance = uber_rides["Distance(m)"]

corr = round(st.pearsonr(uber_distance, uber_price )[0],2)
print(f"The correlation between price of Uber rides and the Distance  {corr}")


#linear regression model
linear_reg = st.linregress( uber_rides["Distance(m)"],uber_rides["Price($)"])
linear_reg



In [38]:
# plot the regression line on the scatter plot
uber_rides=final_df.loc[final_df["Rideshare_App"] =="Uber"]


plt.scatter(uber_distance, uber_price)

#linear regression equation and plot the line
uber_price_predict = uber_distance * linear_reg.slope + linear_reg.intercept
plt.plot(uber_distance, uber_price_predict, color='red')
line_eq = "y = " + str(round(linear_reg.slope,2)) + "x + " + str(round(linear_reg.intercept,2))
plt.annotate(line_eq,(5,15),fontsize=15,color="red")
plt.title("Linear Regression of Distance Vs. Average of Price (Uber)")
plt.xlabel("Distance(m)")
plt.ylabel("Average of Price($)")
plt.show()

In [39]:
#Study the relationship between price and the distance
# Calculate the correlation coefficient and a linear regression model 
# for price and distance for the entire rides based on different app


lyft_rides=final_df.loc[final_df["Rideshare_App"] =="Lyft"]
lyft_rides = lyft_rides.groupby("dist").mean()
lyft_rides = lyft_rides.dropna()

# Count the number of Lyft rides within each distance range

lyft_price = lyft_rides["Price($)"]
lyft_distance = lyft_rides["Distance(m)"]

corr = round(st.pearsonr(lyft_distance, lyft_price )[0],2)
print(f"The correlation between price of Lyft rides and the Distance  {corr}")


#linear regression model
linear_reg = st.linregress( lyft_rides["Distance(m)"],lyft_rides["Price($)"])
linear_reg



In [40]:
# plot the regression line on the scatter plot
lyft_rides=final_df.loc[final_df["Rideshare_App"] =="Lyft"]


plt.scatter(lyft_distance, lyft_price)
#linear regression equation and plot the line
lyft_price_predict = lyft_distance * linear_reg.slope + linear_reg.intercept
plt.plot(lyft_distance, lyft_price_predict, color='red')
line_eq = "y = " + str(round(linear_reg.slope,2)) + "x + " + str(round(linear_reg.intercept,2))
plt.annotate(line_eq,(3.5,20),fontsize=15,color="red")
plt.title("Linear Regression of Distance Vs. Average of Price (Lyft)")
plt.xlabel("Distance(m)")
plt.ylabel("Average of Price($)")
plt.show()


Maryam's Analysis:
Lyft charges more for based rides compare to Uber (10.05 vs. 9.50) and charges more for each additional miles compare to Uber (3.38 vs. 1.93) 

# Comparison of Distance vs. Price for Each ride 
1- The problem: How do we know if Distance would effect the price charges on each app?
2- The solution: ANOVA - does Distance not effect the price that each ride charges?

In [41]:
#Create BoxPlot to compare mean of the distance vs Price
#Uber
uber_rides.boxplot("Distance(m)", by="Price($)", figsize=(20, 10))

In [42]:
# Extract individual groups based on different price 
group0 = uber_rides[uber_rides["Price($)"] <= 30]["Distance(m)"]
group1 = uber_rides[uber_rides["Price($)"] <= 60]["Distance(m)"]
group2 = uber_rides[uber_rides["Price($)"] <= 90]["Distance(m)"]


In [43]:
# Perform the ANOVA
st.f_oneway(group0, group1, group2)

In [44]:
#Create BoxPlot to compare mean of the distance vs Price
#Lyft 
lyft_rides.boxplot("Distance(m)", by="Price($)", figsize=(20, 10))

In [45]:
# Extract individual groups based on different price 
group0 = lyft_rides[lyft_rides["Price($)"] <= 30]["Distance(m)"]
group1 = lyft_rides[lyft_rides["Price($)"] <= 60]["Distance(m)"]
group2 = lyft_rides[lyft_rides["Price($)"] <= 90]["Distance(m)"]


In [46]:
# Perform the ANOVA
st.f_oneway(group0, group1, group2)

### Maryam's Analysis:
#### Uber : 
Uber includes more rides, and has the p-Value of pvalue=1.1480829994832444e-127 which is a very small number. So, the null hypothesis that Distance would not have effect on the Price can be rejected. 
#### Lyft : 
Lyft includes less rides, and had the pvalue=4.858273627090331e-168 which will also reject the null hypohesis that Distance would not have an affect on Price. 