In [None]:
#Part 1 - Declare dependencies, read the files into dataframes, evaluate the data

# Add Matplotlib inline magic command
%matplotlib inline
#Declare dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statistics as statistics

# Load the two source CSV files into pandas DataFrames.
# Each file path is kept in its own variable so it is easy to change.
city_data = "Resources/city_data.csv"
city_data_df = pd.read_csv(city_data)

ride_data = "Resources/ride_data.csv"
ride_data_df = pd.read_csv(ride_data)

#Evaluate the data
# # ride_data_df.isnull().sum() #check for null values - **no nulls found
# ride_data_df.count()  #get count of columns and find out if any are null  **count is 2375
# ride_data_df.dtypes #get the data type of each column **fare = float, ride_id = int, city and date=object
# # ride_data_df["city"].unique() #shows an array of all the unique city names

# city_data_df.isnull().sum() #check for null values - **no nulls found
# city_data_df.count()  #get count of columns and find out if any are null **count is 120
#city_data_df.dtypes #get the data type of each column  **city and type are object, driver_count is int
# city_data_df["type"].unique() #shows an array of all the unique city types

In [None]:
# Count how many rows of the city DataFrame belong to each city type
# (similar to a COUNT ... GROUP BY in SQL). Comparing the column to a label
# gives a boolean Series, and summing it counts the True values.
for city_type in ("Suburban", "Urban", "Rural"):
    print(sum(city_data_df["type"] == city_type))

In [None]:
# Part 2 - Merge the DataFrames, create a separate DataFrame per city type and
# save the chart element arrays in Series.

# Left-join every ride to the attributes of its city: `how` is the join type
# and `on` is the key column shared by both frames. The key only needs to be
# named once.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")
# To preview the merged data, run `pyber_data_df.head()` as the last line of
# its own cell; in the middle of a cell its output is discarded.

# Create one DataFrame per city type by filtering on the "type" column.
urban_cities_df = pyber_data_df[pyber_data_df["type"] == "Urban"]
rural_cities_df = pyber_data_df[pyber_data_df["type"] == "Rural"]
suburban_cities_df = pyber_data_df[pyber_data_df["type"] == "Suburban"]

# Total number of rides per city, one Series per city type.
# These values go on the x-axis of the bubble chart.
urban_ride_count = urban_cities_df.groupby("city")["ride_id"].count()
suburban_ride_count = suburban_cities_df.groupby("city")["ride_id"].count()
rural_ride_count = rural_cities_df.groupby("city")["ride_id"].count()

# Quick look at the first few urban cities.
print(urban_ride_count.head())

# Average fare per city, one Series per city type (y-axis of the bubble chart).
# Select the "fare" column BEFORE aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0.
urban_avg_fare = urban_cities_df.groupby("city")["fare"].mean()
suburban_avg_fare = suburban_cities_df.groupby("city")["fare"].mean()
rural_avg_fare = rural_cities_df.groupby("city")["fare"].mean()

# Average driver count per city, one Series per city type (used for the bubble
# size in the bubble chart).
# Select the "driver_count" column BEFORE aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0.
urban_driver_count = urban_cities_df.groupby("city")["driver_count"].mean()
suburban_driver_count = suburban_cities_df.groupby("city")["driver_count"].mean()
rural_driver_count = rural_cities_df.groupby("city")["driver_count"].mean()

In [None]:
# Part 3. Create a bubble chart of the average fare versus the total number of
# rides per city, with the bubble size based on the average number of drivers
# per city, for the urban, suburban and rural city types.
#
# For the bubble chart we plot:
#   - y-axis: the average fare per city
#   - x-axis: the total number of rides per city
#   - bubble size: the average driver count per city, multiplied by 10
#   - bubble colors: coral (urban), skyblue (suburban) and gold (rural)
# Each city type is its own scatter series; calling plt.scatter() repeatedly
# draws them all on the same figure.

# Build the scatter plot for urban cities.
plt.scatter(urban_ride_count,
            urban_avg_fare,
            s=10*urban_driver_count, c="coral",
            edgecolor="black", linewidths=1,
            alpha=0.8, label="Urban")

# Build the scatter plot for suburban cities.
plt.scatter(suburban_ride_count,
            suburban_avg_fare,
            s=10*suburban_driver_count, c="skyblue",
            edgecolor="black", linewidths=1,
            alpha=0.8, label="Suburban")

# Build the scatter plot for rural cities.
plt.scatter(rural_ride_count,
            rural_avg_fare,
            s=10*rural_driver_count, c="gold",
            edgecolor="black", linewidths=1,
            alpha=0.8, label="Rural")

# Add the title and axis labels and turn the grid on.
plt.title("PyBer Ride-Sharing Data (2019)", fontsize=20)
plt.ylabel("Average Fare ($)", fontsize=12)
plt.xlabel("Total Number of Rides (Per City)", fontsize=12)
plt.grid(True)

# Add the legend. The previous mode="Expanded" argument was not a valid legend
# mode and had no effect, so it is omitted.
lgnd = plt.legend(fontsize=12, scatterpoints=1, loc="best", title="City Types")
# Give every legend bubble the same size. `legend_handles` is the current
# attribute name; fall back to the old `legendHandles` spelling on Matplotlib
# versions that predate it.
legend_handles = lgnd.legend_handles if hasattr(lgnd, "legend_handles") else lgnd.legendHandles
for handle in legend_handles:
    handle.set_sizes([75])
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size (placed in data coordinates).
plt.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize=12)
# Save the figure.
plt.savefig("analysis/Fig1_Avg Fare per City Type scatter.png")

In [None]:
# Part 4. Get overall summary statistics on the merged data.
# Three approaches are used and compared:
#   4a. The Pandas describe() function on the DataFrame or Series.
#   4b. The Pandas mean(), median(), and mode() methods on a Series.
#   4c. The NumPy mean() and median() functions, and the SciPy stats mode()
#       function on a Series.

# 4a. describe() for the urban, suburban and rural DataFrames, in that order.
for cities_df in (urban_cities_df, suburban_cities_df, rural_cities_df):
    print(cities_df.describe())


In [None]:
# 4b. Use the Pandas mean(), median(), and mode() methods on a Series.
# Task: determine the mean, median, and mode of the following with Pandas:
#   - the total number of rides for each city type
#   - the average fares for each city type
#   - the total number of drivers for each city type
# This cell covers the ride counts.

# Report the statistics for every city type. Previously the suburban and rural
# values were computed in bare expressions and thrown away.
ride_counts_by_type = {
    "urban": urban_ride_count,
    "suburban": suburban_ride_count,
    "rural": rural_ride_count,
}
for city_type, ride_counts in ride_counts_by_type.items():
    print(f"The mean for the ride counts for {city_type} trips is: {ride_counts.mean():.2f}")
    print(f"The median for the ride counts for {city_type} trips is: {ride_counts.median()}")
    # mode() returns a Series because several values can tie for most common;
    # show all of them as a plain list.
    print(f"The mode for the ride counts for {city_type} trips is: {ride_counts.mode().tolist()}")

# Last expression of the cell, so the notebook displays it.
urban_ride_count.describe()

In [None]:
# 4c. Use the NumPy mean() and median() functions and the SciPy stats mode()
# function on the ride-count Series.
import numpy as np
import scipy.stats as sts

# --- Urban cities: central tendency plus min and max ---
mean_urban_ride_count, median_urban_ride_count = (
    np.mean(urban_ride_count),
    np.median(urban_ride_count),
)
# Unlike the Pandas mode() method, sts.mode() also reports how many times the
# mode appears in the data.
mode_urban_ride_count = sts.mode(urban_ride_count)
min_urban_ride_count, max_urban_ride_count = (
    np.min(urban_ride_count),
    np.max(urban_ride_count),
)
print(f"The mean for the ride counts for urban trips is {mean_urban_ride_count:.2f}.")
print(f"The median for the ride counts for urban trips is {median_urban_ride_count}.")
print(f"The mode for the ride counts for urban trips is {mode_urban_ride_count}.")
print(f"The min for the ride counts for urban trips is {min_urban_ride_count:.2f}.")
print(f"The max for the ride counts for urban trips is {max_urban_ride_count:.2f}.")

# --- Suburban cities: central tendency ---
mean_suburban_ride_count, median_suburban_ride_count = (
    np.mean(suburban_ride_count),
    np.median(suburban_ride_count),
)
mode_suburban_ride_count = sts.mode(suburban_ride_count)
print(f"The mean for the ride counts for suburban trips is {mean_suburban_ride_count:.2f}.")
print(f"The median for the ride counts for suburban trips is {median_suburban_ride_count}.")
print(f"The mode for the ride counts for suburban trips is {mode_suburban_ride_count}.")

# --- Rural cities: central tendency ---
mean_rural_ride_count, median_rural_ride_count = (
    np.mean(rural_ride_count),
    np.median(rural_ride_count),
)
mode_rural_ride_count = sts.mode(rural_ride_count)
print(f"The mean for the ride counts for rural trips is {mean_rural_ride_count:.2f}.")
print(f"The median for the ride counts for rural trips is {median_rural_ride_count}.")
print(f"The mode for the ride counts for rural trips is {mode_rural_ride_count}.")

In [None]:
# Summary statistics for the fares of each city type.
# First pull the "fare" column out of each city-type DataFrame.
urban_fares, suburban_fares, rural_fares = (
    cities_df["fare"]
    for cities_df in (urban_cities_df, suburban_cities_df, rural_cities_df)
)

# The mean and median come from NumPy; the mode comes from SciPy's sts.mode(),
# which also reports how many times the mode appears in the data.

# --- Urban fares ---
mean_urban_fares, median_urban_fares = np.mean(urban_fares), np.median(urban_fares)
mode_urban_fares = sts.mode(urban_fares)
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")
print(f"The mode fare price for urban trips is {mode_urban_fares}.")
print("")

# --- Suburban fares ---
mean_suburban_fares, median_suburban_fares = np.mean(suburban_fares), np.median(suburban_fares)
mode_suburban_fares = sts.mode(suburban_fares)
print(f"The mean fare price for suburban trips is ${mean_suburban_fares:.2f}.")
print(f"The median fare price for suburban trips is ${median_suburban_fares:.2f}.")
print(f"The mode fare price for suburban trips is {mode_suburban_fares}.")
print("")

# --- Rural fares ---
mean_rural_fares, median_rural_fares = np.mean(rural_fares), np.median(rural_fares)
mode_rural_fares = sts.mode(rural_fares)
print(f"The mean fare price for rural trips is ${mean_rural_fares:.2f}.")
print(f"The median fare price for rural trips is ${median_rural_fares:.2f}.")
print(f"The mode fare price for rural trips is {mode_rural_fares}.")

In [None]:
# Summary statistics on the number of drivers by city type.
# Pull the "driver_count" column out of each city-type DataFrame.
urban_drivers, suburban_drivers, rural_drivers = (
    cities_df["driver_count"]
    for cities_df in (urban_cities_df, suburban_cities_df, rural_cities_df)
)
# Shows the driver_count value on the first five rows of the urban DataFrame.
print(urban_drivers.head())

# Re-importing is not required here; it is repeated only as a reminder that
# these two modules are what the calculations below need.
import numpy as np
import scipy.stats as sts

# NOTE(review): the statistics below are computed on the *_driver_count Series
# defined in an earlier cell, not on the *_drivers Series created above.
# Confirm which of the two is intended.

# --- Urban cities ---
mean_urban_driver_count, median_urban_driver_count = (
    np.mean(urban_driver_count),
    np.median(urban_driver_count),
)
# Unlike the Pandas mode() method, sts.mode() also reports how many times the
# mode appears in the data.
mode_urban_driver_count = sts.mode(urban_driver_count)
print(f"The mean for the driver counts for urban trips is {mean_urban_driver_count:.2f}.")
print(f"The median for the driver counts for urban trips is {median_urban_driver_count}.")
print(f"The mode for the driver counts for urban trips is {mode_urban_driver_count}.")
print("")

# --- Suburban cities ---
mean_suburban_driver_count, median_suburban_driver_count = (
    np.mean(suburban_driver_count),
    np.median(suburban_driver_count),
)
mode_suburban_driver_count = sts.mode(suburban_driver_count)
print(f"The mean for the driver counts for suburban trips is {mean_suburban_driver_count:.2f}.")
print(f"The median for the driver counts for suburban trips is {median_suburban_driver_count}.")
print(f"The mode for the driver counts for suburban trips is {mode_suburban_driver_count}.")
print("")

# --- Rural cities ---
mean_rural_driver_count, median_rural_driver_count = (
    np.mean(rural_driver_count),
    np.median(rural_driver_count),
)
mode_rural_driver_count = sts.mode(rural_driver_count)
print(f"The mean for the driver counts for rural trips is {mean_rural_driver_count:.2f}.")
print(f"The median for the driver counts for rural trips is {median_rural_driver_count}.")
print(f"The mode for the driver counts for rural trips is {mode_rural_driver_count}.")

In [None]:
# Part 5 - Create box-and-whisker plots to look for outliers in:
#   5a. the number of rides for each city type
#   5b. the fares for each city type
#   5c. the number of drivers for each city type

# 5a. Box-and-whisker plot of the urban cities' ride counts, built with the
# object-oriented interface (figure + axes).
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_ride_count, labels=x_labels)
# Title, y-axis label and grid.
ax.set(title='Ride Count Data (2019)', ylabel='Number of Rides')
ax.grid()
plt.show()

In [None]:
# Box-and-whisker plot of the suburban cities' ride counts, built with the
# object-oriented interface (figure + axes).
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_ride_count, labels=x_labels)
# Title, y-axis label, y ticks every 2 rides from 8 up to (not including) 30,
# and grid.
ax.set(title='Ride Count Data (2019)', ylabel='Number of Rides')
ax.set_yticks(np.arange(8, 30, step=2.0))
ax.grid()
plt.show()

In [None]:
# Box-and-whisker plot of the rural cities' ride counts, built with the
# object-oriented interface (figure + axes).
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_ride_count, labels=x_labels)
# Title, y-axis label, y ticks every 2 rides from 2 up to (not including) 14,
# and grid.
ax.set(title='Ride Count Data (2019)', ylabel='Number of Rides')
ax.set_yticks(np.arange(2, 14, step=2.0))
ax.grid()
plt.show()

In [None]:
### Show ride counts for urban, suburban and rural cities on one box-and-whisker graph

# Add all ride count box-and-whisker plots to the same graph.
x_labels = ["Urban", "Suburban","Rural"]
ride_count_data = [urban_ride_count, suburban_ride_count, rural_ride_count]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Ride Count Data (2019)',fontsize=20)
ax.set_ylabel('Number of Rides',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(ride_count_data, labels=x_labels)
ax.set_yticks(np.arange(0, 45, step=3.0))
ax.grid()
# Save the figure (before plt.show(), which finishes with the figure).
plt.savefig("analysis/Fig2_Rider Counts.png")
plt.show()
# Reading the plot: a small circle is an outlier; the short horizontal bars are
# the ends of the whiskers, which by default reach the most extreme data points
# within 1.5x the interquartile range of the box; the line inside the box is
# the median; the lower and upper edges of the box are the lower and upper
# quartiles.

# Find the urban city with the highest ride count. idxmax() returns the index
# label (the city name) of the largest value, so no ride count has to be
# hard-coded and the lookup keeps working if the data changes.
urban_city_outlier = urban_ride_count.idxmax()
# To list every urban city with a ride count above 25 instead:
#     urban_ride_count[urban_ride_count > 25].index.tolist()
print(f"{urban_city_outlier} has the highest rider count.")

In [None]:
# Box-and-whisker plots of the fares for all three city types on one graph.
x_labels = ["Urban", "Suburban","Rural"]
# NOTE(review): this list holds fares but reuses the name `ride_count_data`
# from the ride-count cell; consider a dedicated name.
ride_count_data = [urban_fares, suburban_fares, rural_fares]
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(ride_count_data, labels=x_labels)
# Title, axis labels, y ticks every 5 from 0 up to (not including) 60, grid.
ax.set_title('Ride Fare Data (2019)',fontsize=20)
ax.set_xlabel("City Types",fontsize=14)
ax.set_ylabel('Fare ($USD)',fontsize=14)
ax.set_yticks(np.arange(0, 60, step=5.0))
ax.grid()
# Save the figure (before plt.show(), which finishes with the figure).
plt.savefig("analysis/Fig3_Fare Data.png")
plt.show()
# Reading the plot: a small circle is an outlier; the short horizontal bars are
# the ends of the whiskers, which by default reach the most extreme data points
# within 1.5x the interquartile range of the box; the line inside the box is
# the median; the lower and upper edges of the box are the lower and upper
# quartiles.


In [None]:
# Box-and-whisker plots of the driver counts for all three city types on one graph.
x_labels = ["Urban", "Suburban","Rural"]
# NOTE(review): this list holds driver counts but reuses the name
# `ride_count_data` from the ride-count cell; consider a dedicated name.
ride_count_data = [urban_drivers, suburban_drivers, rural_drivers]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Driver Count Data (2019)',fontsize=20)
# The y-axis label previously ended with a stray ")".
ax.set_ylabel('Number of Drivers',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(ride_count_data, labels=x_labels)
ax.set_yticks(np.arange(0, 80, step=5.0))
ax.grid()
# Save the figure (before plt.show(), which finishes with the figure).
plt.savefig("analysis/Fig4_Driver Counts.png")
plt.show()
# Reading the plot: a small circle is an outlier; the short horizontal bars are
# the ends of the whiskers, which by default reach the most extreme data points
# within 1.5x the interquartile range of the box; the line inside the box is
# the median; the lower and upper edges of the box are the lower and upper
# quartiles.


In [None]:
# Create a pie chart that visualizes each of the following data for each city type:
# The percent of total fares.
# The percent of total rides.
# The percent of total drivers.

In [None]:
###### Things to do with dataframes

###### Basic formatting
# Reference snippets only. They are kept commented out because they use
# placeholder names (x_axis, y_axis) that this notebook never defines, and
# because indented statements at the top level of a cell raise an
# IndentationError.
#Error bars with caps
#Line example
    #ax.plot(x_axis, y_axis, color = "green", linewidth= 2, marker = "o",label= "Boston")  
#Bar example
#Scatter example 
    #ax.scatter(y_axis, x_axis, s = [i * 5 for i in y_axis], color = "skyblue", label='Boston', alpha=.8, edgecolors="k",
    #      linewidths=2)
#Pie example
#Labeling and rotating
    #plt.xlabel("Date")
    #plt.ylabel("Fare($)")
    #plt.title("PyBer Fare by Month")
    #plt.legend()
#Ticking
#Error Bars with caps

 