In [None]:
# Add Matplotlib inline magic command
%matplotlib inline

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Input files
# Both paths are relative to the notebook's working directory.
ride_data_to_load = "Resources/ride_data.csv"
city_data_to_load = "Resources/city_data.csv"

In [None]:
# Load the city CSV into a DataFrame, then preview its first ten rows.
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
city_data_df.head(n=10)

In [None]:
# Load the ride CSV into a DataFrame, then preview its first ten rows.
ride_data_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)
ride_data_df.head(n=10)

In [None]:
# Inspect the city data: number of non-null entries in each column.
city_data_df.count(axis="index")

In [None]:
# Number of missing (null) values in each column of the city data.
city_data_df.isna().sum()

In [None]:
# Confirm that the driver_count column is numerical, since mathematical calculations will be performed on it.
# Get the data types of each column.
city_data_df.dtypes

In [None]:
# List the distinct values found in the "type" column of the city data.
pd.unique(city_data_df["type"])

In [None]:
# Count the rows whose city type is "Urban".
# The comparison yields a boolean Series; its vectorised .sum() counts the True
# values without looping over the elements in Python.
(city_data_df["type"] == "Urban").sum()

In [None]:
# Inspect the ride data: number of non-null entries in each column.
ride_data_df.count(axis="index")


In [None]:
# Number of missing (null) values in each column of the ride data.
ride_data_df.isna().sum()

In [None]:
# Confirm that the fare and ride_id columns are numerical, since mathematical calculations will be performed on them.
# Get the data types of each column.
ride_data_df.dtypes

In [None]:
# Merge ride_data_df and city_data_df into a single DataFrame, pyber_data_df.
# "city" is the column used as the join key, so it is named once in `on`.
# how="left" keeps every row of ride_data_df (the left table) and attaches the
# matching city_data_df columns to the right of the ride columns.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

# Preview the merged DataFrame.
pyber_data_df.head()

In [None]:
# Urban subset of the merged data: keep only the rows whose "type" is "Urban".
# The same filter is applied per city type (Urban, Suburban, Rural).
urban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Urban")]
urban_cities_df.head()

In [None]:
# Suburban and Rural subsets of the merged data, filtered on the "type" column.
rural_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Rural")]
suburban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Suburban")]

rural_cities_df.head()

In [None]:
# Number of rides per city for urban cities.
# Group the urban rows by city and count the ride_id entries in each group;
# the result is a Series indexed by city.
urban_ride_count = urban_cities_df.groupby("city")["ride_id"].count()
urban_ride_count.head()

In [None]:
# Number of rides per city for the suburban and rural cities.
rural_ride_count = rural_cities_df.groupby("city")["ride_id"].count()

suburban_ride_count = suburban_cities_df.groupby("city")["ride_id"].count()


In [None]:
# Average fare per city for urban cities.
# "fare" is selected before aggregating so that only this column is averaged.
# Calling .mean() on the whole grouped frame also tries to average text columns
# such as "type", which raises a TypeError on pandas >= 2.0.
urban_avg_fare = urban_cities_df.groupby("city")["fare"].mean()
urban_avg_fare.head()

In [None]:
# Average fare per city for the suburban and rural cities.
# "fare" is selected before aggregating so that text columns such as "type"
# are never passed to .mean() (a TypeError on pandas >= 2.0).
suburban_avg_fare = suburban_cities_df.groupby("city")["fare"].mean()
rural_avg_fare = rural_cities_df.groupby("city")["fare"].mean()

In [None]:
# Average driver_count per city for urban cities.
# "driver_count" is selected before aggregating so that text columns such as
# "type" are never passed to .mean() (a TypeError on pandas >= 2.0).
urban_driver_count = urban_cities_df.groupby("city")["driver_count"].mean()
urban_driver_count.head()

In [None]:
# Average driver_count per city for the suburban and rural cities.
# "driver_count" is selected before aggregating so that text columns such as
# "type" are never passed to .mean() (a TypeError on pandas >= 2.0).
suburban_driver_count = suburban_cities_df.groupby("city")["driver_count"].mean()
rural_driver_count = rural_cities_df.groupby("city")["driver_count"].mean()

In [None]:
# First pass at the urban scatter plot: rides per city (x) against average fare (y).
plt.scatter(x=urban_ride_count, y=urban_avg_fare)

In [None]:
# Urban scatter plot again, with marker sizes taken from each city's driver count.
plt.scatter(x=urban_ride_count, y=urban_avg_fare, s=urban_driver_count)

In [None]:
# Styled bubble chart for urban cities.
# Marker sizes are the driver counts scaled by 10; markers are coral with a
# black outline and slight transparency.
plt.scatter(
    x=urban_ride_count,
    y=urban_avg_fare,
    s=urban_driver_count * 10,
    c="coral",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Urban",
)

# Title, axis labels and grid.
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)

# Legend built from the series label given above.
plt.legend()

In [None]:
# Styled bubble chart for suburban cities.
# Marker sizes are the driver counts scaled by 10; markers are sky blue with a
# black outline and slight transparency.
plt.scatter(
    x=suburban_ride_count,
    y=suburban_avg_fare,
    s=suburban_driver_count * 10,
    c="skyblue",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Suburban",
)

# Title, axis labels and grid.
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)

# Legend built from the series label given above.
plt.legend()

In [None]:
# Styled bubble chart for rural cities.
# Marker sizes are the driver counts scaled by 10; markers are gold with a
# black outline and slight transparency.
plt.scatter(
    x=rural_ride_count,
    y=rural_avg_fare,
    s=rural_driver_count * 10,
    c="gold",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Rural",
)

# Title, axis labels and grid.
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)

# Legend built from the series label given above.
plt.legend()

In [None]:
# Combined bubble chart: one scatter series per city type on shared axes.
# Each tuple is (rides per city, average fare, driver count, fill colour, legend label).
city_type_series = [
    (urban_ride_count, urban_avg_fare, urban_driver_count, "coral", "Urban"),
    (suburban_ride_count, suburban_avg_fare, suburban_driver_count, "skyblue", "Suburban"),
    (rural_ride_count, rural_avg_fare, rural_driver_count, "gold", "Rural"),
]
for series_rides, series_fares, series_drivers, series_color, series_label in city_type_series:
    # Marker sizes are the driver counts scaled by 10 so the bubbles are easier to see.
    plt.scatter(series_rides,
          series_fares,
          s=10*series_drivers, c=series_color,
          edgecolor="black", linewidths=1,
          alpha=0.8, label=series_label)

# Title, axis labels and grid.
plt.title("PyBer Ride-Sharing Data (2019)", fontsize=20)
plt.ylabel("Average Fare ($)", fontsize=12)
plt.xlabel("Total Number of Rides (Per City)", fontsize=12)
plt.grid(True)

# Create the legend.
lgnd = plt.legend(fontsize=12, scatterpoints=1, loc="best", title="City Types")

# Give every legend marker the same fixed size so the legend does not inherit
# the data-dependent bubble sizes. Newer Matplotlib exposes the handles as
# `legend_handles`; fall back to the older `legendHandles` name when needed.
legend_markers = (lgnd.legend_handles if hasattr(lgnd, "legend_handles")
                  else lgnd.legendHandles)
for legend_marker in legend_markers:
    legend_marker.set_sizes([75])
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size (positioned in data coordinates).
plt.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize=12)

# Save the figure, creating the output folder first so savefig cannot fail on
# a missing directory.
os.makedirs("analysis", exist_ok=True)
plt.savefig("analysis/Fig1.png")

# Show the plot
plt.show()

