In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
import h3
import geopandas as gpd
import geojson

**TODO:** seasonal effects, taxi speed, drop-off/pickup counts, price-per-mile outliers?, idle time

In [None]:
# Columns that must be present in the CSV.
REQUIRED_COLUMNS = [
    'taxi_id',
    'trip_start_timestamp',
    'trip_end_timestamp',
    'trip_seconds',
    'trip_miles',
    'pickup_census_tract',
    'dropoff_census_tract',
    'pickup_community_area',
    'dropoff_community_area',
    'fare',
    'tips',
    'tolls',
    'extras',
    'trip_total',
    'payment_type',
    'company',
    'pickup_centroid_location',
    'dropoff_centroid_location',
    'h3_res_4_pickup',
    'h3_res_4_dropoff',
    'h3_res_6_pickup',
    'h3_res_6_dropoff',
    'h3_res_8_pickup',
    'h3_res_8_dropoff',
    'temperature',
    'precipitation',
]

# Later cells read these three columns, but the original load never selected
# them, so a fresh "Run All" failed with KeyError. They are loaded when the CSV
# provides them and derived from the raw columns otherwise.
DERIVED_COLUMNS = ['trip_middle_timestamp', 'price_per_mile', 'miles_per_hour']

_wanted_columns = set(REQUIRED_COLUMNS) | set(DERIVED_COLUMNS)

df = pd.read_csv(
    "../data/analytics_trip_data.csv",
    index_col=None,
    dtype={
        'payment_type': 'category',
        'company': 'category'
    },
    # A callable (unlike a list/set) does not raise when an optional column is absent.
    usecols=lambda column_name: column_name in _wanted_columns,
    parse_dates=['trip_start_timestamp', 'trip_end_timestamp'])

# Keep the strictness of a list-style `usecols`: required columns must exist.
_missing_columns = sorted(set(REQUIRED_COLUMNS) - set(df.columns))
if _missing_columns:
    raise ValueError(f"analytics_trip_data.csv is missing expected columns: {_missing_columns}")

if 'trip_middle_timestamp' in df.columns:
    df['trip_middle_timestamp'] = pd.to_datetime(df['trip_middle_timestamp'])
else:
    # Midpoint between trip start and trip end.
    df['trip_middle_timestamp'] = (
        df['trip_start_timestamp']
        + (df['trip_end_timestamp'] - df['trip_start_timestamp']) / 2
    )

if 'miles_per_hour' not in df.columns:
    # Left as NaN for zero-length trips instead of dividing by zero.
    df['miles_per_hour'] = (
        df['trip_miles'] / (df['trip_seconds'] / 3600)
    ).where(df['trip_seconds'] > 0)

if 'price_per_mile' not in df.columns:
    # NOTE(review): assumes "price per mile" means fare / trip_miles
    # (not trip_total / trip_miles) - confirm against the preprocessing step.
    df['price_per_mile'] = (df['fare'] / df['trip_miles']).where(df['trip_miles'] > 0)

In [None]:
# Lift pandas' column-display limit so wide frames render every column,
# then show summary statistics of the loaded data.
pd.options.display.max_columns = None
df.describe()

In [None]:
# Preview the first rows of the loaded trip data.
df.head()

## Choropleth

In [None]:
def create_choropleth(dataframe,h3geocol,datacol,legend_name):
    """Build a folium choropleth of the per-hexagon mean of ``datacol``.

    Rows of ``dataframe`` are grouped by the H3 cell id found in ``h3geocol``;
    the mean of ``datacol`` within each group determines the fill colour of
    that cell's hexagon.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``h3geocol`` and ``datacol`` columns.
    h3geocol : str
        Name of the column holding H3 cell ids.
    datacol : str
        Name of the column that is averaged per cell.
    legend_name : str
        Caption shown next to the colour scale.

    Returns
    -------
    folium.Map
        Map centred on (41.8781, -87.6298) with the choropleth layer and a
        hover tooltip showing the cell id and the mean value.
    """
    cell_means = dataframe.groupby(dataframe[h3geocol])[datacol].mean().reset_index()
    base_map = folium.Map(location=[41.8781, -87.6298], zoom_start=10)

    # One GeoJSON polygon per H3 cell; the properties carry the join key
    # (cell id) and the value displayed in the tooltip.
    hexagons = [
        {
            'type': 'Feature',
            'geometry': {
                'type': 'Polygon',
                'coordinates': [h3.h3_to_geo_boundary(cell[h3geocol], geo_json=True)],
            },
            'properties': {f'{h3geocol}': cell[h3geocol], f'{datacol}': cell[datacol]},
        }
        for _, cell in cell_means.iterrows()
    ]
    feature_collection = {'type': 'FeatureCollection', 'features': hexagons}

    layer = folium.Choropleth(
        geo_data=feature_collection,
        data=cell_means,
        columns=[h3geocol, datacol],
        key_on=f'feature.properties.{h3geocol}',
        fill_color='YlGnBu',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=legend_name,
    )
    layer.add_to(base_map)

    # Attach the tooltip to the choropleth's own GeoJSON layer.
    hover = folium.GeoJsonTooltip([h3geocol,datacol])
    hover.add_to(layer.geojson)

    return base_map

def create_choropleth_size(dataframe, h3geocol, legend_name):
    """Build a folium choropleth of the number of rows (rides) per H3 hexagon.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``h3geocol`` column; every row counts as one ride.
    h3geocol : str
        Name of the column holding H3 cell ids.
    legend_name : str
        Caption for the colour scale; also used as the tooltip label of the
        ride count.

    Returns
    -------
    folium.Map
        Map centred on (41.8781, -87.6298) with the choropleth layer and a
        hover tooltip showing the cell id and its ride count.
    """
    # Count the number of rides per H3 hexagon
    grouped_data = dataframe.groupby(h3geocol).size().reset_index(name='total_rides')

    # Create a base map
    m = folium.Map(location=[41.8781, -87.6298], zoom_start=10)

    # Generate one GeoJSON feature per H3 hexagon
    geojson_data = {
        'type': 'FeatureCollection',
        'features': []
    }
    for _, row in grouped_data.iterrows():
        geometry = {
            'type': 'Polygon',
            'coordinates': [h3.h3_to_geo_boundary(row[h3geocol], geo_json=True)]
        }
        feature = {
            'type': 'Feature',
            'geometry': geometry,
            'properties': {
                f'{h3geocol}': row[h3geocol],
                # Plain int so the property is always JSON-serialisable.
                'total_rides': int(row['total_rides'])
            }
        }
        geojson_data['features'].append(feature)

    # Create the choropleth map
    choropleth = folium.Choropleth(
        geo_data=geojson_data,
        data=grouped_data,
        columns=[h3geocol, 'total_rides'],
        key_on=f'feature.properties.{h3geocol}',
        fill_color='YlGnBu',
        fill_opacity=0.7,
        line_opacity=0,  # Remove outlines
        legend_name=legend_name
    ).add_to(m)

    # Attach the tooltip to the choropleth's own GeoJSON layer. Adding a second
    # folium.GeoJson layer (as before) drew default-styled polygons on top of the
    # choropleth, tinting the colours and restoring the outlines removed above.
    folium.GeoJsonTooltip(
        fields=[h3geocol, 'total_rides'],
        aliases=['H3 ID:', legend_name]
    ).add_to(choropleth.geojson)

    return m

In [None]:
# Ride counts per drop-off hexagon. The legend now names the counted quantity;
# it previously read "Average Total trip seconds", which this map does not show.
create_choropleth_size(df,"h3_res_8_dropoff","Number of rides")

In [None]:
# Mean trip_seconds per pickup hexagon.
create_choropleth(
    dataframe=df,
    h3geocol="h3_res_8_pickup",
    datacol="trip_seconds",
    legend_name="Average Total trip seconds",
)

In [None]:
# Mean price_per_mile per drop-off hexagon.
create_choropleth(
    dataframe=df,
    h3geocol="h3_res_8_dropoff",
    datacol="price_per_mile",
    legend_name="Average price per mile",
)

In [None]:
# Mean tips per pickup hexagon.
create_choropleth(
    dataframe=df,
    h3geocol="h3_res_8_pickup",
    datacol="tips",
    legend_name="Average tips",
)

## Heatmap

In [None]:
# Work on a copy so the hour/weekday helper columns added below do not end up in df.
tempdf = df.copy()

In [None]:
# Mean trip_seconds grouped by the hour and by the weekday number of the trip midpoint.
hourly = tempdf["trip_seconds"].groupby(df["trip_middle_timestamp"].dt.hour).mean().reset_index()
weekdaily = tempdf["trip_seconds"].groupby(df["trip_middle_timestamp"].dt.weekday).mean().reset_index()

In [None]:
# Add new columns for the hour of day and the weekday name of the trip midpoint;
# the heatmap pivot tables below use them as columns and index.
tempdf['hour'] = df['trip_middle_timestamp'].dt.hour
tempdf['weekday'] = df['trip_middle_timestamp'].dt.day_name()

In [None]:
# Number of rides per weekday and hour of day ('size' counts rows per cell).
pivot_table_trip_count = tempdf.pivot_table(index='weekday', columns='hour', values='trip_seconds', aggfunc='size')
# pivot_table sorts weekday names alphabetically; restore calendar order.
pivot_table_trip_count = pivot_table_trip_count.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

# Plot the heatmap. The title and labels make it distinguishable from the other
# weekday/hour heatmaps, which share the same axes and colour map.
sns.heatmap(pivot_table_trip_count, cmap='viridis')
plt.title('Number of Rides by Weekday and Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Weekday')
plt.show()

In [None]:
# Mean trip_seconds per weekday and hour of day.
pivot_table_trip_seconds = tempdf.pivot_table(index='weekday', columns='hour', values='trip_seconds', aggfunc='mean')
# pivot_table sorts weekday names alphabetically; restore calendar order.
pivot_table_trip_seconds = pivot_table_trip_seconds.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

# The title and labels make it distinguishable from the other weekday/hour heatmaps.
sns.heatmap(pivot_table_trip_seconds, cmap='viridis')
plt.title('Average Trip Seconds by Weekday and Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Weekday')
plt.show()

In [None]:
# Mean tips per weekday and hour of day.
# NOTE: despite its name, pivot_table_count holds average tips, not a ride count;
# the name is kept so any other cell that refers to it keeps working.
pivot_table_count = tempdf.pivot_table(index='weekday', columns='hour',values="tips",aggfunc='mean')
# pivot_table sorts weekday names alphabetically; restore calendar order.
pivot_table_count = pivot_table_count.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

# The title and labels make it distinguishable from the other weekday/hour heatmaps.
sns.heatmap(pivot_table_count, cmap='viridis')
plt.title('Average Tips by Weekday and Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Weekday')
plt.show()

## Idle time

In [None]:
# Order each taxi's trips chronologically so "next trip" is well defined.
df_sorted = df.sort_values(['taxi_id', 'trip_start_timestamp'])

# Idle time = start of the same taxi's next trip minus the end of this trip.
# The last trip of every taxi has no successor and therefore gets NaT.
df_sorted['idle_time'] = (
    df_sorted.groupby('taxi_id')['trip_start_timestamp'].shift(-1)
    - df_sorted['trip_end_timestamp']
)

# Mean idle time per taxi, with taxi_id kept as a regular column.
idledf = df_sorted.groupby('taxi_id', as_index=False)['idle_time'].mean()
idledf

In [None]:
# Show the column dtypes of the per-taxi idle-time summary.
print(idledf.dtypes)

## Correlation

In [None]:
# Work on a copy so the date_hour column added below does not end up in df.
corrdf = df.copy()

In [None]:
# Bucket every trip into the calendar hour of its midpoint.
corrdf['date_hour'] = corrdf['trip_middle_timestamp'].dt.floor('h')

# Per-hour ride count and per-hour means of the quantities compared below.
hourly_rides = corrdf.groupby('date_hour').size().reset_index(name='num_rides')
hourly_precipitation = corrdf.groupby('date_hour', as_index=False)['precipitation'].mean()
hourly_temperature = corrdf.groupby('date_hour', as_index=False)['temperature'].mean()
hourly_price_per_mile = corrdf.groupby('date_hour', as_index=False)['price_per_mile'].mean()
hourly_trip_total = corrdf.groupby('date_hour', as_index=False)['trip_total'].mean()

# Join the per-hour frames on date_hour into one table (one row per hour).
hourly_data = (
    hourly_rides
    .merge(hourly_precipitation, on='date_hour')
    .merge(hourly_temperature, on='date_hour')
    .merge(hourly_price_per_mile, on='date_hour')
    .merge(hourly_trip_total, on='date_hour')
)

In [None]:
# Hourly ride count against hourly mean precipitation, with a linear trend line.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=hourly_data, x='precipitation', y='num_rides', alpha=0.5)
# scatter=False: draw only the regression line over the points plotted above.
sns.regplot(data=hourly_data, x='precipitation', y='num_rides', scatter=False, color='red')

plt.gca().set(
    title='Hourly Taxi Rides vs. Precipitation',
    xlabel='Average Hourly Precipitation',
    ylabel='Number of Taxi Rides',
)
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Feature column (n, 1) and target vector (n,) taken from the hourly table.
X = hourly_data['temperature'].to_numpy().reshape(-1, 1)
y = hourly_data['num_rides'].to_numpy()

# Expand temperature into polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)

# Ordinary least squares on the polynomial terms (fit returns the estimator itself).
model = LinearRegression().fit(X_poly, y)

# In-sample predictions, stored next to the observations for plotting.
y_poly_pred = model.predict(X_poly)
hourly_data['num_rides_poly_pred'] = y_poly_pred

In [None]:
# Hourly ride count against hourly mean temperature, with the fitted polynomial.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=hourly_data, x='temperature', y='num_rides', alpha=0.5, label='Data')

# Draw the fit in ascending temperature order so the line does not zig-zag.
sorted_data = hourly_data.sort_values('temperature')
plt.plot(
    sorted_data['temperature'],
    sorted_data['num_rides_poly_pred'],
    color='red',
    label='Polynomial Fit',
)

plt.gca().set(
    title='Hourly Taxi Rides vs. Temperature with Polynomial Fit',
    xlabel='Average Hourly Temperature',
    ylabel='Number of Taxi Rides',
)
plt.legend()
plt.show()

In [None]:
# Extract hour of day from date_hour. hourly_data has one row per calendar hour,
# so every hour of day has many observations and lineplot aggregates them.
hourly_data['hour'] = hourly_data['date_hour'].dt.hour

# Create the figure with a primary (left) axis
fig, ax1 = plt.subplots(figsize=(14, 7))

# Plot number of rides on the left axis
color = 'tab:blue'
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Number of Rides', color=color)
# legend=False: seaborn would otherwise draw one legend per axis and the two can
# overlap; a single combined legend is built below instead.
sns.lineplot(x='hour', y='num_rides', data=hourly_data, ax=ax1, color=color, label='Number of Rides', legend=False)
ax1.tick_params(axis='y', labelcolor=color)

# Plot price per mile on a secondary (right) axis sharing the same x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Price per Mile', color=color)
sns.lineplot(x='hour', y='price_per_mile', data=hourly_data, ax=ax2, color=color, label='Price per Mile', legend=False)
ax2.tick_params(axis='y', labelcolor=color)

# One legend covering the lines of both axes
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper left')

# Add title and show plot
plt.title('24-Hour Visualization of Number of Rides and Price per Mile')
plt.show()

## Grouped by taxi / company

In [None]:
# Per-taxi summary: mean speed, mean price per mile, number of trips
# ('size' of the group, renamed to drive_count) and the first company seen.
grouped_taxi_id = (
    df.groupby('taxi_id')
    .agg({
        'miles_per_hour': 'mean',
        'price_per_mile': 'mean',
        'taxi_id': 'size',
        'company': 'first',
    })
    .rename(columns={'taxi_id': 'drive_count'})
)

# Per-company summary with the same measures.
grouped_company = (
    df.groupby('company')
    .agg({
        'miles_per_hour': 'mean',
        'price_per_mile': 'mean',
        'taxi_id': 'size',
    })
    .rename(columns={'taxi_id': 'drive_count'})
)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))

# Top row: one point per taxi; bottom row: one point per company.
# Each entry: (axis, data, y column, title, y-axis label).
panels = [
    (axes[0, 0], grouped_taxi_id, 'price_per_mile',
     'Correlation between Number of Drives and Price per Mile (Taxi ID)', 'Price Per Mile'),
    (axes[0, 1], grouped_taxi_id, 'miles_per_hour',
     'Correlation between Number of Drives and Average Speed (Taxi ID)', 'Miles Per Hour'),
    (axes[1, 0], grouped_company, 'price_per_mile',
     'Correlation between Number of Drives and Price per Mile (Company)', 'Price Per Mile'),
    (axes[1, 1], grouped_company, 'miles_per_hour',
     'Correlation between Number of Drives and Average Speed (Company)', 'Miles Per Hour'),
]
for panel_ax, panel_data, y_column, panel_title, y_label in panels:
    sns.scatterplot(data=panel_data, x='drive_count', y=y_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of Drives')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()



In [None]:
# Trips whose miles_per_hour exceeds 80.
df.loc[df["miles_per_hour"] > 80]

In [None]:
# Flag price_per_mile outliers with the 1.5 x IQR rule: anything below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
Q1, Q3 = df['price_per_mile'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outliers = df[df['price_per_mile'].lt(lower_bound) | df['price_per_mile'].gt(upper_bound)]

## Number of rides over time

In [None]:
# Calendar day of each trip's midpoint, as a daily period.
df['year_daily'] = df['trip_middle_timestamp'].dt.to_period('D')

# Rides per day in chronological order, plus a 7-day rolling mean
# (the first six days have no full window and are NaN).
daily_counts = df['year_daily'].value_counts().sort_index()
daily_counts_rolling = daily_counts.rolling(7).mean()

plt.figure(figsize=(12, 6))
# Faint raw series behind the smoothed dashed line.
daily_counts.plot(alpha=0.15, label='Daily Counts')
daily_counts_rolling.plot(linestyle='--', label='Rolling Average (1 Week)')
plt.gca().set(
    title='Number of Rides with Rolling Average',
    xlabel='Day',
    ylabel='Number of Rides',
)
plt.legend()
plt.grid(True)
plt.show()