In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Show every row and column when a DataFrame is displayed (None = no limit)
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the cleaned bike-rental and weather-event data sets
bike_data = pd.read_csv('data/target/cleaned_bike_data.csv')
weather_data = pd.read_csv('data/target/cleaned_weather_data.csv')

# Remember the (bike, weather) row counts as loaded, before any manipulation
original_len = (len(bike_data), len(weather_data))

In [None]:
# The CSV round-trip stores timestamps as strings; convert both time columns
# of both frames back to datetime.
for frame in (bike_data, weather_data):
    for time_column in ('start_time', 'end_time'):
        frame[time_column] = pd.to_datetime(frame[time_column])

# Confirm the resulting column types
print(bike_data.dtypes, weather_data.dtypes)

In [None]:
# Period covered by the weather data: earliest event start to latest event end
first_start = weather_data['start_time'].min()
last_end = weather_data['end_time'].max()
total_duration = last_end - first_start

# Length of each individual weather event, stored as a new column on weather_data
weather_data['duration'] = weather_data['end_time'] - weather_data['start_time']

# Total time spent inside weather events.
# NOTE(review): a plain sum counts any overlapping events twice - confirm that
# events in the cleaned weather data never overlap.
sum_of_durations = weather_data['duration'].sum()

print(f" Duration of all recorded weather events: {sum_of_durations}")
print(f" Total period covered by weather events data: {total_duration}")
# The `%` format type already multiplies by 100 and appends a "%" sign, so the
# message must not add the word "percent" as well.
print(f" Bad weather events occur {sum_of_durations/total_duration:.2%} of the time")

In [None]:
# Winsorize trip durations: clamp everything outside the 1st-99th percentile
# range to the corresponding percentile value.
q_low = bike_data['trip_duration'].quantile(0.01)
q_high = bike_data['trip_duration'].quantile(0.99)
bike_data['trip_duration_winsorized'] = bike_data['trip_duration'].clip(lower=q_low, upper=q_high)


def _overlaps_any_event(rentals, events):
    """Flag each rental whose time span touches at least one weather event.

    A rental is flagged when any single event satisfies one of:
      1. the rental's start_time lies inside the event (bounds inclusive),
      2. the rental's end_time lies inside the event (bounds inclusive),
      3. the event lies strictly inside the rental.

    The events are sorted once and each condition is answered with a binary
    search, instead of scanning every event for every rental.

    Parameters
    ----------
    rentals, events : pandas.DataFrame
        Frames with datetime columns 'start_time' and 'end_time'.

    Returns
    -------
    pandas.Series
        Boolean mask aligned to ``rentals.index``.
    """
    # An event with a missing bound can never satisfy any of the comparisons.
    events = events.dropna(subset=['start_time', 'end_time']).sort_values('start_time')
    if events.empty:
        return pd.Series(False, index=rentals.index)

    ev_start = events['start_time'].to_numpy(dtype='datetime64[ns]')
    ev_end = events['end_time'].to_numpy(dtype='datetime64[ns]')
    rent_start = rentals['start_time'].to_numpy(dtype='datetime64[ns]')
    rent_end = rentals['end_time'].to_numpy(dtype='datetime64[ns]')

    # latest_end_so_far[k]: latest end among the k + 1 earliest-starting events.
    latest_end_so_far = np.maximum.accumulate(ev_end)
    # earliest_end_from[k]: earliest end among events k, k + 1, ...; the extra
    # trailing NaT stands for "no events left" and compares False to anything.
    earliest_end_from = np.append(
        np.minimum.accumulate(ev_end[::-1])[::-1], np.datetime64('NaT', 'ns')
    )

    def _inside_some_event(moments):
        # Events that have started by each moment form a prefix of the sorted
        # list; the moment is covered when the latest end in that prefix has
        # not passed yet.
        n_started = np.searchsorted(ev_start, moments, side='right')
        latest_end = latest_end_so_far[np.maximum(n_started - 1, 0)]
        return (n_started > 0) & (latest_end >= moments)

    # Events starting strictly after the rental start form a suffix of the
    # sorted list; one of them lies strictly inside the rental when the
    # earliest end in that suffix comes before the rental's end.
    first_later_event = np.searchsorted(ev_start, rent_start, side='right')
    contains_whole_event = earliest_end_from[first_later_event] < rent_end

    touched = _inside_some_event(rent_start) | _inside_some_event(rent_end) | contains_whole_event
    return pd.Series(touched, index=rentals.index)


# Boolean mask of bike rentals that had any overlap with a weather event
overlap_mask = _overlaps_any_event(bike_data, weather_data)

# Split the rentals using the mask
rentals_with_bad_weather = bike_data[overlap_mask]
rentals_with_good_weather = bike_data[~overlap_mask]

print(f" Average hire duration during GOOD weather: {rentals_with_good_weather['trip_duration_winsorized'].mean():.2f}")
print(f" Average hire duration during BAD weather: {rentals_with_bad_weather['trip_duration_winsorized'].mean():.2f}")

# Number of rentals in each group
weather_len = rentals_with_bad_weather.shape[0]
non_weather_len = rentals_with_good_weather.shape[0]

print(f" {weather_len} rentals had wet weather during the rental period; {non_weather_len} rentals had no wet weather")
# The `%` format type already appends a "%" sign, so no "percent" suffix here.
print(f" Rentals are affected by weather {weather_len/(weather_len+non_weather_len):.2%} of the time")


In [None]:
# Label every Subscriber rental with the rider's generation, then group by it

# Inclusive (first, last) birth-year range for each generation
generations = {
    'Baby Boomer Generation': (1946, 1964),
    'Generation X': (1965, 1979),
    'Millennials': (1980, 1994),
    'Generation Z': (1995, 2012),
}

# Work on an independent copy holding only Subscriber rentals
is_subscriber = bike_data['user_type'] == 'Subscriber'
subscriber_bike_data = bike_data[is_subscriber].copy()


def get_generation(birth_year):
    """Return the generation whose inclusive year range contains birth_year.

    Falls back to 'Unknown' when no range in `generations` matches.
    """
    return next(
        (
            label
            for label, (first_year, last_year) in generations.items()
            if first_year <= birth_year <= last_year
        ),
        'Unknown',
    )


subscriber_bike_data['generation'] = subscriber_bike_data['birth_year'].apply(get_generation)

generation_groups = subscriber_bike_data.groupby('generation')


In [None]:
# Count Subscriber rentals per generation, smallest group first
group_sizes = generation_groups.size().sort_values()
print('\n Generation group sizes (Subscribers)')
print(group_sizes)

# Bar chart: one bar per generation
fig = px.bar(
    x=group_sizes.index,
    y=group_sizes.values,
    title='Subscriber Rentals by Generation',
)

# Axis titles, with the x tick labels rotated by 45 degrees
axis_settings = {
    'xaxis_title': 'Generation',
    'yaxis_title': 'Number of Subscriber Rentals',
    'xaxis_tickangle': -45,
}
fig.update_layout(**axis_settings)

fig.show()

In [None]:
# Mean Subscriber trip duration per generation, on the raw (NOT winsorized) values
avg_trip_duration_non_winsorized = generation_groups['trip_duration'].mean()
print("\n Non-winsorized average trip duration by generation:")
print(avg_trip_duration_non_winsorized)

# The same mean, computed on the winsorized values
avg_trip_duration_winsorized = generation_groups['trip_duration_winsorized'].mean()
print("\n Winsorized average trip duration by generation:")
print(avg_trip_duration_winsorized)

# Mean winsorized trip duration for each individual birth year
winsorized_by_birth_year = subscriber_bike_data.groupby('birth_year')['trip_duration_winsorized']
birth_year_groups = winsorized_by_birth_year.mean().reset_index()

# Scatter plot of those means with an OLS trend line
# NOTE(review): the axis label states minutes - confirm the unit of trip_duration
axis_labels = {
    'birth_year': 'Birth Year',
    'trip_duration_winsorized': 'Average Trip Duration (Minutes)',
}
fig = px.scatter(
    birth_year_groups,
    x='birth_year',
    y='trip_duration_winsorized',
    title='Average Trip Duration by Subscriber Birth Year',
    labels=axis_labels,
    trendline='ols',
)

fig.show()

In [None]:
# Share of Subscriber rentals falling in each duration bucket, per generation

# Bucket edges and the label for each bucket
duration_bins = [0, 25, 35, 45, np.inf]
duration_labels = ['Up to 25 min', '25-35 min', '35-45 min', '45 min and over']

# Assign every rental to a bucket based on its 'trip_duration'
subscriber_bike_data['duration_category'] = pd.cut(
    subscriber_bike_data['trip_duration'],
    bins=duration_bins,
    labels=duration_labels,
    include_lowest=True,
)

# Rentals per (generation, bucket) as a percentage of all rentals of that generation
rentals_per_bucket = subscriber_bike_data.groupby(
    ['generation', 'duration_category'], observed=True
).size()
rentals_per_generation = subscriber_bike_data.groupby('generation').size()
duration_percentages = rentals_per_bucket / rentals_per_generation * 100

# One row per generation, one column per bucket
print("\n Results for the first set of bin labels:")
print(duration_percentages.unstack(level=1))


In [None]:
# Repeat the duration breakdown with a second, finer set of bins
duration_bins = [0, 5, 15, 25, np.inf]
duration_labels = ['Up to 5 min', '5-15 min', '15-25 min', '25 min and over']

# Re-bin the 'trip_duration' column; this overwrites 'duration_category'
subscriber_bike_data['duration_category'] = pd.cut(subscriber_bike_data['trip_duration'], bins=duration_bins, labels=duration_labels, include_lowest=True)

# Calculate the percentage of rentals in each duration category for each generation group
duration_percentages = (
    subscriber_bike_data
    .groupby(['generation', 'duration_category'], observed=True)
    .size() / subscriber_bike_data.groupby('generation').size() * 100
)

# Print the result for the second set of bin labels (the heading previously
# said "first", which mislabelled this output)
print("\n Results for the second set of bin labels:")
print(duration_percentages.unstack(level=1))

In [None]:
# Ride counts per user_type; the grouping is reused for the per-type averages
print('\n Total number of rides by user_type')
user_type_groups = bike_data.groupby('user_type')
rides_per_user_type = user_type_groups.size()
print(rides_per_user_type)

In [None]:
# Mean rental length per user_type: first on the raw durations, then on the
# winsorized ones. avg_trip_duration ends up holding the winsorized means.
duration_variants = (
    ('non-winsorized', 'trip_duration'),
    ('winsorized', 'trip_duration_winsorized'),
)
for variant_name, duration_column in duration_variants:
    print(f'\n Average {variant_name} trip_duration by user_type')
    avg_trip_duration = user_type_groups[duration_column].mean()
    print(avg_trip_duration)


In [None]:
def _print_top_stations(popularity_by_generation, station_column, count_column, top_n=10):
    """Print the top_n stations with the highest count for every generation group."""
    for generation_label, station_counts in popularity_by_generation:
        print(f"\n Generation: {generation_label}")
        busiest = station_counts.nlargest(top_n, count_column)
        print(busiest[[station_column, count_column]].to_string(index=False))


# DEPARTURE KIOSK popularity: rentals per (generation, departure station)
dept_station_generation_groups = subscriber_bike_data.groupby(['generation', 'from_station_name'])
dept_station_counts = dept_station_generation_groups.size().reset_index(name='departure_count')

# Order by generation, busiest station first, then regroup per generation
dept_station_ranked = dept_station_counts.sort_values(
    by=['generation', 'departure_count'], ascending=[True, False]
)
dept_station_popularity_by_generation = dept_station_ranked.groupby('generation')

# Top 10 departure kiosks for each generation
print("\n Top 10 DEPARTURE KIOSKS for each Generation:")
_print_top_stations(dept_station_popularity_by_generation, 'from_station_name', 'departure_count')

# DESTINATION KIOSK popularity: rentals per (generation, arrival station)
arr_station_generation_groups = subscriber_bike_data.groupby(['generation', 'to_station_name'])
arr_station_counts = arr_station_generation_groups.size().reset_index(name='arrival_count')
arr_station_ranked = arr_station_counts.sort_values(
    by=['generation', 'arrival_count'], ascending=[True, False]
)
arr_station_popularity_by_generation = arr_station_ranked.groupby('generation')

# Top 10 destination kiosks for each generation
print("\n Top 10 DESTINATION KIOSKS for each Generation:")
_print_top_stations(arr_station_popularity_by_generation, 'to_station_name', 'arrival_count')

In [None]:
# One destination-station heatmap per generation defined in `generations`
for generation, years in generations.items():
    # Subscriber rentals belonging to the current generation only
    in_current_generation = subscriber_bike_data['generation'] == generation
    filtered_data = subscriber_bike_data[in_current_generation]

    # Density heatmap of destination stations for this generation
    fig = px.density_heatmap(
        filtered_data,
        x='generation',
        y='to_station_name',
        nbinsx=2,
    )
    heatmap_layout = {
        'title': f'Destination Popularity for {generation}',
        'xaxis_title': 'Generation',
        'yaxis_title': 'Station',
    }
    fig.update_layout(**heatmap_layout)

    # Render the figure
    fig.show()

In [None]:
# Compare the current row counts with the counts recorded right after loading
original_len_bike, original_len_weather = original_len
removed_bike_records = original_len_bike - len(bike_data)
removed_weather_records = original_len_weather - len(weather_data)
print(f"{removed_bike_records} bike records and {removed_weather_records} weather records have been removed")