In [None]:
import numpy as np
import pandas as pd

# Lift pandas' display limits so previews show every row and column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the cleaned bike-rental and weather CSV files
bike_data = pd.read_csv('data/target/cleaned_bike_data.csv')
weather_data = pd.read_csv('data/target/cleaned_weather_data.csv')

# Snapshot the row counts as a (bike, weather) tuple before any manipulation
original_len = (len(bike_data), len(weather_data))

In [None]:
# Timestamps come back from CSV as strings; parse both time columns of both
# frames into datetimes (bike data first, then weather data)
for frame in (bike_data, weather_data):
    for time_column in ('start_time', 'end_time'):
        frame[time_column] = pd.to_datetime(frame[time_column])

# Show the resulting column types of both frames
print(bike_data.dtypes, weather_data.dtypes)

In [None]:
# Create a boolean mask to filter bike rentals that started during a weather event.
# A rental counts as "during" an event when its start time lies between that
# event's start_time and end_time, both bounds inclusive. The event columns are
# looked up once here rather than once per rental inside the lambda.
weather_starts = weather_data['start_time']
weather_ends = weather_data['end_time']
start_mask = bike_data['start_time'].apply(
    # Series.any() reduces the per-event comparison without a Python-level loop
    lambda x: ((x >= weather_starts) & (x <= weather_ends)).any()
)

# Filter the bike_rentals DataFrame using the mask. `~` is the boolean
# complement (unary minus is rejected by numpy for boolean arrays) and matches
# how start_mask is negated later in the notebook.
rentals_started_during_bad_weather = bike_data[start_mask]
rentals_started_during_good_weather = bike_data[~start_mask]

# Get the number of rows
weather_len = rentals_started_during_bad_weather.shape[0]
non_weather_len = rentals_started_during_good_weather.shape[0]

print(f"{weather_len} rentals started during wet weather; {non_weather_len} started during dry weather")

# Preview the first 20 rentals that started during a weather event
rentals_started_during_bad_weather.head(20)

In [None]:
# Create a boolean mask to filter bike rentals that ended during a weather event.
# A rental counts as "during" an event when its end time lies between that
# event's start_time and end_time, both bounds inclusive. The event columns are
# looked up once here rather than once per rental inside the lambda.
weather_starts = weather_data['start_time']
weather_ends = weather_data['end_time']
end_mask = bike_data['end_time'].apply(
    # Series.any() reduces the per-event comparison without a Python-level loop
    lambda x: ((x >= weather_starts) & (x <= weather_ends)).any()
)

# Filter the bike_rentals DataFrame using the mask. `~` is the boolean
# complement (unary minus is rejected by numpy for boolean arrays) and matches
# how start_mask is negated later in the notebook.
rentals_ended_during_bad_weather = bike_data[end_mask]
rentals_ended_during_good_weather = bike_data[~end_mask]

# Get the number of rows
weather_len = rentals_ended_during_bad_weather.shape[0]
non_weather_len = rentals_ended_during_good_weather.shape[0]

print(f"{weather_len} rentals ended during wet weather; {non_weather_len} ended during dry weather")

# Preview the first 20 rentals that ended during a weather event
rentals_ended_during_bad_weather.head(20)

In [None]:
# Rentals that finished inside a weather event without having started in one:
# combine the complement of start_mask with end_mask
end_weather_bad_start_weather_good_mask = ~start_mask & end_mask

# Select the matching rentals
rentals_ended_in_bad_weather_having_started_in_good = bike_data.loc[end_weather_bad_start_weather_good_mask]

# Count them
rained_off_rentals = len(rentals_ended_in_bad_weather_having_started_in_good)

print(f"{rained_off_rentals} rentals ended during wet weather, having started in dry weather")

# Preview the first 50 matching rentals
rentals_ended_in_bad_weather_having_started_in_good.head(50)

In [None]:
# Rentals that both started and ended inside a weather event
ended_in_weather_started_in_weather_mask = start_mask & end_mask

# Select the matching rentals
bike_rentals_ended_during_weather_having_started = bike_data.loc[ended_in_weather_started_in_weather_mask]

# Count them
wet_rentals = len(bike_rentals_ended_during_weather_having_started)

print(f"{wet_rentals} rentals ended during wet weather, having started in wet weather")

# Preview the first 50 matching rentals
bike_rentals_ended_during_weather_having_started.head(50)

In [None]:
# Prepare the Subscriber rentals for grouping by generation

# Generation name -> (first birth year, last birth year); get_generation
# treats both ends of each range as inclusive
generations = dict([
    ('Baby Boomer Generation', (1946, 1964)),
    ('Generation X', (1965, 1979)),
    ('Millennials', (1980, 1994)),
    ('Generation Z', (1995, 2012)),
])

# Keep only Subscriber rentals; copy so that adding columns later does not
# write into a slice of bike_data
is_subscriber = bike_data['user_type'].eq('Subscriber')
subscriber_bike_data = bike_data.loc[is_subscriber].copy()

# Create a new column 'generation' based on birth_year
def get_generation(birth_year, generation_ranges=None):
    """Return the name of the generation a birth year falls into.

    Parameters
    ----------
    birth_year : number
        Year of birth to classify.
    generation_ranges : mapping of str to (first_year, last_year), optional
        Year ranges keyed by generation name; both ends are inclusive.
        When omitted, the notebook-level ``generations`` mapping is used,
        so existing one-argument calls behave as before.

    Returns
    -------
    str
        The first generation (in mapping order) whose range contains
        ``birth_year``, or ``'Unknown'`` when no range matches. A NaN
        birth year compares false against every bound and therefore also
        yields ``'Unknown'``.
    """
    if generation_ranges is None:
        generation_ranges = generations
    for gen, (first_year, last_year) in generation_ranges.items():
        if first_year <= birth_year <= last_year:
            return gen
    return 'Unknown'

# Label every Subscriber rental with the generation of its birth_year
birth_years = subscriber_bike_data['birth_year']
subscriber_bike_data['generation'] = birth_years.map(get_generation)

# Grouped view reused by the per-generation summaries that follow
generation_groups = subscriber_bike_data.groupby('generation')


In [None]:
# Number of Subscriber rentals by generation.
# generation_groups groups subscriber_bike_data by its 'generation' column, so
# size() yields one row count per generation label (including 'Unknown', the
# label get_generation returns when no range matches).
group_sizes = generation_groups.size()
print(group_sizes)

In [None]:
# Mean trip_duration of Subscriber rentals within each generation
trip_durations_by_generation = generation_groups['trip_duration']
avg_trip_duration = trip_durations_by_generation.mean()
print(avg_trip_duration)

In [None]:
# Percentage of Subscriber rentals, subdivided by duration in each age group

# Define the duration categories. pd.cut closes each bin on the right by
# default, so '25-35 min' covers (25, 35]; include_lowest=True below also
# admits a duration of exactly 0 into the first bin.
# NOTE(review): the labels assume trip_duration is measured in minutes - confirm.
duration_bins = [0, 25, 35, 45, np.inf]
duration_labels = ['Up to 25 min', '25-35 min', '35-45 min', '45 min and over']

# Bin the 'trip_duration' column into the defined categories
subscriber_bike_data['duration_category'] = pd.cut(
    subscriber_bike_data['trip_duration'],
    bins=duration_bins,
    labels=duration_labels,
    include_lowest=True,
)

# Count rentals per (generation, duration category). 'duration_category' is
# categorical, so observed=False is spelled out: it keeps categories that have
# no rentals in a generation (as zero counts) and avoids relying on a groupby
# default that newer pandas releases warn about and plan to change.
category_counts = subscriber_bike_data.groupby(
    ['generation', 'duration_category'], observed=False
).size()

# Total rentals per generation; the division below aligns on 'generation'
generation_totals = subscriber_bike_data.groupby('generation').size()

# Calculate the percentage of rentals in each duration category for each generation group
duration_percentages = category_counts / generation_totals * 100

# Print the result for the first set of bin labels, one column per category
print("Results for the first set of bin labels:")
print(duration_percentages.unstack(level=1))


In [None]:
# Repeat with bins adjusted. pd.cut closes each bin on the right by default,
# so '5-15 min' covers (5, 15]; include_lowest=True below also admits a
# duration of exactly 0 into the first bin.
# NOTE(review): the labels assume trip_duration is measured in minutes - confirm.
duration_bins = [0, 5, 15, 25, np.inf]
duration_labels = ['Up to 5 min', '5-15 min', '15-25 min', '25 min and over']

# Bin the 'trip_duration' column into the defined categories (this overwrites
# the 'duration_category' column with the new binning)
subscriber_bike_data['duration_category'] = pd.cut(
    subscriber_bike_data['trip_duration'],
    bins=duration_bins,
    labels=duration_labels,
    include_lowest=True,
)

# Count rentals per (generation, duration category). 'duration_category' is
# categorical, so observed=False is spelled out: it keeps categories that have
# no rentals in a generation (as zero counts) and avoids relying on a groupby
# default that newer pandas releases warn about and plan to change.
category_counts = subscriber_bike_data.groupby(
    ['generation', 'duration_category'], observed=False
).size()

# Total rentals per generation; the division below aligns on 'generation'
generation_totals = subscriber_bike_data.groupby('generation').size()

# Calculate the percentage of rentals in each duration category for each generation group
duration_percentages = category_counts / generation_totals * 100

# Print the result for the second set of bin labels, one column per category
print("Results for the second set of bin labels:")
print(duration_percentages.unstack(level=1))

In [None]:
# Rental counts for each user_type (all rentals, not only Subscribers)
user_type_groups = bike_data.groupby('user_type')
rentals_per_user_type = user_type_groups.size()
print(rentals_per_user_type)

# Preview the first 50 rentals
bike_data.head(50)

In [None]:
# Mean trip_duration for each user_type; this rebinds avg_trip_duration,
# which previously held the per-generation means
trip_durations_by_user_type = user_type_groups['trip_duration']
avg_trip_duration = trip_durations_by_user_type.mean()
print(avg_trip_duration)



In [None]:
# Kiosk usage: number of rentals departing from each station (all user types)
from_station_groups = bike_data.groupby('from_station_name')
departures_per_station = from_station_groups.size()
print(departures_per_station)

# Subscriber kiosk popularity, printed for two groupings in turn:
#   1. departures, grouped by generation, then station
#   2. arrivals, grouped by station, then generation
# station_generation_groups is rebound on each pass and ends up holding the
# arrival grouping.
for grouping_keys in (
    ['generation', 'from_station_name'],
    ['to_station_name', 'generation'],
):
    station_generation_groups = subscriber_bike_data.groupby(grouping_keys)
    print(station_generation_groups.size())

# Remaining deliverables noted by the author (not produced by this cell):
# - Recommendations on which kiosks and age categories should be chosen to trial targeted advertisements.
# - A list of recommendations on how the data quality can be improved, and any other information or insights you think is relevant.

In [None]:
# Destination popularity: number of rentals arriving at each station
to_station_groups = bike_data.groupby('to_station_name')
arrivals_per_station = to_station_groups.size()
print(arrivals_per_station)

In [None]:
# original_len holds the (bike, weather) row counts captured right after loading
original_len_bike = original_len[0]
original_len_weather = original_len[1]

# Report how many rows each frame has lost since then.
# NOTE(review): no cell shown in this notebook drops rows from bike_data or
# weather_data, so both differences are expected to be 0 - confirm.
print(f"{original_len_bike - len(bike_data)} bike records and {original_len_weather - len(weather_data)} weather records have been removed")