## Luisa Johanna Kaczmzrek
## Student ID: 

### City chosen for analysis: Chicago
### Data range of historical data: January 2023 - December 2023

## Task 1: Time Series Data Prep

In [5]:
import pandas as pd
import glob
import os

ModuleNotFoundError: No module named 'pandas'

Load the data from session_05

In [4]:
def load_trip_data(filepath_pattern):
    """
    Load and concatenate monthly trip files
    Parameters:
    filepath_pattern (str): Path pattern for trip data files
    Returns:
    pd.DataFrame: All trips concatenated
    """

    # Get all CSV files matching the pattern
    csv_files = sorted(glob.glob(filepath_pattern))
    print(f"Files found: {len(csv_files)}")
    # record exact files found (name, data range, size)
    for file in csv_files:
        print(f"File: {file}")
        print(f"Size: {os.path.getsize(file)} bytes")
        print(f"Number of records: {sum(1 for line in open(file)) - 1}")  # minus header line

    # Load and concatenate all files
    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)
        dfs.append(df)

    # Concatenate all DataFrames
    trip_data = pd.concat(dfs, ignore_index=True)

    # Parse datetime columns
    datetime_cols = ['started_at', 'ended_at']
    for col in datetime_cols:
        if col in trip_data.columns:
            trip_data[col] = pd.to_datetime(trip_data[col])

    return trip_data

In [None]:
# Load all trip data
trip_data = load_trip_data("./data/*-divvy-tripdata.csv")

Files found: 0


ValueError: No objects to concatenate

In [None]:
df = trip_data.copy()

In [None]:
df.head()

In [None]:
# Check if index is DatetimeIndex
print(isinstance(df.index, pd.DatetimeIndex))

In [None]:
# Convert to datetime (timezone naive)
df.started_at = pd.to_datetime(df.started_at)

In [None]:
df.started_at

In [None]:
# use date only, discard time and timezone
df.index = df.started_at.dt.date
df.index.name = 'date'
df.index

In [None]:
# Sort by index
df.sort_index()
df.head()

In [None]:
# Aggregate by date
agg = df.index.value_counts().sort_index()
agg

In [None]:
# Create a dataframe from the aggregation
daily_trips = agg.to_frame('count')
type(daily_trips.index)
daily_trips.index = pd.to_datetime(daily_trips.index)
daily_trips

In [None]:
daily_trips.index

# Task 2: Decomposition

### Classical Decomposition

trends, seasonality and residuals
- additive model - add the three components together
- multiplicative model - multiple the three components

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt


decomposition = seasonal_decompose(
    daily_trips,
    model='additive', # or 'multiplicative'
    period=7, # weekly pattern
    extrapolate_trend='freq' # handles edges better
)

# extract the components
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
observed = decomposition.observed

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(12, 10))

# Original time series
decomposition.observed.plot(ax=axes[0], color='black', linewidth=1)
axes[0].set_title('Original Time Series', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Trip Count', fontsize=10)
axes[0].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[0].set_xlabel('')

# Trend Component
decomposition.trend.plot(ax=axes[1], color='blue', linewidth=1)
axes[1].set_title('Trend Component', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Trend', fontsize=10)
axes[1].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[1].set_xlabel('')

# Seasonal Component (Weekly Pattern)
decomposition.seasonal.plot(ax=axes[2], color='green', linewidth=1)
axes[2].set_title('Seasonal Component (Weekly Pattern)', fontsize=12, fontweight='bold')
axes[2].set_ylabel('Seasonal', fontsize=10)
axes[2].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[2].set_xlabel('')

# Residual Component
decomposition.resid.plot(ax=axes[3], color='red', linewidth=1)
axes[3].axhline(y=0, color='grey', linestyle='--', linewidth=0.8, alpha=0.7)
axes[3].set_title('Residual Component', fontsize=12, fontweight='bold')
axes[3].set_ylabel('Residual', fontsize=10)
axes[3].set_xlabel('Date', fontsize=10)
axes[3].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)

plt.tight_layout()
plt.show()

Trend:

Usage goes up until around August and then starts to drop as it moves into the winter months

Seasonality:

Bikes are used much more during the week than on weekends, which suggests they are mainly used for commuting rather than for leisure or fun

Residuals:

Residuals show some outliers and a few periods with higher variability, meaning there are occasional spikes that the model doesn’t fully capture.

In [None]:
residual_var = decomposition.resid.var()
print(f"Model residual variance: {residual_var:.2f}")

#### Multiplicative

In [None]:
decomposition = seasonal_decompose(
    daily_trips,
    model='multiplicative',
    period=7, # weekly pattern
    extrapolate_trend='freq' # handles edges better
)

# extract the components
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
observed = decomposition.observed

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(12, 10))

# Original Time Series
decomposition.observed.plot(ax=axes[0], color='black', linewidth=1)
axes[0].set_title('Original Time Series', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Trip Count', fontsize=10)
axes[0].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[0].set_xlabel('')

# Trend Component
decomposition.trend.plot(ax=axes[1], color='blue', linewidth=1)
axes[1].set_title('Trend Component', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Trend', fontsize=10)
axes[1].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[1].set_xlabel('')

# Seasonal Component (Weekly Pattern)
decomposition.seasonal.plot(ax=axes[2], color='green', linewidth=1)
axes[2].set_title('Seasonal Component (Weekly Pattern)', fontsize=12, fontweight='bold')
axes[2].set_ylabel('Seasonal', fontsize=10)
axes[2].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)
axes[2].set_xlabel('')

# Residual Component
decomposition.resid.plot(ax=axes[3], color='red', linewidth=1)
axes[3].axhline(y=0, color='grey', linestyle='--', linewidth=0.8, alpha=0.7)
axes[3].set_title('Residual Component', fontsize=12, fontweight='bold')
axes[3].set_ylabel('Residual', fontsize=10)
axes[3].set_xlabel('Date', fontsize=10)
axes[3].grid(True, color='lightgrey', linestyle='-', linewidth=0.5)

plt.tight_layout()
plt.show()

Trend:
    
Trip counts rise steadily from winter into summer, peak around July–August, and then gradually fall toward the end of the year.


Seasonality:
    
There’s a strong weekly pattern, with higher usage on weekdays and noticeable dips on weekends, which points to bikes being used mainly for everyday transportation rather than leisure.


Residuals:
    
Most of the leftover variation is fairly stable, but there are a few spikes and some increased noise toward the end of the year that aren’t fully explained by the trend or seasonality.

In [None]:
residual_var = decomposition.resid.var()
print(f"Model residual variance: {residual_var:.2f}")

## Task 3 - Multiple Seasonality Analysis

In [None]:
# Calculate average trips by day of the week
# add day_name element for plotting
weekday_avg = daily_trips.groupby(daily_trips.index.day_name()).mean()
day_order =['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_avg = weekday_avg.reindex(day_order)
print("Average trips by day of week:")
print(weekday_avg)

In [None]:
weekday_avg = weekday_avg['count']

In [None]:
# Plot day of week seasonality
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))

# Color code: blue for weekdays, red for weekend
colors = ['#6baed6', '#6baed6', '#6baed6', '#6baed6', '#6baed6', '#ef5350', '#ef5350']

bars = ax.bar(
    range(len(weekday_avg)),
    weekday_avg.values,
    color=colors,
    edgecolor='black',
    linewidth=1.2
)

# Set x-axis labels
ax.set_xticks(range(len(weekday_avg)))
ax.set_xticklabels(weekday_avg.index, rotation=0)

ax.set_title("Day-of-Week Seasonality Pattern", fontsize=14, fontweight='bold')
ax.set_xlabel("Day of Week", fontsize=12)
ax.set_ylabel("Average Daily Trips", fontsize=12)

# Add value labels on top of bars
for i, (bar, value) in enumerate(zip(bars, weekday_avg.values)):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        value + 100,
        f"{int(value)}",
        ha='center',
        va='bottom',
        fontsize=10,
        fontweight='bold'
    )

ax.set_ylim(0, max(weekday_avg.values) * 1.1)
plt.tight_layout()
plt.show()

The plot shows a clear day-of-week pattern. Average trip counts are relatively stable during weekdays, while usage increases on Saturdays and slightly drops again on Sundays. This confirms the presence of weekly seasonality, with higher demand toward the weekend.

In [None]:
# Calculate monthly average
monthly_avg = daily_trips.groupby(daily_trips.index.month_name()).mean()
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]
monthly_avg = monthly_avg.reindex(month_order)
print("Average trips by month:")
print(monthly_avg)

In [6]:
monthly_avg = monthly_avg['count']

NameError: name 'monthly_avg' is not defined

In [7]:
# Plot month of year seasonality
fig, ax = plt.subplots(figsize=(8, 5))

# Color scheme for months: blue for winter, lighter blue for spring, green for summer
colors = [
    '#6baed6', '#6baed6', '#81d18f',   # Jan, Feb, Mar
    '#81d18f', '#81d18f', '#81d18f',   # Apr, May, Jun
    '#f4a460', '#f4a460', '#f4a460',   # Jul, Aug, Sep
    '#6baed6', '#6baed6', '#6baed6'    # Oct, Nov, Dec
]

bars = ax.bar(
    range(len(monthly_avg)),
    monthly_avg.values,
    color=colors,
    edgecolor='black',
    linewidth=1.2
)

# Set x-axis labels
ax.set_xticks(range(len(monthly_avg)))
ax.set_xticklabels(monthly_avg.index, rotation=45, ha='right')

ax.set_title("Month-of-Year Seasonality Pattern", fontsize=14, fontweight='bold')
ax.set_xlabel("Month", fontsize=12)
ax.set_ylabel("Average Daily Trips", fontsize=12)

# Add value labels on top of bars
for i, (bar, value) in enumerate(zip(bars, monthly_avg.values)):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        value + 100,
        f"{int(value)}",
        ha='center',
        va='bottom',
        fontsize=9,
        fontweight='bold'
    )

ax.set_ylim(0, max(monthly_avg.values) * 1.15)
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

There is a strong month-of-year seasonal pattern. Trip counts are very low in winter months, increase steadily through spring, peak during summer (around July and August), and then decline again in autumn and winter. This confirms a clear annual seasonality linked to weather and seasonal conditions.