# Data Preparation for Modeling

This notebook prepares the enriched passenger data for machine learning modeling by:
1. Dropping redundant fields
2. Converting the date to numerical and cyclical features
3. Creating a weekend boolean field
4. Converting event type to a boolean flag (and setting expected attendance to 0 when there is no event)
5. Filling null values in the toilet fields with 'unknown'
6. Filling null values in the service and parking fields with the mean or median, depending on skewness


In [None]:

…n,

## Load the Data


In [None]:
# Read the enriched 2023 passengers CSV into the untouched reference frame.
# low_memory=False turns off chunked parsing so dtypes are inferred from the whole file.
df_original = pd.read_csv(
    '../data/processed/passengers_enriched_2023.csv',
    low_memory=False,
)

# Structural overview: dimensions, column names, and inferred dtypes
print(f"Original dataset shape: {df_original.shape}")
print(f"\nColumn names: {list(df_original.columns)}")
print("\nData types:")
print(df_original.dtypes)


In [None]:
# Preview the top five rows of the raw data (rich display as the cell output)
df_original.head(n=5)


## Create a Copy for Processing


In [None]:
# Work on a deep copy so df_original stays untouched as a reference
df = df_original.copy(deep=True)
print(f"Working dataset shape: {df.shape}")


## 1. Drop Redundant Fields


In [None]:
# Columns removed before modeling (also reported in the final summary cell)
fields_to_drop = [
    'airportinterchange',
    'hubnaptancode',
    'station_name',
    'mode',
    'event_name',
]

print(f"Fields to drop: {fields_to_drop}")
print(f"Shape before dropping: {df.shape}")

# Rebind df to a frame without those columns; raises KeyError if any is absent
df = df.drop(fields_to_drop, axis='columns')

print(f"Shape after dropping: {df.shape}")
print(f"Remaining columns: {list(df.columns)}")


## 2. Convert Date to Numerical and Cyclical Features


In [None]:
# Parse the date column so the .dt accessor is available
df['date'] = pd.to_datetime(df['date'])

# Plain calendar components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_year'] = df['date'].dt.dayofyear

# Cyclical encoding: map each value onto a circle so that the end of the
# cycle sits next to its start (December next to January, Dec 31 next to Jan 1).

# Month cyclical features (12 months)
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Day-of-year cyclical features. Use each row's actual year length (366 in
# leap years, 365 otherwise) instead of a fixed 365, so day 366 of a leap year
# closes the circle rather than overshooting it. For non-leap years such as
# 2023 this gives exactly the same values as dividing by 365.
days_in_year = np.where(df['date'].dt.is_leap_year, 366, 365)
day_of_year_angle = 2 * np.pi * df['day_of_year'] / days_in_year
df['day_of_year_sin'] = np.sin(day_of_year_angle)
df['day_of_year_cos'] = np.cos(day_of_year_angle)

print("Date features created:")
print(df[['date', 'year', 'month', 'day', 'day_of_year', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos']].head())


## 3. Create Weekend Boolean Field


In [None]:
# Flag Saturdays and Sundays.
# pandas numbers weekdays Monday=0 ... Sunday=6, so values >= 5 are the weekend.
df['is_weekend'] = df['date'].dt.dayofweek >= 5

print("Weekend distribution:")
print(df['is_weekend'].value_counts())
print(f"\nPercentage of weekend days: {df['is_weekend'].mean()*100:.2f}%")

# Cross-check the new flag against the dataset's own dayofweek column
# NOTE(review): assumes a 'dayofweek' column exists in the loaded data — confirm
print("\nVerification with dayofweek column:")
print(pd.crosstab(index=df['dayofweek'], columns=df['is_weekend']))


## 4. Convert Event Type to Boolean


In [None]:
# Inspect event_type before deriving the flag: category counts (nulls included),
# the number of missing entries, and the overall row count
print("Current event_type values:")
print(df['event_type'].value_counts(dropna=False))
print(f"\nNull values in event_type: {df['event_type'].isna().sum()}")
print(f"Total rows: {df.shape[0]}")


In [None]:
# Rows with no event are exactly those where event_type is null
no_event = df['event_type'].isna()

# is_event: 1 when an event type is recorded, 0 otherwise
df['is_event'] = (~no_event).astype(int)

print("Event distribution:")
print(df['is_event'].value_counts())
print(f"\nPercentage of days with events: {df['is_event'].mean()*100:.2f}%")

# Without an event there is nobody to expect, so attendance is set to 0
df.loc[no_event, 'expected_attendance'] = 0

print(f"\nExpected attendance after filling zeros: {df['expected_attendance'].describe()}")


## 5. Fill Null Toilet Fields


In [None]:
# Show the value distribution (nulls included) of each toilet-related column
print("Current toilet field values:")
for col in ('toilet_isaccessible', 'toilet_isfeecharged', 'toilet_type'):
    print(f"\n{col}:")
    print(df[col].value_counts(dropna=False))


In [None]:
# Toilet columns whose missing entries become the explicit category 'unknown'
toilet_columns = ['toilet_isaccessible', 'toilet_isfeecharged', 'toilet_type']

# Replace nulls in every toilet column in one step
df = df.assign(**{name: df[name].fillna('unknown') for name in toilet_columns})

# Report the resulting distribution of each column
for col in toilet_columns:
    print(f"\n{col} after filling:")
    print(df[col].value_counts())


## 6. Analyze and Fill Service/Parking Fields


In [None]:
# Numeric columns whose nulls will be imputed further down
numeric_fields = [
    'service_operated_allweek_pct',
    'service_operated_weekday_pct',
    'service_operated_weekend_pct',
    'kilometres_operated',
    'bluebadgecarparkspaces',
]

print("Analysis of numeric fields with null values:")
for field in numeric_fields:
    # Ignore fields that are not present in the working frame
    if field not in df.columns:
        continue

    values = df[field]
    n_missing = values.isnull().sum()
    pct_missing = (n_missing / len(df)) * 100

    print(f"\n{field}:")
    print(f"  Null values: {n_missing} ({pct_missing:.2f}%)")

    # Statistics are only meaningful when at least one value is present
    if not values.notna().any():
        continue

    skewness = values.skew()

    print(f"  Mean: {values.mean():.2f}")
    print(f"  Median: {values.median():.2f}")
    print(f"  Std: {values.std():.2f}")
    print(f"  Skewness: {skewness:.2f}")

    # |skew| > 1 is treated as highly skewed: prefer the median there, the mean otherwise
    if abs(skewness) > 1:
        print("  Recommendation: Use MEDIAN (data is skewed)")
    else:
        print("  Recommendation: Use MEAN (data is relatively normal)")


In [None]:
# Visualize distributions to help decide between mean and median.
# One histogram per numeric field that exists in df, with the mean and median
# marked as dashed vertical lines.
plot_fields = [name for name in numeric_fields if name in df.columns]

# Size the grid from the number of fields so none is silently dropped;
# five fields still give the original 2x3 layout at 15x10 inches.
n_cols = 3
n_rows = max(1, -(-len(plot_fields) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, field in zip(axes, plot_fields):
    # Plot histogram
    df[field].hist(bins=50, ax=ax, alpha=0.7)
    ax.set_title(f'{field} Distribution')
    ax.set_xlabel(field)
    ax.set_ylabel('Frequency')

    # Add mean and median lines; skipped when the column has no data,
    # because the statistics would be NaN and the legend would be meaningless
    if df[field].notna().any():
        mean_val = df[field].mean()
        median_val = df[field].median()
        ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
        ax.axvline(median_val, color='green', linestyle='--', label=f'Median: {median_val:.2f}')
        ax.legend()

# Hide grid cells that have no field, so no empty frame is drawn
for ax in axes[len(plot_fields):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Fill null values based on analysis
# Using median for highly skewed data (|skew| > 1), mean for relatively normal data.
#
# The statistics are taken from the untouched df_original column when it is
# available, so re-running this cell yields the same strategy and fill value
# instead of recomputing them on already-imputed data.
# NOTE(review): this assumes df still has the same rows as df_original (true
# for the cells above, which only add/drop columns) — revisit if rows are
# ever filtered before this step.
# NOTE(review): the statistics come from the full dataset; if a train/test
# split follows, consider deriving them from the training portion only.

fill_strategies = {}

for field in numeric_fields:
    if field not in df.columns:
        continue

    reference = df_original[field] if field in df_original.columns else df[field]
    skewness = reference.skew()

    if abs(skewness) > 1:  # Highly skewed - use median
        fill_value = reference.median()
        strategy = 'median'
    else:  # Relatively normal (or skew undefined) - use mean
        fill_value = reference.mean()
        strategy = 'mean'

    null_count_before = df[field].isnull().sum()

    # A column with no usable values has no mean/median; filling with NaN
    # would be a no-op that is reported as a fill, so report it and move on.
    if pd.isna(fill_value):
        print(f"{field}: no non-null values to derive a {strategy} from; "
              f"{null_count_before} null values left unfilled")
        continue

    fill_strategies[field] = {'value': fill_value, 'strategy': strategy}

    # Fill null values
    df[field] = df[field].fillna(fill_value)
    null_count_after = df[field].isnull().sum()

    print(f"{field}: Filled {null_count_before} null values with {strategy} = {fill_value:.2f}")
    print(f"  Null values after filling: {null_count_after}")

print("\nFill strategies summary:")
for field, info in fill_strategies.items():
    print(f"{field}: {info['strategy']} = {info['value']:.2f}")


## Summary of Data Preparation


In [None]:
# Recap of every transformation applied above, printed as one report
summary_lines = [
    "=== DATA PREPARATION SUMMARY ===",
    f"Original dataset shape: {df_original.shape}",
    f"Processed dataset shape: {df.shape}",
    f"\n1. Dropped fields: {fields_to_drop}",
    "\n2. Date features created:",
    "   - year, month, day, day_of_year",
    "   - month_sin, month_cos (cyclical)",
    "   - day_of_year_sin, day_of_year_cos (cyclical)",
    "\n3. Boolean features created:",
    f"   - is_weekend: {df['is_weekend'].sum()} weekend records",
    f"   - is_event: {df['is_event'].sum()} event records",
    "\n4. Toilet fields filled with 'unknown' for null values",
    "\n5. Numeric fields filled with appropriate statistics:",
]
print("\n".join(summary_lines))

# One line per imputed numeric field: which statistic was used and its value
for name, details in fill_strategies.items():
    print(f"   - {name}: {details['strategy']} = {details['value']:.2f}")

# Final check: list any columns that still contain nulls
print("\nFinal null values check:")
null_counts = df.isnull().sum()
remaining_nulls = null_counts[null_counts > 0]
if remaining_nulls.empty:
    print("✅ No null values remaining!")
else:
    print("⚠️ Remaining null values:")
    print(remaining_nulls)


In [None]:
# Display final dataset info
print("Final dataset info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print(f"\nFinal columns: {df.columns.tolist()}")


In [None]:
# Preview the top five rows of the prepared frame (rich display as the cell output)
print("First few rows of processed data:")
df.head(n=5)


## Save Processed Data (Optional)


In [None]:
# Optional export of the prepared data.
# Set SAVE_PROCESSED to True to write the CSV; it is off by default, so
# Restart & Run All does not create or overwrite any file.
SAVE_PROCESSED = False

if SAVE_PROCESSED:
    df.to_csv('../data/processed/passengers_enriched_2023_prepared.csv', index=False)
    print("Processed data saved to '../data/processed/passengers_enriched_2023_prepared.csv'")
