In [25]:
import pandas as pd

In [26]:
# Read the three CSV inputs used by this notebook into DataFrames.
# Paths are relative, so the files must sit in the kernel's working directory.
df_fact, df_dim_date, df_external = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Fact_TrafficViolation.csv',
        'Dim_Date.csv',
        'External_Bicycle.csv',
    )
)

In [27]:
# Derive a plain calendar date (datetime.date objects, object dtype) from the
# dimension's 'Date' column. It is used further down as the join key against
# the per-day bicycle aggregate.
df_dim_date['ViolationDate'] = pd.to_datetime(df_dim_date['Date']).dt.date

# Attach ViolationDate to every fact row via DateKey.
# - how='left' keeps fact rows whose DateKey has no match in the dimension
#   (their ViolationDate is then missing).
# - validate='many_to_one' makes pandas raise MergeError if DateKey is
#   duplicated in the dimension, instead of silently duplicating fact rows
#   (which would inflate every violation total computed downstream).
df_fact_dated = pd.merge(
    df_fact,
    df_dim_date[['DateKey', 'ViolationDate']],
    on='DateKey',
    how='left',
    validate='many_to_one',
)

In [30]:
# Parse the trip start timestamp, then keep only its calendar date
# (datetime.date objects) as the per-day grouping key.
df_external['starttime_dt'] = pd.to_datetime(df_external['starttime'])
df_external['BicycleDate'] = df_external['starttime_dt'].dt.date

# 0/1 indicator columns: 1 where the row carries the given label, 0 otherwise
# (missing values compare unequal, so they become 0). Summing an indicator
# within a group therefore counts the matching rows.
df_external['Customer_Flag'] = df_external['usertype'].eq('Customer').astype(int)
df_external['Subscriber_Flag'] = df_external['usertype'].eq('Subscriber').astype(int)

df_external['Male_Flag'] = df_external['gender'].eq('Male').astype(int)
df_external['Female_Flag'] = df_external['gender'].eq('Female').astype(int)

# One row per BicycleDate: user-type counts, gender counts, and the summed
# trip duration for that day. reset_index() turns BicycleDate back into a
# regular column so it can be used as a merge key.
df_bicycle_agg = (
    df_external
    .groupby('BicycleDate')
    .agg(
        UserType_Customer_Count=pd.NamedAgg(column='Customer_Flag', aggfunc='sum'),
        UserType_Subscriber_Count=pd.NamedAgg(column='Subscriber_Flag', aggfunc='sum'),
        Gender_Male_Count=pd.NamedAgg(column='Male_Flag', aggfunc='sum'),
        Gender_Female_Count=pd.NamedAgg(column='Female_Flag', aggfunc='sum'),
        TotalTripDuration=pd.NamedAgg(column='tripduration', aggfunc='sum'),
    )
    .reset_index()
)

In [31]:
# Columns contributed by the bicycle aggregate. After the left join below they
# are NaN for any violation date that has no matching BicycleDate.
count_cols = [
    'UserType_Customer_Count',
    'UserType_Subscriber_Count',
    'Gender_Male_Count',
    'Gender_Female_Count',
    'TotalTripDuration',  # a summed duration rather than a count; listed with the counts
]

# Left join keeps every dated fact row and attaches that day's bicycle
# aggregates where ViolationDate equals BicycleDate. Both key columns are kept
# in the result because they have different names.
df_integrated_final = df_fact_dated.merge(
    df_bicycle_agg,
    how='left',
    left_on='ViolationDate',
    right_on='BicycleDate',
)

In [32]:
# Rows with no bicycle data for their date have NaN in the bicycle columns
# after the left join; record those as 0. Only the columns listed in
# count_cols are touched (BicycleDate itself stays missing on such rows).
df_integrated_final[count_cols] = df_integrated_final.loc[:, count_cols].fillna(value=0)

In [33]:
# Preview the first row of the integrated frame as a quick sanity check.
df_integrated_final.head(n=1)

Unnamed: 0,DateKey,WeatherKey,AddressKey,RedLightViolations,SpeedingViolations,TotalViolations,CreateTimeStamp,SourceSystemCode,SourceFolder,ViolationDate,BicycleDate,UserType_Customer_Count,UserType_Subscriber_Count,Gender_Male_Count,Gender_Female_Count,TotalTripDuration
0,795,795,241,15,0,15,2025-06-27 21:18:40.430000000,2,2017-09-30,2016-09-02,2016-09-02,7.0,12086.0,8987.0,3106.0,144573.516667
