In [1]:
import pandas as pd

In [2]:
# Load the three inputs in one pass: the traffic-violation fact table, the date
# dimension, and the external bicycle data. Files are read in the listed order.
df_fact, df_dim_date, df_external = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Fact_TrafficViolation.csv',
        'Dim_Date.csv',
        'External_Bicycle.csv',
    )
)

In [3]:
# Derive a plain calendar date from the dimension's 'Date' column; it is the
# key later matched against the bicycle data's per-day aggregates.
df_dim_date['ViolationDate'] = pd.to_datetime(df_dim_date['Date']).dt.date

# Attach ViolationDate to every fact row via DateKey.
# - how='left' keeps fact rows whose DateKey is absent from the dimension
#   (their ViolationDate is left missing).
# - validate='many_to_one' makes pandas raise MergeError if DateKey is repeated
#   in the dimension, instead of silently duplicating fact rows.
df_fact_dated = pd.merge(
    df_fact,
    df_dim_date[['DateKey', 'ViolationDate']],
    on='DateKey',
    how='left',
    validate='many_to_one',
)

In [4]:
# Parse the trip start timestamp, then keep only its calendar date as the
# per-day grouping key.
df_external['starttime_dt'] = pd.to_datetime(df_external['starttime'])
df_external['BicycleDate'] = df_external['starttime_dt'].dt.date

# 0/1 indicator columns: 1 when the source column equals the given label,
# 0 otherwise (including missing values). Summing them per day yields counts.
for flag_name, source_col, label in (
    ('Customer_Flag', 'usertype', 'Customer'),
    ('Subscriber_Flag', 'usertype', 'Subscriber'),
    ('Male_Flag', 'gender', 'Male'),
    ('Female_Flag', 'gender', 'Female'),
):
    df_external[flag_name] = df_external[source_col].eq(label).astype(int)

# One row per BicycleDate: user-type counts, gender counts and the summed
# trip duration for that day.
df_bicycle_agg = (
    df_external
    .groupby('BicycleDate')
    .agg(
        UserType_Customer_Count=pd.NamedAgg(column='Customer_Flag', aggfunc='sum'),
        UserType_Subscriber_Count=pd.NamedAgg(column='Subscriber_Flag', aggfunc='sum'),
        Gender_Male_Count=pd.NamedAgg(column='Male_Flag', aggfunc='sum'),
        Gender_Female_Count=pd.NamedAgg(column='Female_Flag', aggfunc='sum'),
        TotalTripDuration=pd.NamedAgg(column='tripduration', aggfunc='sum'),
    )
    .reset_index()
)

In [5]:
# Left-join the per-day bicycle aggregates onto the dated fact rows, matching
# the violation's calendar date to the bicycle trip date. Fact rows without a
# matching bicycle date are kept and get NaN in the bicycle columns.
df_integrated_final = df_fact_dated.merge(
    df_bicycle_agg,
    how='left',
    left_on='ViolationDate',
    right_on='BicycleDate',
)

# Names of the columns contributed by the bicycle aggregation.
count_cols = [
    'UserType_Customer_Count', 'UserType_Subscriber_Count',
    'Gender_Male_Count', 'Gender_Female_Count',
    'TotalTripDuration',
]

In [6]:
# Select and print the merged rows that have NaN in at least one of the bicycle
# columns listed in count_cols (presumably violation dates with no matching
# bicycle data after the left merge).
# NOTE: this cell only inspects those rows; the NaN values are NOT replaced
# with 0 here, so they remain in df_integrated_final.
na_records = df_integrated_final[df_integrated_final[count_cols].isna().any(axis=1)]
print(na_records)

        DateKey  WeatherKey  AddressKey  RedLightViolations  \
572         116         116         122                  31   
2956        116         116         297                   0   
3192        116         116         160                   1   
3633        116         116         333                   0   
5956        116         116         263                   1   
...         ...         ...         ...                 ...   
322300      116         116         352                   0   
323069      116         116         150                  10   
324497      116         116          96                   1   
324671      116         116         202                  12   
324690      116         116         281                   4   

        SpeedingViolations  TotalViolations                CreateTimeStamp  \
572                      0               31  2025-06-27 21:18:40.430000000   
2956                     4                4  2025-06-27 21:18:40.430000000   
3192     

In [7]:
# Keep the original fact-table columns followed by the bicycle aggregate
# columns; helper columns introduced by the merges are left out.
columns_to_keep = [*df_fact.columns, *count_cols]
df_final_output = df_integrated_final.loc[:, columns_to_keep].copy()

# Write the integrated table to CSV without the DataFrame index.
output_filename = 'Integrated_Weather_Bicycle.csv'
df_final_output.to_csv(output_filename, index=False)

In [23]:
# Re-load the exported file and show, per column, the non-null count of rows
# whose SourceFolder value parses to a date in 2016.
df_integrated = pd.read_csv('Integrated_Weather_Bicycle.csv')
source_year = pd.to_datetime(df_integrated['SourceFolder']).dt.year
df_integrated.loc[source_year == 2016].count()

DateKey                      0
WeatherKey                   0
AddressKey                   0
RedLightViolations           0
SpeedingViolations           0
TotalViolations              0
CreateTimeStamp              0
SourceSystemCode             0
SourceFolder                 0
UserType_Customer_Count      0
UserType_Subscriber_Count    0
Gender_Male_Count            0
Gender_Female_Count          0
TotalTripDuration            0
dtype: int64