# In [13]:
import pandas as pd
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Step 1: Load the dataset
# NOTE(review): file_path is not the file that is actually read below; it is
# kept unchanged in case later cells rely on it.
file_path = '/mnt/data/Daily_Public_Transport_Passenger_Journeys_by_Service_Type_20241128.csv'
# Raw string literal: in a normal string the "\U" of "C:\Users" starts a
# \UXXXXXXXX unicode escape, which makes the whole cell fail to parse with
# "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes".
data = pd.read_csv(r"C:\Users\DELL\Downloads\Cleaned_Transport_Data.csv")

# Step 2: Data Cleaning
# Parse "Date" as day/month/year; values that do not match become NaT
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y', errors='coerce')
# Discard every row whose date could not be parsed
data = data.dropna(subset=['Date'])

# Passenger-count columns: strip thousands separators from text columns,
# then store every column as float
numeric_columns = ['Local Route', 'Light Rail', 'Peak Service', 'Rapid Route', 'School', 'Other']
for column in numeric_columns:
    values = data[column]
    if values.dtype == 'object':
        values = values.str.replace(',', '')
    data[column] = values.astype(float)

# Fill the gaps in the "Other" column with that column's mean
imputer = SimpleImputer(strategy='mean')
other_filled = imputer.fit_transform(data[['Other']])
data['Other'] = other_filled

# Persist the cleaned table (without the row index)
cleaned_file_path = '/mnt/data/Cleaned_Transport_Data.csv'
data.to_csv(cleaned_file_path, index=False)

# Step 3: Key Insights
# Overall passenger count per service type across the whole table
total_passengers_by_service = data[numeric_columns].sum()

# Per-month totals: tag each row with its calendar month, then aggregate
data['Month'] = data['Date'].dt.to_period('M')
monthly_trends = data.groupby('Month')[numeric_columns].sum()

# Rows with the highest and the lowest combined ridership over all services
daily_totals = data[numeric_columns].sum(axis=1)
busiest_label = daily_totals.idxmax()
quietest_label = daily_totals.idxmin()
peak_day = data.loc[busiest_label]
least_day = data.loc[quietest_label]

# Visualization: bar chart of the total passenger count per service type
service_names = total_passengers_by_service.index
passenger_counts = total_passengers_by_service.values

plt.figure(figsize=(12, 6))
sns.barplot(x=service_names, y=passenger_counts)
plt.title("Total Passengers by Service Type")
plt.ylabel("Passenger Count")
plt.xticks(rotation=45)  # slant the service names on the x axis
plt.tight_layout()
plt.show()

# Step 4: Forecasting
# Fit an additive-seasonal Exponential Smoothing model per service type and
# forecast the next `forecast_period` days.
forecast_results = {}
forecast_period = 7  # number of daily steps to forecast
seasonal_periods = 365  # length of one seasonal cycle, in days
# statsmodels cannot initialise the seasonal component from fewer than two
# full seasonal cycles, so anything shorter is skipped instead of crashing.
min_observations = 2 * seasonal_periods

for service in numeric_columns:
    # Build a gap-free daily series: sum per calendar day, fill missing days with 0
    ts_data = data[['Date', service]].dropna().set_index('Date').resample('D').sum()
    ts_data = ts_data.asfreq('D').fillna(0)

    if len(ts_data) < min_observations:
        print(f"Not enough data for seasonal forecasting for {service}")
        continue

    model = ExponentialSmoothing(
        ts_data[service], seasonal='add', seasonal_periods=seasonal_periods
    )
    model_fit = model.fit(optimized=True)
    forecast_results[service] = model_fit.forecast(forecast_period)

# Combine forecasts into a single DataFrame. Each forecast is a Series that
# already carries its own date index, so the constructor aligns them; no
# index is copied from a loop variable (which does not exist when every
# service was skipped, and may not match when services end on different days).
forecast_df = pd.DataFrame(forecast_results)

# Save forecast results to a CSV file
forecast_file_path = '/mnt/data/Forecast_Transport_Data.csv'
forecast_df.to_csv(forecast_file_path)

# Output file paths for the cleaned and forecasted data
print("Cleaned Data File Path:", cleaned_file_path)
print("Forecast Data File Path:", forecast_file_path)

# Display first few rows of forecasted data
print(forecast_df.head())


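# Output recorded from the notebook run of this cell (the "\U" in a non-raw
# Windows path string is read as the start of a \UXXXXXXXX escape):
# SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1727928693.py, line 9)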