In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.ar_model import AutoReg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.stattools import adfuller 
import warnings 
warnings.filterwarnings('ignore')


# Load dataset
dataset = pd.read_csv(r"B:\Project\ML_ForeCasting_AISPRAY\Data Set\Operational_Bus_data - Operational_Bus_data.csv")

# Parse the day-first 'Date' strings; values that do not match the format
# become NaT because of errors='coerce'.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d-%m-%Y', errors='coerce')
dataset = dataset.sort_values(by='Date')

# Aggregate the data by 'Date' and 'Bus Route No.'.
# Rows whose Date is NaT are left out here: groupby excludes missing keys by default.
agg_df = dataset.groupby(['Date', 'Bus Route No.']).agg({
    'Trips per Day': 'sum',
    'Bus Stops Covered': 'sum',
    'Frequency (mins)': 'mean',
    'Distance Travelled (km)': 'sum',
    'Time (mins)': 'sum',
    'Tickets Sold': 'sum',
    'Revenue Generated (INR)': 'sum',
    'From': 'first',
    'To': 'first',
    'Way': 'first',
    'Main Station': 'first'
}).reset_index()

# Set Date as index for easy slicing later
agg_df.set_index('Date', inplace=True)

# Daily calendar (1 Jan 2022 .. 14 Jan 2025) that every route is aligned to.
# ISO 8601 literals are used so the bounds cannot be read as month-first in one
# case and day-first in the other.
date_range = pd.date_range(start='2022-01-01', end='2025-01-14', freq='D')

# Initialize a dictionary to store forecast results for each bus route
forecasts = {}

# Significance level for the Augmented Dickey-Fuller test.
ADF_ALPHA = 0.05
# Number of trailing days of each route's series held out for evaluation.
TEST_DAYS = 30
# Routes with fewer usable training observations than this are skipped
# (floor chosen by the reviewer; tune as needed).
MIN_TRAIN_DAYS = 30
# Model used to produce the forecast: 'arima', 'sarima' or 'autoreg'.
# 'autoreg' is the model whose predictions were effectively used before.
MODEL_KIND = 'autoreg'


def adf_test(series, alpha=ADF_ALPHA):
    """Return `series` unchanged if it looks stationary, else its first difference.

    Runs the Augmented Dickey-Fuller test; when the p-value is not below
    `alpha` the unit-root null hypothesis cannot be rejected, so the series is
    differenced once (the leading NaN produced by diff() is dropped).
    The very same object is returned when no differencing is applied.
    """
    p_value = adfuller(series)[1]
    if p_value < alpha:
        return series
    print('Data is not stationary (Fail to reject H0)')
    return series.diff().dropna()


def _forecast_tickets(train, steps, kind=MODEL_KIND):
    """Fit the selected model on `train` and return `steps` forecasts as an array.

    The returned values are always on the scale of `train` itself, so they can
    be compared directly with the held-out observations.

    Raises:
        ValueError: if `kind` is not one of 'arima', 'sarima', 'autoreg'.
    """
    if kind == 'arima':
        # d=1 makes ARIMA difference internally, so it is fitted on the raw series.
        fitted = ARIMA(train, order=(4, 1, 1)).fit()
        return np.asarray(fitted.forecast(steps=steps))

    if kind == 'sarima':
        # Seasonal period 7 = weekly seasonality on daily data.
        fitted = SARIMAX(train, order=(4, 1, 1), seasonal_order=(1, 1, 1, 7)).fit(disp=False)
        return np.asarray(fitted.forecast(steps=steps))

    if kind == 'autoreg':
        model_input = adf_test(train)
        fitted = AutoReg(model_input, lags=4).fit()
        predicted = np.asarray(fitted.forecast(steps=steps))
        if model_input is not train:
            # The model was fitted on day-over-day changes: accumulate the
            # forecast changes onto the last observed value to undo the diff.
            predicted = train.iloc[-1] + np.cumsum(predicted)
        return predicted

    raise ValueError(f"Unknown model kind: {kind!r}")


# Loop through each unique Bus Route No.
for route in agg_df['Bus Route No.'].unique():
    print(f"Forecasting for Bus Route: {route}")

    # Filter the data for this specific route and align it with the full date range
    route_data = agg_df[agg_df['Bus Route No.'] == route].reindex(date_range)

    # Interpolate missing numerical data and forward-fill categorical data
    numerical = route_data.select_dtypes(exclude='object').columns
    categorical = route_data.select_dtypes(include='object').columns
    route_data[numerical] = route_data[numerical].interpolate(method='linear')
    route_data[categorical] = route_data[categorical].ffill()

    # Forward-fill anything still missing (a no-op when nothing is missing).
    # Gaps before the route's first observation intentionally stay NaN.
    route_data = route_data.ffill()

    # Train-Test Split (last TEST_DAYS days for testing)
    train = route_data['Tickets Sold'].iloc[:-TEST_DAYS]
    test = route_data['Tickets Sold'].iloc[-TEST_DAYS:]

    # Treat infinities as missing, carry the previous value forward, then drop
    # whatever is still missing (the period before the route's first record).
    train = train.replace([np.inf, -np.inf], np.nan).ffill().dropna()

    # A route with no (or almost no) data inside the calendar cannot be
    # modelled or scored; skip it instead of failing inside the model fit.
    if len(train) < MIN_TRAIN_DAYS or test.isnull().any():
        print(f"Skipping Bus Route {route}: not enough observations to fit and evaluate a model")
        continue

    # Predict the held-out days, indexed like the test set for storage/plotting
    predictions = pd.Series(_forecast_tickets(train, steps=len(test)), index=test.index)

    # Evaluate the model using MAPE and RMSE.
    # MAPE is undefined where the actual value is 0, so those days are excluded.
    actual = np.asarray(test, dtype=float)
    predicted = np.asarray(predictions, dtype=float)
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan
    rmse = np.sqrt(mean_squared_error(test, predictions))

    print(f"MAPE for Bus Route {route}: {mape}")
    print(f"RMSE for Bus Route {route}: {rmse}")

    # Store the forecasts for later comparison
    forecasts[route] = {
        'actual': test,
        'predicted': predictions,
        'mape': mape,
        'rmse': rmse
    }

    # Plot the actual vs predicted values
    plt.figure(figsize=(10, 5))
    plt.plot(test.index, test, label='Actual')
    plt.plot(test.index, predictions, label='Predicted')
    plt.title(f'Bus Route {route} - Actual vs Predicted')
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the operational bus records from disk
dataset = pd.read_csv(
    r"B:\Project\ML_ForeCasting_AISPRAY\Data Set\Operational_Bus_data - Operational_Bus_data.csv"
)

# Map the hyphenated spellings of 'Way' onto the spaced spellings
# (the already-spaced spellings map to themselves)
way_spellings = {
    'One-way': 'One Way',
    'Round-trip': 'Round Trip',
    'One Way': 'One Way',
    'Round Trip': 'Round Trip',
}
dataset['Way'] = dataset['Way'].replace(way_spellings)

# Parse 'Date' as day-month-year; values that do not match become NaT
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d-%m-%Y', errors='coerce')

# Total tickets sold per (Date, Bus Route No., Way, Main Station)
group_keys = ['Date', 'Bus Route No.', 'Way', 'Main Station']
agg_df = dataset.groupby(group_keys)[['Tickets Sold']].sum().reset_index()

# Linearly interpolate any gaps left in the ticket totals
agg_df['Tickets Sold'] = agg_df['Tickets Sold'].interpolate(method='linear')

# Replace the two categorical columns by integer codes
agg_df['Way'] = pd.Categorical(agg_df['Way']).codes
station_encoder = LabelEncoder()
agg_df['Main Station'] = station_encoder.fit_transform(agg_df['Main Station'])

# Every distinct (route, way, main station) combination gets its own evaluation
routes_ways = agg_df[['Bus Route No.', 'Way', 'Main Station']].drop_duplicates()

# Metrics (MAPE, MAE, MSE) keyed by (route, way, main station)
ar_results = dict()

# Candidate regressors. Exactly one is trained per group; previously both were
# fitted and the Decision Tree was discarded before it was ever used.
MODEL_FACTORIES = {
    'decision_tree': lambda: DecisionTreeRegressor(random_state=42),
    'linear_regression': LinearRegression,
}
# 'linear_regression' is the model whose predictions were effectively scored before.
SELECTED_MODEL = 'linear_regression'
# train_test_split needs at least one training and one test row.
MIN_GROUP_ROWS = 2

# Loop through each bus route, way, and main station combination
for _, row in routes_ways.iterrows():
    bus_route = row['Bus Route No.']
    way = row['Way']
    main_station = row['Main Station']

    # Filter the data for the current bus route, way, and main station
    route_data = agg_df[(agg_df['Bus Route No.'] == bus_route) & (agg_df['Way'] == way) & (agg_df['Main Station'] == main_station)].dropna()

    # Groups too small to split would make train_test_split raise; skip them.
    if len(route_data) < MIN_GROUP_ROWS:
        print(f'Skipping Bus Route No. {bus_route} with Way {way} and Main Station {main_station}: not enough rows to split')
        continue

    # Features are every column except 'Tickets Sold', 'Date' and 'Bus Route No.',
    # i.e. the encoded 'Way' and 'Main Station'.
    # NOTE(review): both are constant inside this filtered group, so the model
    # has no varying input to learn from — consider adding date-derived features.
    X = route_data.drop(columns=['Tickets Sold', 'Date', 'Bus Route No.'])
    y = route_data['Tickets Sold']

    # Split the data into train and test (random 80/20 split, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the selected model
    model = MODEL_FACTORIES[SELECTED_MODEL]()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate MAPE, MAE, MSE
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mape = np.mean(np.abs((np.array(y_test) - np.array(y_pred)) / (np.array(y_test) + 1e-10))) * 100  # Avoid division by zero

    # Store results in the dictionary
    ar_results[(bus_route, way, main_station)] = {'mape': mape, 'mae': mae, 'mse': mse}
    print(f'Bus Route No. {bus_route} with Way {way} and Main Station {main_station}, MAPE: {mape:.2f}%')

# Report the stored metrics for every evaluated combination
print("\nResults (MAPE, MAE, MSE) for each Bus Route, Way, and Main Station:")
separator = "-" * 50
for combination, metrics in ar_results.items():
    bus_route, way, main_station = combination
    print(f"Bus Route No. {bus_route} with Way {way} and Main Station {main_station}:")
    print(f"MAPE: {metrics['mape']:.2f}%")
    print(f"MAE: {metrics['mae']:.2f}")
    print(f"MSE: {metrics['mse']:.2f}")
    print(separator)

# Example plot: the observed ticket sales of the first combination in ar_results
# (only the actual series is drawn; no predictions are plotted here)
bus_route, way, main_station = list(ar_results)[0]
same_route = agg_df['Bus Route No.'] == bus_route
same_way = agg_df['Way'] == way
same_station = agg_df['Main Station'] == main_station
route_data = agg_df[same_route & same_way & same_station]

plt.figure(figsize=(10, 6))
plt.plot(route_data['Date'], route_data['Tickets Sold'], label="Actual", color='blue')
plt.title(f"Ticket Sales Forecast for Bus Route {bus_route} - {way} - Main Station {main_station}")
plt.xlabel('Date')
plt.ylabel('Tickets Sold')
plt.legend()
plt.show()