## Importing Libraries

In [751]:
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import pmdarima as pm
import tensorflow as tf
import plotly.express as px
import matplotlib.pyplot as plt
from keras.models import Sequential
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Conv1D, InputLayer
warnings.filterwarnings("ignore")

## Metric Calculation

In [752]:
def smape(actual, forecast):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 * mean(|forecast - actual| / ((|actual| + |forecast|) / 2))``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    forecast : array-like
        Predicted values; must hold as many elements as ``actual``.

    Returns
    -------
    numpy.float64
        The SMAPE score; 0 means a perfect forecast.
    """
    # Flatten both inputs so that an (n, 1) prediction array is compared
    # element-wise with an (n,) target instead of broadcasting to (n, n).
    actual = np.asarray(actual, dtype=float).ravel()
    forecast = np.asarray(forecast, dtype=float).ravel()
    numerator = np.abs(forecast - actual)
    denominator = (np.abs(actual) + np.abs(forecast)) / 2
    # A zero denominator means both values are zero, i.e. a perfect
    # prediction: count it as zero error instead of producing NaN from 0/0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(ratio)

In [753]:
def printMetrics(model, feature, mse, mad, smape):
    """Print a metrics report for one model/feature pair and persist it.

    The report lists the mean squared error, the mean absolute deviation, the
    root mean squared error (derived from ``mse``) and the symmetric mean
    absolute percentage value, then forwards the same numbers to
    ``saveToFile``.

    Note: the ``smape`` parameter shadows the module-level ``smape`` function
    inside this body; here it is the already-computed score.
    """
    report = (
        f"\nMetrics for {feature} using {model}",
        f"\tMean Squared Error: {mse}",
        f"\tMean Absolute Deviation: {mad}",
        f"\tRoot Mean Squared Error: {np.sqrt(mse)}",
        f"\tSymmetric Mean Absolute Percentage: {smape}",
    )
    for line in report:
        print(line)
    saveToFile(model, feature, mse, mad, smape)

In [754]:
def saveToFile(model, feature, mse, mad, smape):
    """Append one metrics report to ``<model>.txt`` in the working directory.

    The file is opened in append mode, so repeated calls accumulate reports.
    The written text mirrors what ``printMetrics`` prints: MSE, mean absolute
    deviation, RMSE (square root of ``mse``) and the SMAPE value.
    """
    with open(f"{model}.txt", 'a') as file:
        file.writelines((
            f"\nMetrics for {feature} using {model}\n",
            f"\tMean Squared Error: {mse}\n",
            f"\tMean Absolute Deviation: {mad}\n",
            f"\tRoot Mean Squared Error: {np.sqrt(mse)}\n",
            f"\tSymmetric Mean Absolute Percentage: {smape}\n",
        ))

## Loading the data

In [755]:
def loadData(directory="datasets"):
    """Load every CSV file found in ``directory`` into a list of DataFrames.

    Each file is read, its ``index`` column is dropped, a ``Code`` column is
    removed when present and a ``Country`` column is renamed to ``Entity``.
    The file named ``Mental health Depression disorder Data.csv`` is
    additionally truncated to the rows labelled 0..6467 and its non-label
    columns are cast to numeric types.

    Parameters
    ----------
    directory : str, optional
        Folder to scan for ``.csv`` files. Defaults to ``"datasets"``.

    Returns
    -------
    tuple[list[pandas.DataFrame], list[str]]
        The loaded frames and the matching file paths, both in sorted
        file-name order.
    """
    # os.listdir returns entries in arbitrary order; sort so that code which
    # pairs the frames positionally with other lists behaves the same on
    # every platform and run.
    csvFiles = [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if filename.endswith('.csv')
    ]
    data = []
    for file in csvFiles:
        frame = pd.read_csv(file, low_memory=False).drop(['index'], axis=1)
        # Compare the bare file name: a hard-coded 'datasets\...' path only
        # matches with the Windows separator and contains an invalid escape.
        if os.path.basename(file) == 'Mental health Depression disorder Data.csv':
            # Only the first 6468 rows are used (later rows presumably belong
            # to other tables in the same file - TODO confirm). Copy so the
            # column assignments below do not write into a slice.
            frame = frame.loc[:6467].copy()
            for column in frame.columns:
                if column != 'Entity' and column != 'Code':
                    frame[column] = frame[column].astype(float)
            frame['Year'] = frame['Year'].astype(int)
        if 'Code' in frame.columns:
            frame = frame.drop(columns='Code')
        if 'Country' in frame.columns:
            frame = frame.rename(columns={'Country': 'Entity'})
        data.append(frame)
    return data, csvFiles

## Data PreProcessing

In [756]:
def globalAverage(data, frame, disorder):
    """Add the per-year mean of ``disorder`` to ``frame`` and return it.

    For every distinct value of ``data['Year']`` (in order of first
    appearance) the mean of the ``disorder`` column over the matching rows is
    computed. The resulting list is stored in ``frame[disorder]``; ``frame``
    is modified in place and also returned.
    """
    yearColumn = data['Year']
    frame[disorder] = [
        data.loc[yearColumn == year, disorder].mean()
        for year in yearColumn.unique()
    ]
    return frame

In [757]:
def yearlyAverage(data):
    """Collapse each DataFrame in ``data`` to one row per distinct year.

    For every input frame a new frame is built whose ``Year`` column holds the
    distinct years (cast to int) and whose remaining columns hold the per-year
    mean of every input column from the third one onwards, as computed by
    ``globalAverage``. The first two input columns are skipped.

    Returns the list of aggregated frames, in input order.
    """
    def collapse(dataframe):
        # One aggregated frame per dataset, seeded with the distinct years.
        frame = pd.DataFrame({'Year': dataframe['Year'].unique()})
        frame['Year'] = frame['Year'].astype(int)
        for feature in dataframe.columns[2:]:
            frame = globalAverage(dataframe, frame, feature)
        return frame

    return [collapse(dataframe) for dataframe in data]

## Plotting the data

In [758]:
def plotting(df, sort_by, color_by, dpi=200):
    """Show a horizontal Plotly bar chart of ``sort_by`` per ``Entity``.

    ``df`` is sorted in place by ``sort_by`` before plotting, so the caller's
    frame is reordered. Bars are coloured by the ``color_by`` column.
    """
    df.sort_values(by=sort_by, inplace=True)
    # NOTE(review): this opens a matplotlib figure with the requested dpi, but
    # the chart below is drawn by Plotly and does not use it.
    plt.figure(dpi=dpi)
    px.bar(df, x=sort_by, y="Entity", orientation='h', color=color_by).show()

In [759]:
def plotHistogram(df, column, title, xlabel, ylabel, figsize=(10, 6), kde=True):
    """Draw a histogram of ``df[column]`` with the given title and axis labels.

    A new figure of size ``figsize`` is created; ``kde`` toggles the kernel
    density overlay drawn by seaborn.
    """
    plt.figure(figsize=figsize)
    # histplot draws on the current axes and returns them.
    axes = sns.histplot(df[column], kde=kde)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.show()

# Example usage
# plotHistogram(df, 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized',
#               'Distribution of Schizophrenia Disorder Prevalence', 'Prevalence (Age-standardized)', 'Frequency')


In [760]:

def plotLosses(trainLosses, valLosses, title='Training and Validation Losses', xlabel='Epoch', ylabel='Loss'):
    """Plot training and validation loss curves on the current axes and show them."""
    axes = plt.gca()
    for losses, label in ((trainLosses, 'Train Loss'), (valLosses, 'Val Loss')):
        axes.plot(losses, label=label)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)
    axes.legend()
    plt.show()

# Example usage
# plotLosses(trainLosses, valLosses)


In [761]:
# Function to plot predicted vs actual data
def plotPredictionsVSActual(xTest, yTest, outputs, interval=(0, 39)):
    """Plot one randomly chosen test sequence with its predicted and actual next value.

    Parameters
    ----------
    xTest : array
        Test inputs, indexed here as ``xTest[i, :, 0]`` (sample, time step,
        first feature).
    yTest : sequence
        Actual next value for each test sample.
    outputs : sequence
        Predicted next value for each test sample.
    interval : tuple[int, int], optional
        Inclusive ``(low, high)`` range of sample indices to draw from. The
        upper bound is clamped to the last available sample.

    Raises
    ------
    ValueError
        If no sample index lies inside the (clamped) interval.
    """
    # random.randint includes both ends, so cap the upper bound at the last
    # valid index; otherwise test sets smaller than the default range could
    # yield an out-of-range sample index.
    low, high = interval
    high = min(high, len(xTest) - 1)
    if low > high:
        raise ValueError(
            f"interval {interval} contains no valid sample index for {len(xTest)} test samples"
        )
    i = random.randint(low, high)

    # Extend the input sequence with the predicted and the actual outcome
    predicted = np.append(xTest[i, :, 0], outputs[i])
    actual = np.append(xTest[i, :, 0], yTest[i])

    # Time points for plotting
    x = np.linspace(0, len(predicted) - 1, len(predicted))

    # Plotting the series: the history is shared, only the last point differs
    plt.figure(figsize=(10, 6))
    plt.plot(x[:-1], actual[:-1], 'r-', label='Actual (History)')
    plt.plot(x[-1:], actual[-1:], 'ro', label='Actual (Latest)', markersize=10)
    plt.plot(x[:-1], predicted[:-1], 'b-', label='Predicted (History)')
    plt.plot(x[-1:], predicted[-1:], 'bo', label='Predicted (Latest)', markersize=10)

    # Draw in black the points where both series are strictly positive
    overlap = np.logical_and(predicted > 0, actual > 0)
    plt.plot(x[overlap], actual[overlap], 'k', label='Overlap')

    # Adding plot decorations
    plt.title('Depression Prediction and Actual Values for a Random Index')
    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

# Example usage (assuming xTest, yTest, and outputs are defined properly)
# plotPredictionsVSActual(xTest, yTest, outputs)


In [762]:
def visualizePattern(data, feature):
    """Show a line plot of ``data[feature]`` against ``data['Year']`` with point markers."""
    _, axes = plt.subplots(figsize=(10, 6))
    axes.plot(data['Year'], data[feature], marker='o')
    axes.set_xlabel("Year")
    axes.set_ylabel(feature)
    axes.set_title(f'Year vs {feature}')
    axes.grid(True)
    plt.show()


# for dataframe in data:
#     dataframe.info()
#     for feature in dataframe.columns[1:]:
#         visualizePattern(dataframe, feature)


In [763]:
# for dataframe in data:
#     autocorrelation_plot(dataframe)
#     plt.show()

## Forecasting

In [764]:
def movingAverage(feature, window, steps, split):
    """Score and apply a rolling-mean model on ``feature``.

    The first ``split`` observations form the training slice and the rest the
    holdout. The last ``len(holdout)`` rolling means of the training slice are
    scored against the holdout with every function in the module-level
    ``metrics`` mapping, and the scores are reported through ``printMetrics``.

    Returns the last ``steps`` rolling means of the full series as a NumPy
    array.
    """
    train, actual = feature[:split], feature[split:]
    smoothedTrain = train.rolling(window=window).mean()
    forecasted = smoothedTrain[-len(actual):]
    # Scores come out in the insertion order of `metrics`.
    scores = [scorer(actual, forecasted) for scorer in metrics.values()]
    printMetrics("Moving Average", feature.name, scores[0], scores[1], scores[2])
    smoothedAll = feature.rolling(window=window).mean()
    return np.array(smoothedAll[-steps:])

In [765]:
def exponentialSmoothing(feature, steps, split):
    """Fit additive-trend exponential smoothing and forecast ``steps`` periods ahead.

    The model is fitted on the first ``split`` observations with a fixed
    smoothing level of 0.2. Its forecast for the remaining observations is
    scored with every function in the module-level ``metrics`` mapping and
    reported through ``printMetrics``.

    Returns
    -------
    numpy.ndarray
        Float array with the ``steps`` forecasts that follow the end of
        ``feature``.
    """
    train = feature[:split]
    actual = feature[split:]
    model = ExponentialSmoothing(train, trend='add', seasonal=None).fit(smoothing_level=0.2)
    forecasted = model.forecast(steps=len(actual))
    # Scores come out in the insertion order of `metrics`.
    performance = [metrics[metric](actual, forecasted) for metric in metrics]
    printMetrics("Holts Winter Exponential Smoothing", feature.name, performance[0], performance[1], performance[2])
    # The model only saw `train`, so forecast across the holdout as well and
    # drop those leading values to keep only the periods after `feature`.
    forecast = model.forecast(steps=len(actual) + steps)
    # Convert directly to a float array; no string round-trip is needed.
    return np.asarray(forecast, dtype=float)[len(actual):]

In [766]:
def autoRegressive(feature, lag, steps, split):
    """Fit an autoregressive model and forecast ``steps`` periods ahead.

    The model is fitted on the first ``split`` observations using ``lag``
    lags. Its predictions for the remaining observations are scored with
    every function in the module-level ``metrics`` mapping and reported
    through ``printMetrics``.

    Returns
    -------
    numpy.ndarray
        Float array with the ``steps`` forecasts that follow the end of
        ``feature``.
    """
    train = feature[:split]
    actual = feature[split:]
    model = AutoReg(train, lags=lag).fit()
    # predict() takes zero-based, inclusive observation numbers, so the first
    # out-of-sample point is index len(train), not len(train) + 1. Starting
    # one later would compare each holdout value with the following period.
    holdoutStart = len(train)
    forecasted = model.predict(start=holdoutStart, end=holdoutStart + len(actual) - 1)
    # Scores come out in the insertion order of `metrics`.
    performance = [metrics[metric](actual, forecasted) for metric in metrics]
    printMetrics("Auto Regression", feature.name, performance[0], performance[1], performance[2])
    # The future horizon begins right after the last observation of `feature`.
    futureStart = len(train) + len(actual)
    forecast = model.predict(start=futureStart, end=futureStart + steps - 1)
    # Convert directly to a float array; no string round-trip is needed.
    return np.asarray(forecast, dtype=float)

In [767]:
def integratedAutoRegressiveMovingAverage(feature, steps, split):
    """Fit a non-seasonal auto-ARIMA model and forecast ``steps`` periods ahead.

    The model is selected and fitted by ``pm.auto_arima`` on the first
    ``split`` observations. Its predictions for the remaining observations
    are scored with every function in the module-level ``metrics`` mapping
    and reported through ``printMetrics``.

    Returns
    -------
    numpy.ndarray
        Float array with the ``steps`` forecasts that follow the end of
        ``feature``.
    """
    train = feature[:split]
    actual = feature[split:]
    model = pm.auto_arima(train, seasonal=False, stepwise=True)
    forecasted = model.predict(n_periods=len(actual))
    # Scores come out in the insertion order of `metrics`.
    performance = [metrics[metric](actual, forecasted) for metric in metrics]
    printMetrics("Auto Regressive Integerated Moving Average", feature.name, performance[0], performance[1], performance[2])
    # The model only saw `train`, so predict(n_periods=steps) would repeat the
    # holdout period. Predict across the holdout and drop those leading
    # values, as exponentialSmoothing does, to return the periods after
    # `feature`.
    forecast = model.predict(n_periods=len(actual) + steps)
    # Convert directly to a float array; no string round-trip is needed.
    return np.asarray(forecast, dtype=float)[len(actual):]
   

In [768]:
def multiLayerNeuralNetwork(neuralNetwork, data, disroder, predicted):
    """Train a two-layer recurrent network on year -> value and forecast future years.

    Parameters
    ----------
    neuralNetwork : str
        Key into the module-level ``neuralNetworks`` mapping selecting the
        recurrent layer class.
    data : pandas.DataFrame
        Frame with a datetime ``Year`` column and the target column.
    disroder : str or pandas.Series
        Target column name, or a Series whose ``name`` is that column name.
        (The parameter keeps its original spelling for compatibility.)
    predicted : pandas.DataFrame
        Frame with a datetime ``year`` column holding the periods to forecast.

    Returns
    -------
    numpy.ndarray
        Model output for every row of ``predicted``, as returned by
        ``model.predict``.
    """
    # Take the column name from the argument rather than from a notebook-level
    # `disorder` variable that happens to exist while the driver loop runs.
    disorder = disroder.name if isinstance(disroder, pd.Series) else disroder
    years = data['Year'].dt.year
    # shuffle=False keeps chronological order: the last 20% is the holdout.
    xTrain, xVal, yTrain, yVal = train_test_split(years, data[disorder], test_size=0.2, random_state=42, shuffle=False)
    # Shape the inputs as (samples, 1, 1) to match the input_shape=(1, 1)
    # declared on the first layer, and hand plain arrays to Keras.
    xTrain = xTrain.to_numpy().reshape(-1, 1, 1)
    xVal = xVal.to_numpy().reshape(-1, 1, 1)
    yTrain = yTrain.to_numpy()
    yVal = yVal.to_numpy()
    layer = neuralNetworks[neuralNetwork]
    model = Sequential()
    model.add(layer(64, activation="relu",  input_shape=(1, 1), return_sequences=True))
    model.add(layer(50, activation="relu"))
    model.add(Dense(1, activation="linear"))
    model.compile(optimizer='adam', loss='mse')
    model.fit(xTrain, yTrain, epochs=50, verbose=1, validation_data=(xVal, yVal))
    # Flatten the (n, 1) output so every metric compares it element-wise
    # with the (n,) target instead of broadcasting.
    forecasted = model.predict(xVal).ravel()
    # Scores come out in the insertion order of `metrics`.
    performance = [metrics[metric](yVal, forecasted) for metric in metrics]
    printMetrics(neuralNetwork, disorder, performance[0], performance[1], performance[2])
    futureYears = predicted['year'].dt.year.to_numpy().reshape(-1, 1, 1)
    forecast = model.predict(futureYears)
    return forecast

## Data Splitting

In [769]:
# Load every CSV dataset together with its file path, then replace the raw
# frames with their per-year averages (one row per distinct year).
data, dataFrameNames = loadData()
data = yearlyAverage(data)

In [770]:
# Scoring functions applied to every model. The forecasting functions pass
# the results to printMetrics positionally as (mse, mad, smape), so the
# insertion order of this mapping must stay MSE, MAE, SMAPE.
metrics = {
    'MSE': mean_squared_error,
    'MAE': mean_absolute_error,
    'SMAPE': smape
}
# Classical forecasting functions, keyed by the label used in the output
# column names.
classicalModel = {
    'Moving Average': movingAverage,
    'Exponential Smoothing': exponentialSmoothing,
    'Auto Regressive': autoRegressive,
    'Auto Regressive Integrated Moving Average': integratedAutoRegressiveMovingAverage
}
# Recurrent layer classes available to multiLayerNeuralNetwork.
neuralNetworks ={
    'RNN': SimpleRNN,
    'LSTM': LSTM,
}
# One autoregressive lag per dataset. The forecasting loop zips this list
# with the datasets, so at most len(lags) datasets are processed.
lags = [10,9,9]
# Rolling window size passed to movingAverage.
window = 3
# Number of future periods each model forecasts.
steps = 10
# Collects one DataFrame of forecasts per dataset.
predictions = []
# Fraction of each series used for training; the rest is the holdout.
splitPoint = 0.8

In [771]:
# Run every neural and classical model on every column of every dataset and
# collect one DataFrame of forecasts per dataset in `predictions`.
for frame, name, lag in zip(data, dataFrameNames, lags):
    # Drop the 9-character directory prefix and the ".csv" suffix for display.
    print(f"\n{name[9: -4]}")
    frame.sort_values(by='Year', ascending=True, inplace=True)
    # The frame is already sorted and column assignment aligns on the index,
    # so the years can be converted in place without re-sorting.
    frame['Year'] = pd.to_datetime(frame['Year'], format='%Y')
    disorders = frame.iloc[:, 1:]
    firstForecastYear = frame['Year'].iloc[-1] + pd.DateOffset(years=1)
    predicted = pd.DataFrame({'year': pd.date_range(start=firstForecastYear, periods=steps, freq='Y')})
    years = pd.to_datetime(frame['Year'], format='%Y')
    split = int(len(frame['Year']) * splitPoint)
    for disorder in disorders:
        # Yearly-period-indexed copy of the column for the statsmodels and
        # pmdarima models.
        feature = pd.Series(disorders[disorder].values, index=years, name=disorder)
        feature.index = feature.index.to_period('A')
        for model in neuralNetworks:
            predicted[f"{disorder} using {model}"] = multiLayerNeuralNetwork(model, frame, feature, predicted)
        # Positional arguments for each classical model; labels missing from
        # this table are skipped.
        classicalArguments = {
            'Moving Average': (disorders[disorder], window, steps, split),
            'Exponential Smoothing': (feature, steps, split),
            'Auto Regressive': (feature, lag, steps, split),
            'Auto Regressive Integrated Moving Average': (feature, steps, split),
        }
        for model in classicalModel:
            if model in classicalArguments:
                predicted[f"{disorder} using {model}"] = classicalModel[model](*classicalArguments[model])
    predictions.append(predicted)


1- mental-illnesses-prevalence
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1732.8698 - val_loss: 1552.1454
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1529.1559 - val_loss: 1413.6722
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1392.7343 - val_loss: 64.8696
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 63.9084 - val_loss: 459.1882
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 452.3899 - val_loss: 1032.1830
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1016.9003 - val_loss: 547.9573
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 539.8433 - val_loss: 20.2838
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 19.9841 - val_loss: 204.6261