In [None]:
# backend/api/models/regression_model.py
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score #, root_mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import dump, load
import os
import json
import numpy as np
# Paths for data and model storage.
# Both paths are relative, so they are resolved against the current working
# directory of the kernel/process that runs this code.
# NOTE(review): the __file__-based BASE_DIR below is disabled, presumably
# because __file__ is not defined inside a notebook kernel -- confirm before
# re-enabling it.
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
CSV_FILE_PATH = os.path.join('data', 'all_countries_data_processed.csv')
MODEL_PATH = os.path.join('model', 'regression_model.pkl')

def load_data(path=None):
    """Load the processed dataset from a CSV file.

    Args:
        path: Location of the CSV file. When omitted (``None``) the
            module-level ``CSV_FILE_PATH`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        pandas.DataFrame: The file contents as parsed by ``pd.read_csv``.
    """
    # Resolve the default at call time rather than at definition time.
    csv_path = CSV_FILE_PATH if path is None else path
    return pd.read_csv(csv_path)

def clean_dataset(dataset):
    """Drop rows whose 'Exposure Mean' or 'Burden Mean' is an IQR outlier.

    For each of the two columns the fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed over the whole input. A row is kept only
    when both of its values lie inside their column's fences (bounds are
    inclusive).

    Args:
        dataset: DataFrame with numeric 'Exposure Mean' and 'Burden Mean'
            columns.

    Returns:
        The rows of ``dataset`` that pass both filters, with their original
        index labels.
    """
    def fences(values):
        # 25th / 75th percentiles and the interquartile range between them.
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

    exposure = dataset['Exposure Mean']
    burden = dataset['Burden Mean']

    # `between` is inclusive on both ends, i.e. (>= lower) & (<= upper).
    exposure_in_range = exposure.between(*fences(exposure))
    burden_in_range = burden.between(*fences(burden))

    return dataset[exposure_in_range & burden_in_range]

def preprocess_data(data, return_scalers=False):
    """Encode and min-max scale the dataset for model training.

    Steps:
      1. Keep only 'Country', 'Pollutant', 'Exposure Mean', 'Burden Mean'.
      2. Label-encode 'Country' (codes follow order of first appearance) and
         write the mapping to ``data/country_dict.json``.
      3. One-hot encode 'Pollutant'.
      4. Min-max scale the features and the 'Burden Mean' target, each with
         its own ``MinMaxScaler``.

    Args:
        data: DataFrame containing at least the four columns listed above.
            It is not modified.
        return_scalers: When True, also return the two fitted scalers so the
            same scaling can be re-applied (or inverted) at prediction time.
            Defaults to False, which keeps the original two-value return.

    Returns:
        ``(X_scaled, y_scaled)`` by default, or
        ``(X_scaled, y_scaled, scaler_X, scaler_y)`` when ``return_scalers``
        is True. ``y_scaled`` has shape ``(n_rows, 1)``.

    Side effects:
        Creates the ``data`` directory if needed and (over)writes
        ``data/country_dict.json``.
    """
    # `.copy()` yields an independent frame, so the 'Country' assignment below
    # no longer writes into a column-subset of the caller's DataFrame (the
    # pattern that raises pandas' SettingWithCopyWarning).
    data = data[['Country', 'Pollutant', 'Exposure Mean', 'Burden Mean']].copy()

    # Label-encode countries: first distinct value -> 0, second -> 1, ...
    country_dict = {country: code for code, country in enumerate(data['Country'].unique())}
    data['Country'] = data['Country'].map(country_dict)

    # Persist the mapping; make_prediction() reads this file back.
    os.makedirs('data', exist_ok=True)
    with open('data/country_dict.json', 'w') as f:
        json.dump(country_dict, f)

    # One-hot encode 'Pollutant' (all categories kept).
    data = pd.get_dummies(data, columns=['Pollutant'], drop_first=False)

    # Separate features and target.
    X = data.drop(columns=['Burden Mean'])
    y = data['Burden Mean']

    # Scale features and target to [0, 1] with independent scalers.
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

    if return_scalers:
        return X_scaled, y_scaled, scaler_X, scaler_y
    return X_scaled, y_scaled

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and test sets.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Share of the rows held out for testing; forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        The result of ``train_test_split``, unpacked by callers as
        ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def ai_model_linear_train(x_train, x_test, y_train, y_test):
    """Fit an ordinary linear regression and score it on the test split.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        tuple: ``(fitted_model, mse, r2)`` where ``mse`` and ``r2`` are the
        mean squared error and R^2 score of the test-set predictions.
    """
    # `fit` returns the estimator itself, so construction and training chain.
    regressor = LinearRegression().fit(x_train, y_train)

    # Score on the held-out rows only.
    predicted = regressor.predict(x_test)
    return (
        regressor,
        mean_squared_error(y_test, predicted),
        r2_score(y_test, predicted),
    )

def ai_model_polynomial_train(x_train, x_test, y_train, y_test, degree=4):
    """Fit a polynomial regression pipeline and score it on the test split.

    The model is ``PolynomialFeatures(degree)`` followed by
    ``LinearRegression``, combined with ``make_pipeline``.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        degree: Degree of the polynomial feature expansion. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        tuple: ``(fitted_pipeline, mse, r2)`` where ``mse`` and ``r2`` are the
        mean squared error and R^2 score of the test-set predictions.
    """
    # NOTE(review): a high-degree expansion multiplies the number of features;
    # consider a lower degree or a regularised regressor if predictions look
    # unstable.
    poly_features = PolynomialFeatures(degree=degree)
    model = make_pipeline(poly_features, LinearRegression())
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return model, mse, r2

def save_model(model, path=MODEL_PATH):
    """Serialise ``model`` to disk with joblib.

    Args:
        model: Any picklable object. In this notebook it is the
            ``(model, mse, r2)`` tuple returned by the training functions.
        path: Destination file. Defaults to ``MODEL_PATH``.

    The parent directory of ``path`` is created when it does not exist, so a
    fresh checkout without a ``model/`` folder no longer fails with
    ``FileNotFoundError``.
    """
    # joblib.dump also accepts open file objects; only path-like targets have
    # a directory that can be created.
    if isinstance(path, (str, os.PathLike)):
        target_dir = os.path.dirname(path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
    dump(model, path)

def load_model(path=MODEL_PATH):
    """Load a model previously written by ``save_model``.

    Args:
        path: File to read. Defaults to ``MODEL_PATH``.

    Returns:
        The stored estimator, or ``None`` when ``path`` does not exist. If the
        file holds a tuple or list (e.g. the ``(model, mse, r2)`` result of a
        training function), its first element is returned; any other object
        is returned as-is.
    """
    if not os.path.exists(path):
        return None

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load files produced by this project.
    loaded = load(path)

    # Only unwrap real containers. Blindly indexing with [0] raises TypeError
    # for a bare estimator and, for a bare Pipeline, silently returns its
    # first step instead of the pipeline.
    if isinstance(loaded, (tuple, list)):
        return loaded[0]
    return loaded

def make_prediction(model, input_dict, scaler_X=None, scaler_y=None):
    """Predict the target for one (country, pollutant, exposure) sample.

    Args:
        model: Fitted estimator exposing ``predict``.
        input_dict: Mapping with keys 'Country' (name present in
            ``data/country_dict.json``), 'Pollutant' (one of 'no2', 'ozone',
            'hap', 'pm25') and 'Exposure Mean' (number). It is not modified.
        scaler_X: The feature scaler that was fitted on the training
            features. When given, the sample is scaled with
            ``scaler_X.transform``. When omitted, the legacy behaviour is
            kept (see the warning below).
        scaler_y: Optional target scaler fitted on the training target. When
            given, the prediction is mapped back to original units with
            ``scaler_y.inverse_transform``.

    Returns:
        The first row of ``model.predict`` for the sample. Without
        ``scaler_y`` this is whatever the model outputs, i.e. scaled units if
        the model was trained on a scaled target.

    Raises:
        ValueError: If the country or pollutant is not recognised.

    Warns:
        RuntimeWarning: If ``scaler_X`` is omitted. A MinMaxScaler fitted on
        a single row maps every feature to 0, so the prediction does not
        depend on the input values.
    """
    import warnings  # local import: only needed for the legacy fallback

    with open("data/country_dict.json") as f:
        country_dict = json.load(f)

    # Look values up without writing back into the caller's dict, so the same
    # input_dict can be reused for further calls.
    country = input_dict['Country']
    if country not in country_dict:
        raise ValueError(f"Unknown country {country!r}; not present in data/country_dict.json")

    # NOTE(review): this one-hot order must match the dummy-column order the
    # model was trained with. pd.get_dummies orders dummy columns by sorted
    # category value -- confirm against the training feature columns.
    pollutants = {'no2': [1, 0, 0, 0], 'ozone': [0, 1, 0, 0], 'hap': [0, 0, 1, 0], 'pm25': [0, 0, 0, 1]}
    pollutant = input_dict['Pollutant']
    if pollutant not in pollutants:
        raise ValueError(f"Unknown pollutant {pollutant!r}; expected one of {sorted(pollutants)}")

    # One row: [country code, exposure, one-hot pollutant...]
    features = np.array(
        [[country_dict[country], input_dict['Exposure Mean']] + pollutants[pollutant]]
    ).astype(float)

    if scaler_X is None:
        warnings.warn(
            "make_prediction called without scaler_X: fitting a MinMaxScaler on a "
            "single row maps every feature to 0, so the prediction ignores the input. "
            "Pass the scaler fitted on the training features.",
            RuntimeWarning,
            stacklevel=2,
        )
        scaled_features = MinMaxScaler().fit_transform(features)
    else:
        # Re-apply the training-time scaling; never re-fit at inference.
        scaled_features = scaler_X.transform(features)

    prediction = model.predict(scaled_features)
    if scaler_y is not None:
        # inverse_transform expects a 2-D column.
        prediction = scaler_y.inverse_transform(np.asarray(prediction).reshape(-1, 1))
    return prediction[0]


In [None]:
# Data preparation: read the CSV, drop IQR outliers, encode and scale the
# features/target, then split into training and test sets.
data = clean_dataset(load_data())
X, y = preprocess_data(data)
X_train, X_test, y_train, y_test = split_data(X, y)

In [24]:
# Train linear and polynomial models.
# Each trainer returns a (model, mse, r2) tuple, so the two names below hold
# the whole tuple, not just the estimator.
linear_model = ai_model_linear_train(X_train, X_test, y_train, y_test)
poly_model = ai_model_polynomial_train(X_train, X_test, y_train, y_test)

# Save models to disk. The full tuple is pickled on purpose: load_model()
# reads element 0 of what was stored.
save_model(linear_model, path='model/linear_model.pkl')
save_model(poly_model, path='model/poly_model.pkl')

# Show the test-set metrics as the cell output instead of discarding them.
# They are computed on the min-max scaled target returned by preprocess_data.
pd.DataFrame(
    {'linear': list(linear_model[1:]), 'polynomial': list(poly_model[1:])},
    index=['mse', 'r2'],
)

In [27]:
# Smoke test: reload the saved polynomial model and predict one sample.
model = load_model('model/poly_model.pkl')
if model is None:
    # load_model() returns None for a missing file; fail here with a clear
    # message instead of an AttributeError inside make_prediction().
    raise FileNotFoundError("model/poly_model.pkl not found - run the training cell first")

input_test = {'Country': 'Viet Nam', 'Pollutant': 'no2', 'Exposure Mean': 10}

# NOTE(review): no training scaler is passed here, so the sample is scaled by
# a MinMaxScaler fitted on this single row and the result is in the model's
# scaled target units -- treat the printed number with caution.
result = make_prediction(model, input_test)
print(result)

(1, 6)
[-8.11212271e+10]
