In [None]:
import os
import subprocess
import warnings

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import sweetviz as sv
from mlflow.tracking import MlflowClient
from scipy import stats
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Point the MLflow client at the remote tracking server
mlflow.set_tracking_uri("https://mlflow-nhsanv7hcq-uc-a.run.app")

# Authentication / project settings read from the environment:
#  - GOOGLE_APPLICATION_CREDENTIALS: path to the credentials file created by
#    `gcloud auth application-default login`
#  - MLFLOW_TRACKING_USERNAME and GOOGLE_CLOUD_PROJECT: environment details
os.environ.update({
    'GOOGLE_APPLICATION_CREDENTIALS': '/content/.config/application_default_credentials.json',
    'MLFLOW_TRACKING_USERNAME': 'username',
    'GOOGLE_CLOUD_PROJECT': 'pot-test-environment',
})

In [None]:
# Obtain a Google identity token and expose it to MLflow as the tracking token.
# The token is captured programmatically so it never has to be copy-pasted into
# (and accidentally saved with) the notebook.
identity_token = subprocess.run(
    ["gcloud", "auth", "print-identity-token"],
    capture_output=True,
    text=True,
    check=True,  # fail loudly if gcloud is missing or not authenticated
).stdout.strip()

if not identity_token:
    raise RuntimeError("gcloud returned an empty identity token; run `gcloud auth login` first")

# The output of gcloud auth print-identity-token is the key here
os.environ['MLFLOW_TRACKING_TOKEN'] = identity_token

In [None]:
## Load data

# Optional alternative input location (disabled)
#input_folder = '/content/drive/My Drive/Software tests/folder'

# Read the training and testing CSVs (change the paths to your own) and discard
# any row in which every column is missing
training_data = pd.read_csv("train.csv").dropna(axis=0, how='all')
testing_data = pd.read_csv("test.csv").dropna(axis=0, how='all')

print("Training data:", training_data.shape)

In [None]:
# Drop rows with missing target values

# A training row is only usable when its target 'r1_iram1622' is present
target_present = training_data['r1_iram1622'].notna()
training_data = training_data.loc[target_present]
print("New training data:", training_data.shape)

In [None]:
# Remove Features based on completion thresholds

# Largest fraction of missing values a feature may have and still be kept
feature_missing_threshold = 0.8

# Fraction of missing values per column (missing rows / all rows).
# Dividing by count() instead would yield missing / non-missing, which is not
# the fraction the threshold describes and becomes inf for an all-empty column.
missing_values = training_data.isna().mean()

print("Features with missing values more than threshold: ", missing_values[missing_values > feature_missing_threshold])

# Features whose missing fraction does not exceed the threshold are retained.
# `<=` is the exact complement of the `>` used above, so a column sitting on
# the threshold is not lost from both lists.
features_without_missing_values = missing_values[missing_values <= feature_missing_threshold].index.tolist()

# Print the list of retained features
print("Features without missing values converted to list for INDEX", features_without_missing_values)

# Take subset of data with updated feature list
training_data = training_data[features_without_missing_values]

In [None]:
# Impute missing data for Features

# Per-column medians learned from the training data only.
# numeric_only=True keeps this working when the frame holds non-numeric
# columns (recent pandas raises on them instead of skipping them silently).
median_values = training_data.median(numeric_only=True)

# Fill missing values in the training data with the median
training_data = training_data.fillna(median_values)
print("New training data:", training_data.shape)

# Fill missing values in the testing data with the same (training) medians so
# no statistic is derived from the test set
testing_data = testing_data.fillna(median_values)

In [None]:
# Outlier Removal

# Calculate z-scores for the target column
z_scores = stats.zscore(training_data['r1_iram1622'])

# Rows whose |Z-score| reaches this value are treated as outliers
# (i.e. Z-score >= 2 or <= -2)
threshold = 2

within_threshold = np.abs(z_scores) < threshold
beyond_threshold = np.abs(z_scores) >= threshold

# Keep only the rows inside the threshold. The explicit .copy() makes the
# result an independent frame, so later column drops on it do not trigger
# pandas' SettingWithCopyWarning.
training_data_no_outliers = training_data[within_threshold].copy()

print("Outliers:\n\n", training_data[beyond_threshold])

In [None]:
# Remove Correlated Features

# Features whose absolute pairwise correlation exceeds this value are dropped
corr_threshold = 0.9

# Absolute correlation between every pair of features (target excluded)
corr_matrix = training_data_no_outliers.drop(columns=['r1_iram1622']).corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and a
# feature is never compared with itself
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
highly_correlated = [
    column
    for column in upper_triangle.columns
    if (upper_triangle[column] > corr_threshold).any()
]

# Drop highly correlated features. Reassigning (rather than dropping in place)
# avoids mutating a frame that was produced by filtering another frame.
training_data_no_outliers = training_data_no_outliers.drop(columns=highly_correlated)
features_without_missing_values = [
    feature for feature in features_without_missing_values if feature not in highly_correlated
]

print("Features Dropped: ", highly_correlated)

In [None]:
# Split Data

# Split training data into train and test

# Feature columns are every retained column except the target 'r1_iram1622'.
# (Removing any other name here would either raise ValueError or leave the
# target inside X and leak it into the models.)
feature_columns = [
    column for column in features_without_missing_values if column != 'r1_iram1622'
]

# Split the data into features (X) and the target (y)
y = training_data_no_outliers['r1_iram1622']
X = training_data_no_outliers[feature_columns]

# Random 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=112)

# Frame used for the time-based split. No sorting is performed here, so the
# rows keep the order in which they were loaded.
training_data_sorted = training_data_no_outliers
print(training_data_sorted)

In [None]:
# Time-based split: hold out the final rows for testing

# The frame is used in its existing row order
training_data_sorted = training_data_no_outliers

# Number of trailing rows reserved for testing
n_holdout_rows = 20

# Trailing rows -> test features / target
# (note: this rebinds `testing_data`, replacing the frame loaded from test.csv)
testing_data = training_data_sorted.iloc[-n_holdout_rows:]
y_test = testing_data['r1_iram1622']
X_test = testing_data.drop(columns=['r1_iram1622'])

# Everything before them -> training features / target
# (note: this rebinds `training_data` to the training portion only)
training_data = training_data_sorted.iloc[:-n_holdout_rows]
y_train = training_data['r1_iram1622']
X_train = training_data.drop(columns=['r1_iram1622'])

In [None]:
# Normalize the data

# Initialize the scaler
scaler = StandardScaler()

# Fit on the training data only, then transform it.
# The original row index is carried over so X_train stays aligned with
# y_train by label as well as by position.
X_train_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

# Transform the testing data using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [None]:
# Prune Tail of Features for Reducing Overfitting

# Fit a random forest and use its feature importances to find features that
# contribute almost nothing. The seed is fixed so the selected feature set is
# the same on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Features with an importance below this value are removed
threshold_feature_imp = 0.0001

# Get feature importance
feature_importance = pd.Series(rf.feature_importances_, index=X_train.columns)
low_importance_features = feature_importance[feature_importance < threshold_feature_imp].index.tolist()

print("Total number of features: ", len(feature_importance))
print("Count of low importance features removed: ", len(low_importance_features))

# Remove the low-importance features from both sets (new frames, no in-place mutation)
X_train = X_train.drop(columns=low_importance_features)
X_test = X_test.drop(columns=low_importance_features)

In [None]:
# Setup MLflow Run
EXPERIMENT_NAME = "CEMQ1_Challenge"
mlflow_client = MlflowClient()

# Look the experiment up by name; create it only when the lookup finds nothing
experiment_details = mlflow_client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment_details is None:
    print(f"Creating New Experiment: {EXPERIMENT_NAME}")
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    print(f"Experiment: {EXPERIMENT_NAME} - already Exists")
    experiment_id = experiment_details.experiment_id

# Silence the Git-import UserWarning
warnings.filterwarnings("ignore", category=UserWarning, message="Failed to import Git")

# Announce the run that follows
print(f"Beginning MLflow Run CemQ Grid Search for Experiment: {EXPERIMENT_NAME}")

with mlflow.start_run(experiment_id=experiment_id, run_name="CemQ Grid Search") as run:
    # Candidate estimators, each paired with the hyperparameter grid to search.
    # Every key must be unique: a repeated key silently replaces the earlier entry.
    models = {
        "RandomForest": (
            RandomForestRegressor(random_state=42),
            {
                "n_estimators": [100, 200, 300, 400],
                "max_depth": [5, 10, 15],
            },
        ),
        "GradientBoosting": (
            GradientBoostingRegressor(random_state=42),
            {
                "n_estimators": [50, 100, 150, 200],
                "learning_rate": [0.1, 0.01],
            },
        ),
        "LinearRegression": (
            LinearRegression(),
            {},
        ),
        "SVM": (
            SVR(),
            {
                "C": [1, 10],
                "kernel": ["linear", "rbf"],
            },
        ),
        "Ridge": (Ridge(random_state=42), {"alpha": [0.1, 1.0, 10.0]}),
        "Lasso": (Lasso(random_state=42), {"alpha": [0.1, 1.0, 10.0]}),
    }

    # Everything below stays inside the `with` block so that all metrics,
    # params and the model artifact are attached to this run.
    best_model_name = None
    best_model = None
    best_mae = float('inf')

    # Perform hyperparameter tuning for each model
    print("Beginning Grid Search ....\n\n")

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_absolute_error", cv=5)

        # Fit the model on the training data
        grid_search.fit(X_train, y_train)

        # Make predictions on the test data using the best estimator of this grid
        top_model = grid_search.best_estimator_
        y_pred = top_model.predict(X_test)

        # Calculate MAE for the model
        mae = mean_absolute_error(y_test, y_pred)

        # Log MAE and best hyperparameters for each model in MLflow
        mlflow.log_metric(f"{model_name}_test_MAE", mae)
        mlflow.log_param(f"{model_name}_best_params", grid_search.best_params_)
        print("Model Name:", model_name, "; Test MAE:", mae, "; Best Params:", grid_search.best_params_)

        # Choose the best model based on the lowest MAE.
        # NOTE(review): the winner is picked on the hold-out MAE, so the
        # reported test MAE of the best model is optimistic.
        if mae < best_mae:
            best_mae = mae
            best_model_name = model_name
            best_model = top_model

    # Summary and best-model logging happen once, after every candidate has
    # been evaluated (an MLflow param cannot be overwritten with a new value).
    print("Completion of Grid Search ....\n\n")
    print("Best Model Name:", best_model_name)
    print("Best Model:", best_model)
    print("Best Model Test MAE:", best_mae)

    # Log the best model name and its MAE in MLflow
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_metric("best_model_test_MAE", best_mae)

    # Generate predictions using the best model
    test_predictions = best_model.predict(X_test)
    train_predictions = best_model.predict(X_train)

    # Evaluate the regression model
    test_mae = mean_absolute_error(y_test, test_predictions)
    train_mae = mean_absolute_error(y_train, train_predictions)

    # Log metrics in MLflow
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)

    # Log the model in MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="model")

# Log run information
run_id = run.info.run_id
print("Run ID: {}".format(run_id))

# Register the model in MLflow
model_uri = "runs:/{}/model".format(run_id)
mv = mlflow.register_model(model_uri, "CemQ28Challenge_Model")
print("Name: {}".format(mv.name))
print("Version: {}".format(mv.version))

# Load the logged model back as a PyFuncModel. The URI is the same one that
# was registered above: runs:/<run_id>/<artifact_path>.
logged_model = model_uri
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)


In [None]:
## Test Moving Average Method as Base MAE

def moving_average_forecast(data, window_size):
    """Return the mean of every full sliding window over `data`.

    Args:
        data: one-dimensional sequence of numbers (list, array or Series).
        window_size: number of consecutive values averaged per window.

    Returns:
        A list with ``len(data) - window_size + 1`` means, where element ``i``
        is the mean of ``data[i : i + window_size]`` (by position). The list is
        empty when ``data`` is shorter than ``window_size``.
    """
    values = np.asarray(data, dtype=float)  # positional access regardless of any index labels
    return [
        float(values[start:start + window_size].mean())
        for start in range(len(values) - window_size + 1)
    ]


# Define the window size for the moving average
window_size = 8  # Adjust and test with multiple values

# Start an MLFlow experiment run
with mlflow.start_run(experiment_id=experiment_id, run_name="CemQ Moving Average - Base Model") as run:
    # Window means over the training and test targets
    train_forecast = moving_average_forecast(y_train, window_size)
    test_forecast = moving_average_forecast(y_test, window_size)

    # Window i covers rows i .. i+window_size-1, so its mean is the forecast
    # for row i+window_size. Scoring it against a row inside its own window
    # would let the forecast see the value it is predicting. The last window
    # has no following row and is therefore left out.
    train_actual = np.asarray(y_train, dtype=float)[window_size:]
    test_actual = np.asarray(y_test, dtype=float)[window_size:]

    # Compute MAE for the moving average forecast on the training and test sets
    train_mae = mean_absolute_error(train_actual, train_forecast[:-1])
    test_mae = mean_absolute_error(test_actual, test_forecast[:-1])

    # Print the Moving Average Train and Test MAE
    print("Moving Average Train MAE:", train_mae)
    print("Moving Average Test MAE:", test_mae)

    # Log the experiment metrics to MLFlow
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)

In [None]:
# ARIMA model on the target series

# Start an MLFlow experiment run
with mlflow.start_run(experiment_id=experiment_id, run_name="CemQ - ARIMA") as run:
    # Give the training target a synthetic daily date index. The values are
    # taken positionally so the original row labels play no part in the result.
    dates = pd.date_range(start='2023-01-01', periods=len(y_train), freq='D', name='Date')
    train_data = pd.DataFrame({'Value': np.asarray(y_train)}, index=dates)

    # ARIMA order (p, d, q)
    p, d, q = 0, 1, 4

    # Warnings are silenced only around the statsmodels calls, so the rest of
    # the notebook keeps reporting warnings normally.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Define and fit the ARIMA model
        arima_model = ARIMA(train_data['Value'], order=(p, d, q))
        arima_model_fit = arima_model.fit()

        # In-sample predictions over the whole training range
        arima_train_forecast = arima_model_fit.predict(start=0, end=len(y_train) - 1)

        # Out-of-sample forecast, one step per test row
        forecast_steps = len(y_test)
        arima_test_forecast = arima_model_fit.forecast(steps=forecast_steps)

    # Calculate MAE for the ARIMA forecast on the test and train sets
    train_mae = mean_absolute_error(y_train, arima_train_forecast)
    test_mae = mean_absolute_error(y_test, arima_test_forecast)

    # Print the ARIMA Train and Test MAE
    print("ARIMA Test MAE:", test_mae)
    print("ARIMA Train MAE:", train_mae)

    # Log the experiment results to MLFlow
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)
    mlflow.log_param("best_model", "ARIMA")

In [None]:
# Linear regression with RFE (recursive feature elimination)
# RFE and LinearRegression come from the notebook's import cell.

# Start an MLFlow experiment run
with mlflow.start_run(experiment_id=experiment_id, run_name="CemQ - RFE") as run:
    # Initialize a Linear Regression model
    model = LinearRegression()

    # Number of features RFE should keep. Capped at the number of available
    # columns so the request is always satisfiable. Adjust as needed.
    n_features_to_select = min(20, X_train.shape[1])

    # Fit RFE (wrapping the Linear Regression model) on the training data
    rfe = RFE(model, n_features_to_select=n_features_to_select)
    rfe = rfe.fit(X_train, y_train)

    # Print the selected features
    selected_features = X_train.columns[rfe.support_]
    print("Selected Features:", selected_features)

    # Train a Linear Regression model using the selected features
    model.fit(X_train[selected_features], y_train)

    # Make predictions on the test and training sets
    y_pred = model.predict(X_test[selected_features])
    y_train_pred = model.predict(X_train[selected_features])

    # Calculate the performance metric (mean absolute error)
    test_mae = mean_absolute_error(y_test, y_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)

    print("Training MAE with selected features:", train_mae)
    print("Test MAE with selected features:", test_mae)

    # Log Experiment to MLflow
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)
    mlflow.log_param("best_model", "Recursive Feature Elimination")

In [None]:
# Restrict the data to the features chosen by RFE

# Work with a copy of the selected column labels
selected_features = selected_features.copy()

# From here on both feature matrices contain only those columns
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

In [None]:
with mlflow.start_run(experiment_id=experiment_id, run_name="RFE Feature - Grid Search") as run:
    # Candidate estimators, each paired with the hyperparameter grid to search.
    # Every key must be unique: a repeated key silently replaces the earlier entry.
    models = {
        "RandomForest": (
            RandomForestRegressor(random_state=42),
            {
                "n_estimators": [100, 200, 300, 400],
                "max_depth": [5, 10, 15],
            },
        ),
        "GradientBoosting": (
            GradientBoostingRegressor(random_state=42),
            {
                "n_estimators": [50, 100, 150, 200],
                "learning_rate": [0.1, 0.01],
            },
        ),
        "LinearRegression": (
            LinearRegression(),
            {},
        ),
        "SVM": (
            SVR(),
            {
                "C": [1, 10],
                "kernel": ["linear", "rbf"],
            },
        ),
        "Ridge": (Ridge(random_state=42), {"alpha": [0.1, 1.0, 10.0]}),
        "Lasso": (Lasso(random_state=42), {"alpha": [0.1, 1.0, 10.0]}),
    }

    # Everything below stays inside the `with` block so that all metrics,
    # params and the model artifact are attached to this run.
    best_model_name = None
    best_model = None
    best_mae = float('inf')

    # Perform hyperparameter tuning for each model
    print("Beginning Grid Search ....\n\n")

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_absolute_error", cv=5)

        # Fit the model on the training data
        grid_search.fit(X_train, y_train)

        # Make predictions on the test data using the best estimator of this grid
        top_model = grid_search.best_estimator_
        y_pred = top_model.predict(X_test)

        # Calculate MAE for the model
        mae = mean_absolute_error(y_test, y_pred)

        # Log MAE and best hyperparameters for each model in MLflow
        mlflow.log_metric(f"{model_name}_test_MAE", mae)
        mlflow.log_param(f"{model_name}_best_params", grid_search.best_params_)
        print("Model Name:", model_name, "; Test MAE:", mae, "; Best Params:", grid_search.best_params_)

        # Choose the best model based on the lowest MAE
        if mae < best_mae:
            best_mae = mae
            best_model_name = model_name
            best_model = top_model

    # Summary and best-model logging happen once, after every candidate has
    # been evaluated (an MLflow param cannot be overwritten with a new value).
    print("Completion of Grid Search ....\n\n")
    print("Best Model Name:", best_model_name)
    print("Best Model:", best_model)
    print("Best Model Test MAE:", best_mae)

    # Log the best model name and its MAE in MLflow
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_metric("best_model_test_MAE", best_mae)

    # Generate predictions using the best model
    test_predictions = best_model.predict(X_test)
    train_predictions = best_model.predict(X_train)

    # Evaluate the regression model
    test_mae = mean_absolute_error(y_test, test_predictions)
    train_mae = mean_absolute_error(y_train, train_predictions)

    # Log metrics in MLflow
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)

    # Log the model in MLflow
    mlflow.sklearn.log_model(best_model, artifact_path="model")

# Log run information
run_id = run.info.run_id
print("Run ID: {}".format(run_id))

# Register the model in MLflow
# NOTE(review): this name differs from the "CemQ28Challenge_Model" used by the
# earlier grid-search cell -- confirm the separate registry entry is intended.
model_uri = "runs:/{}/model".format(run_id)
mv = mlflow.register_model(model_uri, "CemQ280Challenge_Model")
print("Name: {}".format(mv.name))
print("Version: {}".format(mv.version))


In [None]:
# Blend models to reduce overfitting in gridsearch:
# tune every candidate, then average their predictions.

with mlflow.start_run(experiment_id=experiment_id, run_name="Blended Models + Grid Search") as run:
    # Initialize models with their hyperparameters for Grid Search
    models = {
        "RandomForest": (
            RandomForestRegressor(random_state=42),
            {"n_estimators": [100, 200, 300, 400], "max_depth": [5, 10, 15]}
        ),
        "GradientBoosting": (
            GradientBoostingRegressor(random_state=42),
            {"n_estimators": [50, 100, 150, 200], "learning_rate": [0.1, 0.01]}
        ),
        "LinearRegression": (LinearRegression(), {}),
        "SVM": (
            SVR(),
            {"C": [1, 10], "kernel": ["linear", "rbf"]}
        ),
        "Ridge": (
            Ridge(random_state=42),
            {"alpha": [0.1, 1.0, 10.0]}
        ),
        "Lasso": (
            Lasso(random_state=42),
            {"alpha": [0.1, 1.0, 10.0]}
        )
    }

    # Running sums of the per-model predictions on the train and test sets
    train_predictions = np.zeros(X_train.shape[0])
    test_predictions = np.zeros(X_test.shape[0])

    for model_name, (model, param_grid) in models.items():
        # A fresh search per candidate; without this the loop would keep
        # refitting whatever `grid_search` object an earlier cell left behind.
        grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_absolute_error", cv=5)
        grid_search.fit(X_train, y_train)

        top_model = grid_search.best_estimator_
        y_pred_test = top_model.predict(X_test)
        y_pred_train = top_model.predict(X_train)

        test_predictions += y_pred_test
        train_predictions += y_pred_train

        # Per-model MAE, recorded so each member of the blend can be inspected
        mae_test = mean_absolute_error(y_test, y_pred_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mlflow.log_metric(f"{model_name}_test_MAE", mae_test)
        mlflow.log_metric(f"{model_name}_train_MAE", mae_train)
        print("Model Name:", model_name, "; Test MAE:", mae_test, "; Train MAE:", mae_train)

    # Average predictions across all models
    test_predictions /= len(models)
    train_predictions /= len(models)

    # Calculate MAE of the blended predictions
    test_mae = mean_absolute_error(y_test, test_predictions)
    train_mae = mean_absolute_error(y_train, train_predictions)

    # Print results
    print("Training MAE with Blended model:", train_mae)
    print("Test MAE with Blended model:", test_mae)

    # Log to MLflow (still inside the `with`, so it lands on this run)
    mlflow.log_metric("MAE_test", test_mae)
    mlflow.log_metric("MAE_train", train_mae)
    mlflow.log_param("best_model", "Blended model")

In [None]:
# Scratch check: divide a float array element-wise by 3
test = np.array([1, 5, 10], dtype=float) / 3
print(test)

In [None]:
# Compute Base Model on Test Data
# Uses `moving_average_forecast` and `window_size` defined in the earlier
# moving-average cell instead of redefining the function here.

# Start an MLflow experiment run
with mlflow.start_run(experiment_id=experiment_id, run_name="Cem Test Data - Base Model") as run:
    # Target history with missing values removed, taken positionally
    target_history = np.asarray(training_data["r1_iram1622"].dropna(), dtype=float)

    # Window means over the training target
    train_forecast = moving_average_forecast(target_history, window_size)

    # NOTE(review): this averages the training window means a second time and
    # does not read test.csv -- confirm this is the intended "test forecast".
    test_forecast = moving_average_forecast(train_forecast, window_size=window_size)
    print(test_forecast[-5:])  # only the tail; the full list can be very long

    # Window i covers rows i .. i+window_size-1, so its mean is the forecast
    # for row i+window_size; the last window has no following row and is
    # therefore left out of the score.
    train_mae = mean_absolute_error(target_history[window_size:], train_forecast[:-1])

    # No MAE for the test forecast: there are no actual values to compare with

    # Print the Moving Average Train MAE
    print("Moving Average Train MAE:", train_mae)

    # Log Experiment to MLflow
    mlflow.log_metric("MAE_train", train_mae)
    mlflow.log_param("best_model", "Moving Average")