In [5]:
# Install Streamlit/ other packss
!pip install streamlit joblib pandas numpy plotly



In [5]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define paths for train and test CSV files
train_path = "train.csv"
test_path = "test.csv"

# Load the training and test datasets
try:
    train = pd.read_csv(train_path)  # Ensure the correct path is provided
    test = pd.read_csv(test_path)  # Ensure the correct path is provided
except FileNotFoundError:
    raise FileNotFoundError("Ensure the train.csv and test.csv files are in the correct directory.")

# Identify numeric and categorical columns
numeric_columns = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categoric_columns = [col for col in train.columns if col not in numeric_columns]

# Remove 'target' and 'nbr_of_transactions' from numeric columns if they exist
for column in ['target', 'nbr_of_transactions']:
    if column in numeric_columns:
        numeric_columns.remove(column)

# One-hot encode categorical  instead of BE
train_encoded = pd.get_dummies(train, columns=categoric_columns, drop_first=True)  # Drop first to avoid multicollinearity
test_encoded = pd.get_dummies(test, columns=categoric_columns, drop_first=True)

# Ensure test_encoded has the same columns as train_encoded
missing_cols = set(train_encoded.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0  # Add missing columns with a default value of 0

# Ensure columns are in the same order
test_encoded = test_encoded[train_encoded.columns.drop(['target'])]

# Standardize numeric columns
scaler = StandardScaler()
scaler.set_output(transform="pandas")  # To ensure output is a DataFrame
scaler.fit(train_encoded[numeric_columns])

# Scale training and test datasets
scaled_train = scaler.transform(train_encoded[numeric_columns])
scaled_test = scaler.transform(test_encoded[numeric_columns])

# Concatenate scaled numeric columns with encoded categorical columns and target for training data
train_combined = pd.concat([scaled_train, train_encoded.drop(numeric_columns + ['target'], axis=1), train_encoded['target']], axis=1)

# Concatenate scaled numeric columns with encoded categorical columns for test data
test_combined = pd.concat([scaled_test, test_encoded.drop(numeric_columns, axis=1)], axis=1)

# Split datasets into features and target variable
X_train = train_combined.drop(['target'], axis=1)
y_train = train_combined['target']

X_test = test_combined  # Test set should not contain 'target'

# Initialize models for training and evaluation
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor()
}

# Train and evaluate each model
model_results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Store the results in a list for DataFrame conversion
    model_results.append({"Model": name, "MSE": mse, "MAE": mae})

# Convert results to a DataFrame for display
results_df = pd.DataFrame(model_results)

# Define folder path for saving models
folder_path = "model_files"  # Set a relative path for storing model files

# Create the folder if it doesn't exist
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Save models to specified folder
for name, model in models.items():
    joblib.dump(model, os.path.join(folder_path, f"{name}_model.joblib"))

# Save the scaler for future use
joblib.dump(scaler, os.path.join(folder_path, "scaler.joblib"))

# Check feature importances for models that support it
feature_importances = []

for name, model in models.items():
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        feature_importances.append({"Model": name, "Importances": importances.tolist()})

# Convert to a DataFrame for visualization
feature_importances_df = pd.DataFrame(feature_importances)

# Display results
results_df, feature_importances_df  # These DataFrames contain the performance metrics and feature importances


  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with a default value of 0
  test_encoded[col] = 0  # Add missing columns with

In [4]:
!pip install category_encoders

Collecting category_encoders
  Obtaining dependency information for category_encoders from https://files.pythonhosted.org/packages/7f/e5/79a62e5c9c9ddbfa9ff5222240d408c1eeea4e38741a0dc8343edc7ef1ec/category_encoders-2.6.3-py2.py3-none-any.whl.metadata
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 81.9/81.9 kB 183.6 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3
