# AI - Based Material Recommendation System

## Import Required Libraries

In [13]:
import pandas as pd
import numpy as np
import os
import pickle
import shutil
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, f1_score
)

from sklearn.exceptions import ConvergenceWarning
from urllib.parse import urlparse

import mlflow
from mlflow.models import infer_signature

warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Load & Preprocess Dataset

In [14]:
data = pd.read_csv('data.csv')
data

Unnamed: 0,Environment,Required_Strength,Durability_Priority,Eco_Preference,Application_Area,Max_Lead_Time,Material_Name
0,Coastal,High,9,Yes,Bridge Support,30,Stainless Steel 316
1,Dry,Medium,6,Yes,Interior Wall Frame,10,Plywood
2,Humid,Low,5,Yes,Furniture,7,Teak Wood
3,Dry,High,8,No,Gear Components,20,Carbon Steel
4,Coastal,Medium,7,Yes,Window Frames,15,Aluminum 6061
...,...,...,...,...,...,...,...
1086,Dry,Medium,5,Yes,Solar Structures,1,Polycarbonate
1087,Coastal,Low,4,No,Marine Design,1,Aluminum 6061
1088,Humid,Medium,6,Yes,Water Network,6,HDPE
1089,Dry,High,8,No,Metal Fabrication,19,Carbon Steel


In [15]:
len(data)

1091

In [16]:
## Check for missing values
data.isnull().sum()

Environment            0
Required_Strength      0
Durability_Priority    0
Eco_Preference         0
Application_Area       0
Max_Lead_Time          0
Material_Name          0
dtype: int64

In [17]:
## data types
data.dtypes

Environment            object
Required_Strength      object
Durability_Priority     int64
Eco_Preference         object
Application_Area       object
Max_Lead_Time           int64
Material_Name          object
dtype: object

## Define Dependent and Independent Variables

In [18]:
X = data.drop(columns=['Material_Name', 'Application_Area'], axis=1)
y = data['Material_Name']

## Encode categorical features

In [19]:
# Initialize encoder
le = LabelEncoder()

In [20]:
# Fit encoders on respective columns
X["Environment"] = le.fit_transform(X["Environment"])
X["Required_Strength"] = le.fit_transform(X["Required_Strength"])
X["Eco_Preference"] = le.fit_transform(X["Eco_Preference"])
y = le.fit_transform(y) 

## Feature Scaling

In [21]:
scaler = StandardScaler()
columns_to_scale = ['Durability_Priority', 'Max_Lead_Time']
for col in columns_to_scale:
    X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
X

Unnamed: 0,Environment,Required_Strength,Durability_Priority,Eco_Preference,Max_Lead_Time
0,0,0,1.165124,1,0.834173
1,1,2,-0.092196,1,-0.919865
2,2,1,-0.511302,1,-1.182971
3,1,0,0.746017,0,-0.042846
4,0,2,0.326911,1,-0.481356
...,...,...,...,...,...
1086,1,2,-0.511302,1,-1.709182
1087,0,1,-0.930409,0,-1.709182
1088,2,2,-0.092196,1,-1.270673
1089,1,0,0.746017,0,-0.130548


## Save label encoders and scaler

In [41]:
pickle.dump(scaler, open("pickle_files/scaler.pkl", "wb"))
pickle.dump(le, open("pickle_files/encoder.pkl", "wb"))

## Split dataset

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Generate MLflow model signature

In [24]:
signature = infer_signature(X_train, y_train)



## Define Models and Hyperparameters

In [25]:
models = {
    "LogisticRegression": (LogisticRegression(), {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],  
        'penalty': ['l1', 'l2'],
        'max_iter': [100, 200]
    }),
    "SVC": (SVC(probability=True), {
        'C': [0.1, 1],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3]
    }),
    "KNeighborsClassifier": (KNeighborsClassifier(), {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]
    }),
    "GaussianNB": (GaussianNB(), {
        'var_smoothing': [1e-9, 1e-8]
    }),
    "DecisionTreeClassifier": (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2', None]
    }),
    "RandomForestClassifier": (RandomForestClassifier(), {
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2']
    })
}

## Train and Log Models to MLflow

In [26]:
# MLflow URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [None]:
# Loop through each model
for model_name, (model, param_grid) in models.items():
    try:
        with mlflow.start_run(run_name=model_name):
            print(f"\n--- Running {model_name} ---")

            # Grid Search
            grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            for param, value in grid_search.best_params_.items():
                mlflow.log_param(param, value)

            # Predict
            y_pred = best_model.predict(X_test)

            # Metrics
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            cm = confusion_matrix(y_test, y_pred)
            report = classification_report(y_test, y_pred)

            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_text(str(cm), "confusion_matrix.txt")
            mlflow.log_text(report, "classification_report.txt")

            # Log model
            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(best_model, "model", registered_model_name=f"{model_name}_Model", signature=signature)
            else:
                mlflow.sklearn.log_model(best_model, "model", signature=signature)

            print(f"{model_name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

    except Exception as e:
        print(f"Error running {model_name}: {e}")


--- Running LogisticRegression ---
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'LogisticRegression_Model'.
2025/05/02 20:34:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_Model, version 1
Created version '1' of model 'LogisticRegression_Model'.


LogisticRegression - Accuracy: 0.3516, F1 Score: 0.2873
🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/0/runs/f365938f618b4dd59995d78c14c87b46
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0

--- Running SVC ---
Fitting 3 folds for each of 16 candidates, totalling 48 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'SVC_Model'.
2025/05/02 20:34:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVC_Model, version 1
Created version '1' of model 'SVC_Model'.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVC - Accuracy: 0.5251, F1 Score: 0.4965
🏃 View run SVC at: http://127.0.0.1:5000/#/experiments/0/runs/5f8285bb9133490bbd82e7579d0bae8a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0

--- Running KNeighborsClassifier ---
Fitting 3 folds for each of 16 candidates, totalling 48 fits


Successfully registered model 'KNeighborsClassifier_Model'.
2025/05/02 20:34:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNeighborsClassifier_Model, version 1
Created version '1' of model 'KNeighborsClassifier_Model'.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNeighborsClassifier - Accuracy: 0.8128, F1 Score: 0.8097
🏃 View run KNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/0/runs/ec6d70a9ffbf41c0b5345172237c1ae3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0

--- Running GaussianNB ---
Fitting 3 folds for each of 2 candidates, totalling 6 fits


Successfully registered model 'GaussianNB_Model'.
2025/05/02 20:34:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GaussianNB_Model, version 1
Created version '1' of model 'GaussianNB_Model'.


GaussianNB - Accuracy: 0.2968, F1 Score: 0.2617
🏃 View run GaussianNB at: http://127.0.0.1:5000/#/experiments/0/runs/19e319c234fe43a4a65900e4b56403e1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0

--- Running DecisionTreeClassifier ---
Fitting 3 folds for each of 72 candidates, totalling 216 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'DecisionTreeClassifier_Model'.
2025/05/02 20:34:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeClassifier_Model, version 1
Created version '1' of model 'DecisionTreeClassifier_Model'.


DecisionTreeClassifier - Accuracy: 0.7900, F1 Score: 0.7868
🏃 View run DecisionTreeClassifier at: http://127.0.0.1:5000/#/experiments/0/runs/918027ddb25b46b69a09840035a0522a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0

--- Running RandomForestClassifier ---
Fitting 3 folds for each of 96 candidates, totalling 288 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'RandomForestClassifier_Model'.
2025/05/02 20:34:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier_Model, version 1


RandomForestClassifier - Accuracy: 0.8402, F1 Score: 0.8378
🏃 View run RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/0/runs/2d5104349c70436a8bf13b256e84be8f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Created version '1' of model 'RandomForestClassifier_Model'.


## Make Prediction on a New Sample

In [27]:
input_data = pd.DataFrame([{
    "Environment": 2,
    "Required_Strength": 0,
    "Durability_Priority": 0.326911,
    "Eco_Preference": 0,
    "Max_Lead_Time": -0.393654
}])

In [28]:
input_data

Unnamed: 0,Environment,Required_Strength,Durability_Priority,Eco_Preference,Max_Lead_Time
0,2,0,0.326911,0,-0.393654


In [29]:
# Load from specific run
model = mlflow.pyfunc.load_model('runs:/2d5104349c70436a8bf13b256e84be8f/model')

# Predict using the loaded model
predictions = model.predict(input_data)

print(predictions)

[5]


## Move Trained Model (model.pkl) to `pickle_files` Folder

In [30]:
run_id = "2d5104349c70436a8bf13b256e84be8f"  # Update with your actual run ID
experiment_id = "0"  # Optional, depending on your use case

# MLflow artifact directory (get model artifact path from run ID)
artifact_path = f"mlartifacts/{experiment_id}/{run_id}/artifacts/model/model.pkl"  # Update with the correct model artifact location

# Define the destination directory
pickle_files_dir = "pickle_files"  # Update if necessary
os.makedirs(pickle_files_dir, exist_ok=True)  # Ensure the directory exists

# Define the destination path for the model
destination_path = os.path.join(pickle_files_dir, "model.pkl")

# Copy the model file from the artifact path to the pickle_files directory
shutil.copy(artifact_path, destination_path)

print(f"Model successfully copied to {destination_path}")


Model successfully copied to pickle_files\model.pkl


## Load Model from Pickle and Predict Again

In [42]:
# Load model and preprocessors
model = pickle.load(open("pickle_files/model.pkl", "rb"))
scaler = pickle.load(open("pickle_files/scaler.pkl", "rb"))
encoder = pickle.load(open("pickle_files/encoder.pkl", "rb"))

In [43]:
# Sample input
raw_input = {
    "Environment": 'Coastal',
    "Required_Strength": 'Low',
    "Durability_Priority": 9,
    "Eco_Preference": 'Yes',
    "Max_Lead_Time": 30
}

environment_map = {'Coastal':0, 'Dry':1, 'Humid':2}
eco_preference_map = {'Yes':1, 'No':0}
required_strength_map = {'Low':0, 'Medium':1, 'High':2}
input_df = pd.DataFrame([raw_input])

input_df['Environment'] = input_df['Environment'].map(environment_map)
input_df['Required_Strength'] = input_df['Required_Strength'].map(required_strength_map)
input_df['Eco_Preference'] = input_df['Eco_Preference'].map(eco_preference_map)
input_df['Durability_Priority'] = scaler.transform(input_df['Durability_Priority'].values.reshape(-1, 1))
input_df['Max_Lead_Time'] = scaler.transform(input_df['Max_Lead_Time'].values.reshape(-1, 1))

In [44]:
predicted_class = model.predict(input_df)

material_name = encoder.inverse_transform(predicted_class)

print("Recommended Material:", material_name[0])

Recommended Material: Plywood
