# AI - Based Material Recommendation System

## Import Required Libraries

In [None]:
import pandas as pd
import os
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, f1_score
)

## Load & Preprocess Dataset

In [2]:
data = pd.read_csv('data.csv')
data

Unnamed: 0,Material_Name,Cost_per_Unit ($),Load_Bearing_Capacity (tons),UV_Resistance (1-10),Durability (years),Maintenance_Frequency (years),Corrosion_Resistance (1-10)
0,Reinforced Concrete,137,42,5,58,11,9
1,Reinforced Concrete,133,57,6,57,8,7
2,Reinforced Concrete,144,44,6,60,8,8
3,Reinforced Concrete,139,54,6,59,11,8
4,Reinforced Concrete,147,44,5,59,9,8
...,...,...,...,...,...,...,...
995,Magnesium Oxide Board,101,13,6,58,15,9
996,Magnesium Oxide Board,104,15,5,49,15,10
997,Magnesium Oxide Board,102,12,5,58,13,9
998,Magnesium Oxide Board,104,15,5,52,18,10


In [3]:
len(data)

1000

In [4]:
## Check for missing values
data.isnull().sum()

Material_Name                    0
Cost_per_Unit ($)                0
Load_Bearing_Capacity (tons)     0
UV_Resistance (1-10)             0
Durability (years)               0
Maintenance_Frequency (years)    0
Corrosion_Resistance (1-10)      0
dtype: int64

In [5]:
## data types
data.dtypes

Material_Name                    object
Cost_per_Unit ($)                 int64
Load_Bearing_Capacity (tons)      int64
UV_Resistance (1-10)              int64
Durability (years)                int64
Maintenance_Frequency (years)     int64
Corrosion_Resistance (1-10)       int64
dtype: object

## Define Dependent and Independent Variables

In [6]:
X = data.drop(columns=['Material_Name'], axis=1)
y = data['Material_Name']

## Encode categorical features

In [7]:
# Initialize encoder
le = LabelEncoder()

In [8]:
# Fit encoders on respective columns
y = le.fit_transform(y) 

## Feature Scaling

In [12]:
scaler = StandardScaler()
columns_to_scale = ['Cost_per_Unit ($)', 'Load_Bearing_Capacity (tons)', 'UV_Resistance (1-10)', 'Durability (years)', 'Maintenance_Frequency (years)', 'Corrosion_Resistance (1-10)']
for col in columns_to_scale:
    X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
X

Unnamed: 0,Cost_per_Unit ($),Load_Bearing_Capacity (tons),UV_Resistance (1-10),Durability (years),Maintenance_Frequency (years),Corrosion_Resistance (1-10)
0,-0.174566,0.072490,-0.872483,-0.032794,-0.546736,0.475450
1,-0.223037,0.575661,-0.268689,-0.074729,-0.919680,-0.790728
2,-0.089743,0.139580,-0.268689,0.051078,-0.919680,-0.157639
3,-0.150331,0.475027,-0.268689,0.009142,-0.546736,-0.157639
4,-0.053390,0.139580,-0.872483,0.009142,-0.795365,-0.157639
...,...,...,...,...,...,...
995,-0.610800,-0.900308,-0.268689,-0.032794,-0.049477,0.475450
996,-0.574447,-0.833218,-0.872483,-0.410214,-0.049477,1.108539
997,-0.598682,-0.933852,-0.872483,-0.032794,-0.298107,0.475450
998,-0.574447,-0.833218,-0.872483,-0.284407,0.323467,1.108539


## Save label encoders and scaler

In [14]:
pickle.dump(scaler, open("pickle_files/scaler.pkl", "wb"))
pickle.dump(le, open("pickle_files/encoder.pkl", "wb"))

## Split dataset

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Define Models and Hyperparameters

In [16]:
models = {
    "LogisticRegression": (LogisticRegression(), {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],  
        'penalty': ['l1', 'l2'],
        'max_iter': [100, 200]
    }),
    "SVC": (SVC(probability=True), {
        'C': [0.1, 1],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3]
    }),
    "KNeighborsClassifier": (KNeighborsClassifier(), {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]
    }),
    "GaussianNB": (GaussianNB(), {
        'var_smoothing': [1e-9, 1e-8]
    }),
    "DecisionTreeClassifier": (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2', None]
    }),
    "RandomForestClassifier": (RandomForestClassifier(), {
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2']
    })
}

## Perform HyperParameter Tuning

In [23]:
# Loop through each model
for model_name, (model, param_grid) in models.items():
    try:
        print(f"\n--- Running {model_name} ---")

        # Grid Search
        grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Predict
        y_pred = best_model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f"{model_name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
        for param, value in grid_search.best_params_.items():
                print(param, value)

    except Exception as e:
        print(f"Error running {model_name}: {e}")


--- Running LogisticRegression ---
Fitting 3 folds for each of 32 candidates, totalling 96 fits




LogisticRegression - Accuracy: 0.9500, F1 Score: 0.9496
C 10
max_iter 200
penalty l1
solver saga

--- Running SVC ---
Fitting 3 folds for each of 16 candidates, totalling 48 fits
SVC - Accuracy: 0.9750, F1 Score: 0.9751
C 1
degree 2
gamma scale
kernel linear

--- Running KNeighborsClassifier ---
Fitting 3 folds for each of 16 candidates, totalling 48 fits
KNeighborsClassifier - Accuracy: 0.9300, F1 Score: 0.9295
metric manhattan
n_neighbors 5
p 1
weights distance

--- Running GaussianNB ---
Fitting 3 folds for each of 2 candidates, totalling 6 fits
GaussianNB - Accuracy: 0.9900, F1 Score: 0.9900
var_smoothing 1e-09

--- Running DecisionTreeClassifier ---
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DecisionTreeClassifier - Accuracy: 0.9300, F1 Score: 0.9299
criterion entropy
max_depth None
max_features None
min_samples_leaf 1
min_samples_split 2

--- Running RandomForestClassifier ---
Fitting 3 folds for each of 96 candidates, totalling 288 fits
RandomForestClassifier 

## Make Prediction on a New Sample

In [20]:
input_data = pd.DataFrame([{
    "Cost_per_Unit ($)": -0.174566,
    "Load_Bearing_Capacity (tons)": 0.072490,
    "UV_Resistance (1-10)": -0.872483	,
    "Durability (years)": -0.032794	,
    "Maintenance_Frequency (years)": -0.546736,
    "Corrosion_Resistance (1-10)": 0.475450
}])


In [21]:
input_data

Unnamed: 0,Cost_per_Unit ($),Load_Bearing_Capacity (tons),UV_Resistance (1-10),Durability (years),Maintenance_Frequency (years),Corrosion_Resistance (1-10)
0,-0.174566,0.07249,-0.872483,-0.032794,-0.546736,0.47545


### Based on the model training and the accuracies achieved, although Gaussian Naive Bayes shows a high accuracy of 99%, it appears to suffer from overfitting. Therefore, we consider Random Forest, which has the next best accuracy of 98%, as the more reliable choice.

In [25]:
final_model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='log2'
)

final_model.fit(X_train, y_train)

In [26]:
# Predict using the loaded model
predictions = final_model.predict(input_data)
print(predictions)

[15]


## Pickled the Final Model and Move Trained Model (model.pkl) to `pickle_files` Folder

In [28]:
# Define folder and file path
folder_path = "pickle_files"
model_path = os.path.join(folder_path, "model.pkl")

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the model to the specified path
with open(model_path, "wb") as file:
    pickle.dump(final_model, file)

print(f"Model saved to: {model_path}")

Model saved to: pickle_files\model.pkl


## Load Model from Pickle and Predict Again

In [29]:
# Load model and preprocessors
model = pickle.load(open("pickle_files/model.pkl", "rb"))
scaler = pickle.load(open("pickle_files/scaler.pkl", "rb"))
encoder = pickle.load(open("pickle_files/encoder.pkl", "rb"))

In [30]:
# Sample input
raw_input = {
    "Cost_per_Unit ($)": 130,
    "Load_Bearing_Capacity (tons)": 50,
    "UV_Resistance (1-10)": 6,
    "Durability (years)": 55,
    "Maintenance_Frequency (years)": 10,
    "Corrosion_Resistance (1-10)": 8
}
input_df = pd.DataFrame([raw_input])

input_df['Cost_per_Unit ($)'] = scaler.transform(input_df['Cost_per_Unit ($)'].values.reshape(-1, 1))
input_df['Load_Bearing_Capacity (tons)'] = scaler.transform(input_df['Load_Bearing_Capacity (tons)'].values.reshape(-1, 1))
input_df['UV_Resistance (1-10)'] = scaler.transform(input_df['UV_Resistance (1-10)'].values.reshape(-1, 1))
input_df['Durability (years)'] = scaler.transform(input_df['Durability (years)'].values.reshape(-1, 1))
input_df['Maintenance_Frequency (years)'] = scaler.transform(input_df['Maintenance_Frequency (years)'].values.reshape(-1, 1))
input_df['Corrosion_Resistance (1-10)'] = scaler.transform(input_df['Corrosion_Resistance (1-10)'].values.reshape(-1, 1))

In [31]:
predicted_class = model.predict(input_df)

material_name = encoder.inverse_transform(predicted_class)

print("Recommended Material:", material_name[0])

Recommended Material: Carbon Fiber Composite
