In [68]:
# Import libraries
#
# The imports are intentionally NOT wrapped in try/except: every later cell
# depends on these names, so a missing package must stop the notebook here
# with a clear ImportError instead of being printed and resurfacing later as
# an unrelated NameError.

# Standard library
import glob

# Data handling
import pandas as pd

# Integer distribution used to sample hyperparameters in RandomizedSearchCV
from scipy.stats import randint

# Feature encoding, data splitting / hyperparameter search, model, and metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

In [69]:
# Locate the dataset: the Datasets directory is expected to hold exactly one CSV
data_path = '../Datasets/*.csv'
file_list = glob.glob(data_path)

# Report every match so an unexpected extra file is easy to spot
for file in file_list:
    print(f"Found file: {file}")

# Guard clause: zero matches or several matches are both treated as an error
if len(file_list) != 1:
    raise FileNotFoundError("No CSV file found or multiple CSV files found in the Datasets directory.")

# Exactly one match: read it into the working DataFrame
df = pd.read_csv(file_list[0])
print(f"Loaded dataset: {file_list[0]}")

Found file: ../Datasets/Dataset.csv
Loaded dataset: ../Datasets/Dataset.csv


In [70]:
# Categorical columns that are expanded into one-hot indicator columns
categorical_cols_unified = ['partType', 'microstructure', 'seedLocation', 'castType']

# Work on a copy so the loaded DataFrame `df` is left untouched
df_onehot_encoded = df.copy()
encoder = "One-Hot Encoding"  # label reused in the headers printed by later cells

# Encoder with default settings; its result is converted to a dense array below
one_hot_encoder = OneHotEncoder()

# Fit and transform the specified categorical columns
encoded_array = one_hot_encoder.fit_transform(df_onehot_encoded[categorical_cols_unified])

# Build the encoded frame on the SAME index as the source rows.
# pd.concat(axis=1) aligns on index labels, so leaving the default RangeIndex
# here would misalign rows (and introduce NaNs) whenever `df` carries a
# non-default index, e.g. after filtering or loading with an index column.
encoded_df = pd.DataFrame(
    encoded_array.toarray(),
    columns=one_hot_encoder.get_feature_names_out(categorical_cols_unified),
    index=df_onehot_encoded.index,
)

# Replace the original categorical columns with their one-hot encoded counterparts
df_onehot_encoded = pd.concat(
    [df_onehot_encoded.drop(columns=categorical_cols_unified), encoded_df],
    axis=1,
)

# Display the first few rows to verify
display(df_onehot_encoded.head())

Unnamed: 0,Lifespan,coolingRate,quenchTime,forgeTime,HeatTreatTime,Nickel%,Iron%,Cobalt%,Chromium%,smallDefects,...,partType_Nozzle,partType_Valve,microstructure_colGrain,microstructure_equiGrain,microstructure_singleGrain,seedLocation_Bottom,seedLocation_Top,castType_Continuous,castType_Die,castType_Investment
0,1469.17,13,3.84,6.47,46.87,65.73,16.52,16.82,0.93,10,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1793.64,19,2.62,3.48,44.7,54.22,35.38,6.14,4.26,19,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,700.6,28,0.76,1.34,9.54,51.83,35.95,8.81,3.41,35,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1082.1,9,2.01,2.19,20.29,57.03,23.33,16.86,2.78,0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1838.83,16,4.13,3.87,16.13,59.62,27.37,11.45,1.56,10,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [71]:
# Sanity check: (rows, columns) of the frame after one-hot encoding
df_onehot_encoded.shape

(1000, 24)

In [72]:
# Separate the regression target from the predictors
y = df_onehot_encoded['Lifespan']                 # Target
X = df_onehot_encoded.drop(columns=['Lifespan'])  # Features: every remaining column

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes as a quick check of the split
print(f'--- {encoder} Shape ---\n')
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

--- One-Hot Encoding Shape ---

X_train shape: (800, 23)
X_test shape: (200, 23)
y_train shape: (800,)
y_test shape: (200,)


In [73]:
# Exhaustive search space for the Random Forest:
# 3 * 4 * 3 * 3 * 3 = 324 candidate combinations
param_grid = dict(
    n_estimators=[50, 100, 200],          # number of trees
    max_depth=[None, 10, 20, 30],         # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5, 10],         # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],           # minimum samples required at a leaf
    max_features=['sqrt', 'log2', None],  # features considered per split
)

# Base estimator; the fixed seed makes each candidate fit reproducible
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# 3-fold cross-validated grid search scored by negative MSE (higher is better).
# NOTE(review): both the forest and the search request n_jobs=-1; consider
# parallelising at only one level if CPU oversubscription is observed.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Keep the refitted best model and report its hyperparameters
best_rf_model = grid_search.best_estimator_
print("Best parameters found by GridSearchCV:", grid_search.best_params_)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_

In [74]:
# Sampling space for the randomized search.
# scipy's randint(low, high) and Python's range() both EXCLUDE the upper bound.
param_distributions = dict(
    n_estimators=randint(100, 500),        # integers 100..499
    max_depth=[None, *range(10, 50, 5)],   # None or 10, 15, ..., 45
    min_samples_split=randint(2, 20),      # integers 2..19
    min_samples_leaf=randint(1, 10),       # integers 1..9
    max_features=['sqrt', 'log2', None],   # features considered per split
)

# Base estimator; the fixed seed makes each candidate fit reproducible
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Draw 100 random configurations, each evaluated with 3-fold CV on negative MSE
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Keep the refitted best model and report its hyperparameters
best_rf_model = random_search.best_estimator_
print("Best parameters found by RandomizedSearchCV:", random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=35, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   0.2s
[CV] END max_depth=35, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   0.3s
[CV] END max_depth=35, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   0.3s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=5, min_samples_split=2, n_estimators=413; total time=   0.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=9, min_samples_split=18, n_estimators=158; total time=   0.4s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=9, min_samples_split=18, n_estimators=158; total time=   0.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=5, n_estimators=459; total time=   0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=5, n_estimators

In [75]:
# Reference only (disabled): a tuned configuration that was tried previously
#   RandomForestRegressor(n_estimators=387, max_depth=15, random_state=42)

# Baseline forest: 100 trees, fixed seed, all cores; other settings left at defaults
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train on the training split
rf_model.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = rf_model.predict(X_test)

# Score the predictions with four metrics: RMSE, R², MAE and MSLE
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)

# Print the report, each value rounded to two decimals
print(f'--- {encoder} Performance ---\n')
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R\u00b2 Score: {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Log Error (MSLE): {msle:.2f}")

--- One-Hot Encoding Performance ---

Root Mean Squared Error (RMSE): 85.15
R² Score: 0.93
Mean Absolute Error (MAE): 67.46
Mean Squared Log Error (MSLE): 0.01
