In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns

In [2]:
df = pd.read_csv("Data/construction_cost_prediction_dataset.csv")

In [3]:
# Step 1: Data Preprocessing
# Check for missing values
if df.isnull().sum().any():
    print("Missing values detected. Handling missing values...")
    df.fillna(df.mean(), inplace=True)

In [4]:
# Encode categorical variables
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [5]:
df.columns

Index(['Project_ID', 'Project_Type', 'Total_Area_SqFt', 'Number_of_Floors',
       'Material_Cost_per_SqFt', 'Labor_Cost_per_SqFt',
       'Project_Duration_Months', 'Location_Type', 'Transportation_Cost',
       'Inflation_Rate', 'Complexity', 'Total_Construction_Cost'],
      dtype='object')

In [6]:
# Separate features (X) and target (y)
X = df.drop(columns=['Total_Construction_Cost','Project_ID']) # Features
y = df['Total_Construction_Cost']  # Target variable

In [7]:
# Step 2: Identify Numerical and Categorical Columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [8]:
# Step 3: Create a Preprocessor
# Standardize numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

In [10]:
# Step 4: Create the Pipeline
# Pipeline for preprocessing and model training
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Regressor', RandomForestRegressor(random_state=42, n_estimators=100))
])

In [19]:
# Step 5: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Step 6: Train the Model
pipeline.fit(X_train, y_train)

In [22]:
# Step 7: Make Predictions
y_pred = pipeline.predict(X_test)

In [23]:
from sklearn import metrics
import numpy as np
def print_evaluate(true,predicted):
    mae = metrics.mean_absolute_error(true,predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true,predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('\nMAE:', mae)
    print('\nMSE:', mse)
    print('\nRMSE:', rmse)
    print('\nR2 Square', r2_square)

In [24]:
print("Error/ Accuracy Analysis :- ")
print_evaluate(y_test,pipeline.predict(X_test))

Error/ Accuracy Analysis :- 

MAE: 167622.7615201

MSE: 54277838946.76342

RMSE: 232976.04801087047

R2 Square 0.8931443673736038


In [25]:
# Step 9: Save the Pipeline (Optional)
import joblib
joblib.dump(pipeline, 'cost_predicton_pipeline.pkl')

['cost_predicton_pipeline.pkl']

In [42]:
import pandas as pd

# Load the .pkl file into a DataFrame
df12 = pd.read_pickle('cost_predicton_pipeline.pkl')

# # Get the column names
# column_names = df12.columns

# # Print the column names
# print(column_names)


In [44]:
df12

array(['Total_Area_SqFt', 'Number_of_Floors', 'Material_Cost_per_SqFt',
       'Labor_Cost_per_SqFt', 'Project_Duration_Months',
       'Transportation_Cost', 'Inflation_Rate'], dtype=object)