In [100]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [101]:
df = pd.read_csv('final_cleaned.csv')
df.drop(columns=["Unnamed: 0"],inplace=True)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7238 entries, 0 to 7237
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          7238 non-null   object
 1   fuel_type     7238 non-null   object
 2   body_type     7235 non-null   object
 3   km_driven     7238 non-null   int64 
 4   transmission  7238 non-null   object
 5   ownerno       7238 non-null   int64 
 6   model         7238 non-null   object
 7   modelyear     7238 non-null   int64 
 8   color         7238 non-null   object
 9   price         7238 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 565.6+ KB


In [103]:
# Fit and store label encoders for categorical columns
categorical_cols = df.select_dtypes(include=[object]).columns
for column in categorical_cols:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

In [104]:
df.head()

Unnamed: 0,city,fuel_type,body_type,km_driven,transmission,ownerno,model,modelyear,color,price
0,0,4,2,120000,1,3,123,2015,127,400000
1,0,4,6,32706,1,2,39,2018,127,811000
2,0,4,2,11949,1,1,203,2018,102,585000
3,0,4,7,17794,1,1,72,2014,81,462000
4,0,1,6,60000,1,1,140,2015,53,790000


In [105]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state=42,test_size=0.2)

GradientBoostingRegressor

In [106]:

# Initialize and train the Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=300, random_state=42,learning_rate = 0.2, max_depth = 5, min_samples_leaf = 4, min_samples_split = 2)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

def evaluation():
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Mean Absolute Error:", mae)
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)


GradientBoostingRegressor (GridSearchCV)

In [107]:
# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)

# Set up GridSearchCV with K-Fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit the GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_


# Make predictions on the test data
y_pred = best_model.predict(X_test)

evaluation()

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mean Absolute Error: 67609.73128656247
Mean Squared Error: 10216574039.250807
R-squared: 0.9209265221405626


RandomForestRegressor

In [108]:
from sklearn.ensemble import RandomForestRegressor
# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

evaluation()

Mean Absolute Error: 78265.53855886609
Mean Squared Error: 14277226343.118622
R-squared: 0.8894981882576799


RandomForestRegressor (GridSearchCV)

In [109]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

# Initialize the model
rf_model = RandomForestRegressor(random_state=42)

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit Grid Search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

# Make predictions with the best model
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

evaluation()

Fitting 5 folds for each of 48 candidates, totalling 240 fits


120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\New\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\New\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\New\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\New\AppData\Local\Programs\Python\Python312\Lib\site-p

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Absolute Error: 92021.66569930609
Mean Squared Error: 18950199720.62303
R-squared: 0.8533306573922225


# from sklearn.preprocessing import LabelEncoder
# import pandas as pd
# df.rename(columns= {'ft':"fuel_type",'bt':"body_type",'km':"km_driven"},inplace=True)

# # Step 1: Encode categorical columns
# categorical_cols = df.select_dtypes(include=[object]).columns
# encoders = {}

# for column in categorical_cols:
#     le = LabelEncoder()
#     df[column] = le.fit_transform(df[column])
#     encoders[column] = le  # Store encoder

# # Step 2: Detect and remove outliers
# numeric_cols = df.select_dtypes(include=[int, float]).columns
# mask = pd.Series([True] * len(df))

# for column in numeric_cols:
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
    
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     column_mask = (df[column] >= lower_bound) & (df[column] <= upper_bound)
#     mask = mask & column_mask

# df_cleaned = df[mask].reset_index(drop=True)

# # Step 3: Decode categorical columns
# for column in categorical_cols:
#     le = encoders[column]
    
#     # Check if any encoded values are missing after outlier removal
#     encoded_values_in_cleaned = df_cleaned[column].values
    
#     # Decode only the available encoded values
#     df_cleaned[column] = le.inverse_transform(encoded_values_in_cleaned)

# df_cleaned.to_csv("final_cleaned.csv")