In [9]:
!pip install pandas numpy scikit-learn matplotlib seaborn

!pip install lightgbm




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np 


# Baseline: random-forest regression on the raw price.
#
# Build the modelling frame in a single chain: drop the four unused columns,
# keep only rows that have a price, and replace model_year with an age
# measured relative to 2024.
data = (
    pd.read_csv("train1.csv")
    .drop(columns=['id', 'brand', 'model', 'engine'])
    .dropna(subset=['price'])
    .assign(vehicle_age=lambda frame: 2024 - frame['model_year'])
    .drop(columns=['model_year'])
)

# Target and feature matrix.
y = data['price']
X = data.drop(columns=['price'])

# Object columns are one-hot encoded only when they have at most 3 distinct
# values; object columns above that threshold are not listed in any
# transformer below.
categorical_cols = [
    name
    for name, column in X.select_dtypes(include=['object']).items()
    if column.nunique() <= 3
]
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric features: mean imputation, then scaling to the [0, 1] range.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical features: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred = model.fit(X_train, y_train).predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")


Mean Squared Error: 5664227915.19
Root Mean Squared Error: 75261.07


In [11]:
!pip install xgboost




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Compare three regressors on log1p(price) and report the best one on the
# original price scale.

# clone() gives every pipeline its own unfitted copy of the preprocessor, so the
# fitted pipelines kept below do not share (and overwrite) one transformer object.
from sklearn.base import clone

# Load the raw table. The distinct brand names are captured first because the
# 'brand' column is dropped below; they are printed at the end of this cell.
data = pd.read_csv("train1.csv")
a = data['brand'].unique()

# Drop unused columns and rows without a target, model log1p(price) instead of
# the raw price, and replace model_year with an age relative to 2024.
data = (
    data
    .drop(columns=['id', 'brand', 'model', 'engine'])
    .dropna(subset=['price'])
    .assign(
        price=lambda frame: np.log1p(frame['price']),
        vehicle_age=lambda frame: 2024 - frame['model_year'],
    )
    .drop(columns=['model_year'])
)

# Keep only rows whose log-price lies within 1.5 * IQR of the quartiles.
# NOTE(review): this filter runs before the train/test split, so the held-out
# rows are filtered as well.
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound) & (data['price'] <= upper_bound)]

X = data.drop(columns=['price'])
y = data['price']

# Object columns are one-hot encoded only when they have at most 3 distinct
# values; other object columns are not listed in any transformer.
categorical_cols = [col for col in X.select_dtypes(include=['object']).columns if X[col].nunique() <= 3]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric features: mean imputation followed by scaling to [0, 1].
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Categorical features: mode imputation followed by one-hot encoding that
# ignores categories not seen during fit.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
}

# Fit every candidate once. The fitted pipelines are kept so the winner can be
# reused below instead of being trained a second time.
results = {}
fitted_pipelines = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'RMSE': rmse, 'R2': r2}
    fitted_pipelines[name] = pipeline

# NOTE(review): the best model is picked by its R2 on X_test, the same rows the
# final metrics below are computed on.
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model_results = results[best_model_name]

print("Model Evaluation Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R2: {metrics['R2']:.2f}")

print(f"\nBest Model: {best_model_name}")
print(f"  Best R2 Score: {best_model_results['R2']:.2f}")

# Reuse the pipeline that was already fitted for the winning model.
best_model_pipeline = fitted_pipelines[best_model_name]

# Undo the log1p transform so the final metrics are in price units.
y_pred_final = best_model_pipeline.predict(X_test)
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred_final)

final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test_actual, y_pred_actual)

print(f"\nFinal Metrics (on original scale):")
print(f"  MSE: {final_mse:.2f}")
print(f"  RMSE: {final_rmse:.2f}")
print(f"  R2: {final_r2:.2f}")
print(a)

Model Evaluation Results:

Random Forest:
  MSE: 0.26
  RMSE: 0.51
  R2: 0.58

Gradient Boosting:
  MSE: 0.23
  RMSE: 0.48
  R2: 0.62

XGBoost:
  MSE: 0.23
  RMSE: 0.48
  R2: 0.62

Best Model: XGBoost
  Best R2 Score: 0.62

Final Metrics (on original scale):
  MSE: 683185381.20
  RMSE: 26137.82
  R2: 0.37
['MINI' 'Lincoln' 'Chevrolet' 'Genesis' 'Mercedes-Benz' 'Audi' 'Ford'
 'BMW' 'Tesla' 'Cadillac' 'Land' 'GMC' 'Toyota' 'Hyundai' 'Volvo'
 'Volkswagen' 'Buick' 'Rivian' 'RAM' 'Hummer' 'Alfa' 'INFINITI' 'Jeep'
 'Porsche' 'McLaren' 'Honda' 'Lexus' 'Dodge' 'Nissan' 'Jaguar' 'Acura'
 'Kia' 'Mitsubishi' 'Rolls-Royce' 'Maserati' 'Pontiac' 'Saturn' 'Bentley'
 'Mazda' 'Subaru' 'Ferrari' 'Aston' 'Lamborghini' 'Chrysler' 'Lucid'
 'Lotus' 'Scion' 'smart' 'Karma' 'Plymouth' 'Suzuki' 'FIAT' 'Saab'
 'Bugatti' 'Mercury' 'Polestar' 'Maybach']


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


# XGBoost on log1p(price), tuned with a 3-fold grid search.
#
# Prepare the frame in one chain: drop unused columns and rows without a price,
# log-transform the target, and replace model_year with an age relative to 2024.
data = (
    pd.read_csv("train1.csv")
    .drop(columns=['id', 'brand', 'model', 'engine'])
    .dropna(subset=['price'])
    .assign(
        price=lambda frame: np.log1p(frame['price']),
        vehicle_age=lambda frame: 2024 - frame['model_year'],
    )
    .drop(columns=['model_year'])
)

# IQR rule on the log-price: keep rows within 1.5 * IQR of the quartiles
# (both bounds inclusive).
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[data['price'].between(lower_bound, upper_bound)]

y = data['price']
X = data.drop(columns=['price'])

# Object columns with at most 3 distinct values are one-hot encoded; the
# numeric selection covers int64 and float64 columns.
categorical_cols = [
    name
    for name, column in X.select_dtypes(include=['object']).items()
    if column.nunique() <= 3
]
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric branch: mean imputation, then scaling to [0, 1].
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgboost = XGBRegressor(objective='reg:squarederror', random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgboost),
])

# 3 values for each of 5 hyperparameters -> 243 candidates.
param_grid = {
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__n_estimators': [100, 200, 300],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0],
}

# GridSearchCV.fit returns the fitted search object, so construction and
# fitting are chained.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Score the best estimator found by the search on the held-out split (log scale).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\nFinal Metrics:")
print(f"  MSE: {mse:.2f}")
print(f"  RMSE: {rmse:.2f}")
print(f"  R2: {r2:.2f}")

# Invert log1p so the second set of metrics is in price units.
y_test_actual, y_pred_actual = np.expm1(y_test), np.expm1(y_pred)

final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test_actual, y_pred_actual)

print(f"\nFinal Metrics on Original Scale:")
print(f"  MSE: {final_mse:.2f}")
print(f"  RMSE: {final_rmse:.2f}")
print(f"  R2: {final_r2:.2f}")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 300, 'regressor__subsample': 1.0}

Final Metrics:
  MSE: 0.23
  RMSE: 0.48
  R2: 0.62

Final Metrics on Original Scale:
  MSE: 679981016.91
  RMSE: 26076.45
  R2: 0.38


In [6]:
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score

# Ridge regression, tuned with a 3-fold grid search.
#
# This cell relies on names left in the kernel by an earlier cell: preprocessor,
# X_train, X_test, y_train, y_test, and the imports Pipeline, GridSearchCV, np,
# mean_squared_error and r2_score. Run that cell first.

# Define the Ridge Regression model
ridge = Ridge(random_state=42)

# Update the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge)
])

# Define hyperparameter grid for Ridge Regression (6 alphas x 6 solvers = 36 candidates)
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 100, 1000],
    'regressor__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag', 'saga']
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Evaluate the best estimator on the held-out split (target is on the log1p scale here).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

# Invert log1p so the remaining metrics are in price units.
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred)

final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test_actual, y_pred_actual)

# (1 - RMSE / mean actual price) * 100. This is a relative-error score for a
# regressor, not a classification accuracy, so it is printed under its formula
# rather than as "Accuracy". The variable keeps its original name so any later
# code that reads it continues to work.
accuracy = (1 - final_rmse / np.mean(y_test_actual)) * 100

print(f"Final MSE on Original Scale: {final_mse:.2f}")
print(f"Final RMSE on Original Scale: {final_rmse:.2f}")
print(f"Final R2 on Original Scale: {final_r2:.2f}")
print(f"1 - RMSE/mean price: {accuracy:.2f}%")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'regressor__alpha': 1, 'regressor__solver': 'saga'}
MSE: 0.25
RMSE: 0.50
R2: 0.59
Final MSE on Original Scale: 721176741.74
Final RMSE on Original Scale: 26854.73
Final R2 on Original Scale: 0.34
Accuracy: 30.69%


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Read the raw table; the relative path is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer="train1.csv")

def extract_horsepower(engine_string):
    """Return the horsepower figure embedded in an engine description.

    Searches for the first number (integer or decimal) that is followed,
    optionally after whitespace, by the unit "HP". The unit is matched in any
    letter case, so "300HP", "300 hp" and "300 Hp" are all recognised.

    Args:
        engine_string: Engine description. Any value is accepted; it is
            converted with str() before matching, so NaN/None simply do not match.

    Returns:
        The matched number as a float, or np.nan when no "<number>HP" token is found.
    """
    match = re.search(r'(\d+\.?\d*)\s*HP', str(engine_string), flags=re.IGNORECASE)
    if match is None:
        return np.nan
    return float(match.group(1))


# XGBoost on log1p(price) with an extra 'horsepower' feature parsed from the
# engine description, tuned with a 3-fold grid search.
#
# Prepare the frame in one chain:
#   * parse horsepower out of 'engine';
#   * drop rows where horsepower or price is missing;
#   * drop the columns that are not used as features;
#   * log-transform the target and replace model_year with an age relative to 2024.
data = (
    data
    .assign(horsepower=lambda frame: frame['engine'].apply(extract_horsepower))
    .dropna(subset=['horsepower', 'price'])
    .drop(columns=['engine', 'id', 'brand', 'model'])
    .assign(
        price=lambda frame: np.log1p(frame['price']),
        vehicle_age=lambda frame: 2024 - frame['model_year'],
    )
    .drop(columns=['model_year'])
)

# IQR rule on the log-price: keep rows within 1.5 * IQR of the quartiles
# (both bounds inclusive).
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[data['price'].between(lower_bound, upper_bound)]

y = data['price']
X = data.drop(columns=['price'])

# Object columns with at most 3 distinct values are one-hot encoded; the
# numeric selection covers int64 and float64 columns.
categorical_cols = [
    name
    for name, column in X.select_dtypes(include=['object']).items()
    if column.nunique() <= 3
]
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric branch: mean imputation, then scaling to [0, 1].
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgboost = XGBRegressor(objective='reg:squarederror', random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgboost),
])

# 3 values for each of 5 hyperparameters -> 243 candidates.
param_grid = {
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__n_estimators': [100, 200, 300],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0],
}

# GridSearchCV.fit returns the fitted search object, so construction and
# fitting are chained.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Score the best estimator found by the search on the held-out split (log scale).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\nFinal Metrics:")
print(f"  MSE: {mse:.2f}")
print(f"  RMSE: {rmse:.2f}")
print(f"  R2: {r2:.2f}")

# Invert log1p so the second set of metrics is in price units.
y_test_actual, y_pred_actual = np.expm1(y_test), np.expm1(y_pred)

final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test_actual, y_pred_actual)

print(f"\nFinal Metrics on Original Scale:")
print(f"  MSE: {final_mse:.2f}")
print(f"  RMSE: {final_rmse:.2f}")
print(f"  R2: {final_r2:.2f}")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 7, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}

Final Metrics:
  MSE: 0.19
  RMSE: 0.44
  R2: 0.70

Final Metrics on Original Scale:
  MSE: 480242160.66
  RMSE: 21914.43
  R2: 0.51
