In [6]:
!pip install pandas numpy scikit-learn matplotlib seaborn

!pip install lightgbm




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.5.0



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np 


data = pd.read_csv("train1.csv")


data.drop(columns=['id', 'brand', 'model', 'engine'], inplace=True)

data.dropna(subset=['price'], inplace=True)

data['vehicle_age'] = 2024 - data['model_year']
data.drop(columns=['model_year'], inplace=True)


X = data.drop(columns=['price'])
y = data['price']


categorical_cols = [col for col in X.select_dtypes(include=['object']).columns if X[col].nunique() <= 3]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")


Mean Squared Error: 5664227915.19
Root Mean Squared Error: 75261.07


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the dataset
data = pd.read_csv("train1.csv")

# Drop unnecessary columns
data.drop(columns=['id', 'brand', 'model', 'engine'], inplace=True)

# Drop rows where 'price' is NaN
data.dropna(subset=['price'], inplace=True)

# Add a new feature 'vehicle_age'
data['vehicle_age'] = 2024 - data['model_year']
data.drop(columns=['model_year'], inplace=True)

# Separate features and target
X = data.drop(columns=['price'])
y = data['price']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.select_dtypes(include=['object']).columns if X[col].nunique() <= 3]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing for numerical and categorical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

# Create a pipeline that combines preprocessing and modeling
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print metrics
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")


Mean Squared Error: 5664227915.19
Root Mean Squared Error: 75261.07
R-squared: -0.02


In [16]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("train1.csv")

# Drop unnecessary columns
data.drop(columns=['id', 'brand', 'model', 'engine'], inplace=True)

# Drop rows where 'price' is NaN
data.dropna(subset=['price'], inplace=True)

# Handle skewness in the target variable
data['price'] = np.log1p(data['price'])  # Log transformation to handle skewness

# Add a new feature 'vehicle_age'
data['vehicle_age'] = 2024 - data['model_year']
data.drop(columns=['model_year'], inplace=True)

# Remove outliers using IQR method
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound) & (data['price'] <= upper_bound)]

# Separate features and target
X = data.drop(columns=['price'])
y = data['price']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.select_dtypes(include=['object']).columns if X[col].nunique() <= 3]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing for numerical and categorical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to test
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
}

# Evaluate each model
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Select the best model based on R2
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model_results = results[best_model_name]

# Display results
print("Model Evaluation Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R2: {metrics['R2']:.2f}")

print(f"\nBest Model: {best_model_name}")
print(f"  Best R2 Score: {best_model_results['R2']:.2f}")

# Final best model pipeline
best_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', models[best_model_name])
])
best_model_pipeline.fit(X_train, y_train)

# Make predictions and inverse transform target if using log
y_pred_final = best_model_pipeline.predict(X_test)
y_test_actual = np.expm1(y_test)  # Convert log-transformed target back
y_pred_actual = np.expm1(y_pred_final)

# Final metrics on actual scale
final_mse = mean_squared_error(y_test_actual, y_pred_actual)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test_actual, y_pred_actual)

print(f"\nFinal Metrics (on original scale):")
print(f"  MSE: {final_mse:.2f}")
print(f"  RMSE: {final_rmse:.2f}")
print(f"  R2: {final_r2:.2f}")


Model Evaluation Results:

Random Forest:
  MSE: 0.26
  RMSE: 0.51
  R2: 0.58

Gradient Boosting:
  MSE: 0.23
  RMSE: 0.48
  R2: 0.62

XGBoost:
  MSE: 0.23
  RMSE: 0.48
  R2: 0.62

Best Model: XGBoost
  Best R2 Score: 0.62

Final Metrics (on original scale):
  MSE: 683185381.20
  RMSE: 26137.82
  R2: 0.37
