In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kagglex-cohort4-dataset/sample_submission.csv
/kaggle/input/kagglex-cohort4-dataset/train.csv
/kaggle/input/kagglex-cohort4-dataset/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import mean_squared_error

In [3]:
path = '/kaggle/input/kagglex-cohort4-dataset'
train_path =  path + '/train.csv'
test_path = path + '/test.csv'


train_df = pd.read_csv(train_path)
print(train_df.shape)
train_df.head()

(54273, 13)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [4]:
test_df = pd.read_csv(test_path)

In [5]:
X_train = train_df.drop('price', axis = 1)
y_train = train_df['price']

In [6]:
# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Define column transformer for preprocessing
# Define column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [7]:

# Define PCA to reduce dimensions to, say, 50 components
pca = PCA(n_components=200)
svd = TruncatedSVD(n_components=200)

# Define the models
ridge = Ridge()
lasso = Lasso()
xgb = XGBRegressor(objective='reg:squarederror')

# Create a pipeline for each model
ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('svd', svd),
                                 ('model', ridge)])
lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('svd', svd),
                                 ('model', lasso)])
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('svd', svd),
                               ('model', xgb)])

# Define parameter grids
ridge_param_grid = {
    'model__alpha': [0.1, 1, 10, 100]
}

lasso_param_grid = {
    'model__alpha': [0.1, 1, 10, 100]
}

xgb_param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

In [8]:
# Perform GridSearchCV for each model
# ridge_grid_search = GridSearchCV(ridge_pipeline, ridge_param_grid, cv=5, scoring='neg_mean_squared_error')
# lasso_grid_search = GridSearchCV(lasso_pipeline, lasso_param_grid, cv=5, scoring='neg_mean_squared_error')
xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=5, scoring='neg_mean_squared_error')


In [9]:
# Fit the models
# ridge_grid_search.fit(X_train, y_train)
# lasso_grid_search.fit(X_train, y_train)
xgb_grid_search.fit(X_train, y_train)

In [10]:
# Get the best estimators
# best_ridge_model = ridge_grid_search.best_estimator_
# best_lasso_model = lasso_grid_search.best_estimator_
best_xgb_model = xgb_grid_search.best_estimator_

In [11]:
# Get the best scores
# ridge_best_score = -ridge_grid_search.best_score_
# lasso_best_score = -lasso_grid_search.best_score_
# xgb_best_score = -xgb_grid_search.best_score_

# # Find the best model based on the best score
# best_model = None
# if min(ridge_best_score, lasso_best_score, xgb_best_score) == ridge_best_score:
#     best_model = best_ridge_model
# elif min(ridge_best_score, lasso_best_score, xgb_best_score) == lasso_best_score:
#     best_model = best_lasso_model
# else:
#     best_model = best_xgb_model

best_model = best_xgb_model

In [12]:
test_df = pd.read_csv(test_path)
print(test_df.shape)
test_df.head()


(36183, 12)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes


In [13]:
X_test = test_df

In [14]:
test_predictions = best_model.predict(X_test)

In [15]:
test_submission = pd.read_csv('/kaggle/input/kagglex-cohort4-dataset/sample_submission.csv')

In [16]:
test_submission.head()

Unnamed: 0,id,price
0,54273,39218.443
1,54274,39218.443
2,54275,39218.443
3,54276,39218.443
4,54277,39218.443


In [17]:
test_submission['price'] = test_predictions

In [18]:
test_submission.head()

Unnamed: 0,id,price
0,54273,24795.486328
1,54274,20989.207031
2,54275,39455.605469
3,54276,54428.933594
4,54277,36254.542969


In [19]:
test_submission.to_csv('submission.csv', index=False)