In [1]:
import pandas as pd

In [2]:
# Read the training and test CSVs from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Peek at the first rows of the training frame as a quick sanity check.
print(train_df.head())

   id    brand          model  model_year  milage fuel_type  \
0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
1   1      BMW          335 i        2007   80000  Gasoline   
2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
4   4  Pontiac  Firebird Base        2001  111000  Gasoline   

                                              engine  \
0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col       accident clean_title  \
0                    10-Speed A/T    Blue    Gray  None reported         Yes   
1                     6-Speed M/T   Black   Black  None reported         Yes   
2                     6-Speed A/T  Purple   

In [3]:
# Impute missing values with statistics computed from the TRAINING frame only.
# Both frames are therefore filled with the same values, the preprocessing
# never depends on the contents of the test set, and a test column with no
# observed values still receives a fill value.

# Numerical columns: every int64/float64 column except the target, filled with
# the training median.
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.drop('price')
numerical_fill = train_df[numerical_cols].median()
train_df[numerical_cols] = train_df[numerical_cols].fillna(numerical_fill)
test_df[numerical_cols] = test_df[numerical_cols].fillna(numerical_fill)

# Categorical columns: every object column, filled with the most frequent
# training value (first row of .mode() when there are ties).
categorical_cols = train_df.select_dtypes(include=['object']).columns
categorical_fill = train_df[categorical_cols].mode().iloc[0]
train_df[categorical_cols] = train_df[categorical_cols].fillna(categorical_fill)
test_df[categorical_cols] = test_df[categorical_cols].fillna(categorical_fill)


In [4]:
# Replace model_year with car_age (2024 minus the model year) in both frames.
# Each frame is mutated in place: the new column is appended first, then the
# source column is removed.
for frame in (train_df, test_df):
    frame['car_age'] = 2024 - frame['model_year']
    frame.drop(columns=['model_year'], inplace=True)


In [5]:
# Convert categorical variables to dummy/indicator variables.
# NOTE(review): every object column is expanded, including free-text ones such
# as `engine`; this can produce a very wide frame -- confirm memory is acceptable.
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

# Give the test set exactly the training FEATURE columns, in the same order.
# The target is excluded on purpose: aligning against the full training frame
# would add a placeholder `price` column to test_df, and a model fitted
# without `price` would then reject the frame at predict time.
#   - columns present only in train are added to test and filled with 0
#   - columns present only in test are dropped
feature_cols = train_df.columns.drop('price')
test_df = test_df.reindex(columns=feature_cols).fillna(0)


In [6]:
from sklearn.model_selection import train_test_split

# Separate the target (`price`) from the remaining columns, which serve as features.
y = train_df['price']
X = train_df.drop(columns='price')

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a random forest with library defaults (seed fixed for reproducibility)
# and fit it on the training split.
forest_params = {'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Score the fitted model on the held-out validation split.
y_pred = model.predict(X_val)

# RMSE is the square root of the mean squared error between truth and prediction.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')


In [None]:
# Predict on the test data using exactly the columns the model was trained on
# (same names, same order). After alignment with the training frame, test_df
# may carry extra columns (e.g. a placeholder `price`) that the model would
# otherwise reject.
test_features = test_df[X.columns]
test_predictions = model.predict(test_features)

# Build the submission from the real `id` column rather than the positional
# row index; the two are not guaranteed to coincide.
# NOTE(review): headers mirror the dataset's own column names (`id`, `price`);
# confirm against the competition's sample submission file.
submission = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'price': test_predictions
})

# Save the submission to a CSV file (no index column).
submission.to_csv('submission.csv', index=False)
