 # *About Dataset*
A dataset of vehicle sales transactions, including make, model, year, condition, odometer, selling price, and MMR values. Useful for market analysis, price prediction, and business insights. Available in tabular format (e.g., CSV) and updated periodically.

# *The problem statement*

The automotive industry generates vast amounts of data about vehicle sales, pricing,
and market trends. However, analyzing and deriving insights from this data can be
challenging due to the complexity and volume of information. Additionally, predicting
vehicle prices accurately requires considering multiple factors like make, model, year,
condition, and mileage.

# *Key Columns:*

* year: Year of the car.


* make, model, trim, body: Basic car details.


* transmission: Type of transmission.


* condition: Condition score of the car.


* odometer: Mileage of the car.


* color, interior: Exterior and interior colors.


* mmr: Manheim Market Report price.


* sellingprice: Price at which the car was sold (Target variable).


* saledate: Date of sale.

# ***1. Import Required Libraries***

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ***2. Load and Inspect Data***

In [None]:
# Load the dataset
data = pd.read_csv("/content/car_prices.csv")

# Only the last expression of a notebook cell is rendered (and nothing is
# rendered when the code runs as a plain script), so every inspection result
# is printed explicitly instead of being left as a bare expression.

# Display first 5 rows
print(data.head())

# DataFrame information (info() writes its report itself)
data.info()

# Descriptive statistics for categorical variables
print(data.describe(include=["object", "category"]))

# Check unique values in each column
print(data.nunique())

# Check for duplicate VINs: occurrences per VIN, then the number of rows
# whose VIN has already appeared in an earlier row.
print(data["vin"].value_counts())
print("Duplicate VIN rows:", data["vin"].duplicated().sum())


In [None]:
# Render the whole DataFrame as one string and print it.
# NOTE(review): to_string() formats every row; on a large dataset this can be
# slow and flood the output — consider data.head(n).to_string() if only a
# preview is needed.
print(data.to_string())

In [None]:
# List the column labels of the loaded DataFrame.
data.columns

In [None]:
# Count how many rows carry each distinct value of the "make" column.
data["make"].value_counts()

In [None]:
# Show the dtype pandas inferred for every column.
data.dtypes

# ***3. Data Cleaning***

In [None]:

# Filling missing values with appropriate strategies.
#
# Each filled column is assigned back to the frame. Calling
# `data[col].fillna(..., inplace=True)` is chained assignment through an
# inplace method: it operates on an intermediate Series, is deprecated in
# pandas 2.x and leaves `data` untouched once Copy-on-Write is active.
mode_filled_cols = [
    "make",
    "model",
    "trim",
    "body",
    "transmission",
    "color",
    "interior",
    "saledate",
]
median_filled_cols = ["condition", "odometer", "mmr"]

# Categorical / text columns: use the most frequent value.
for col in mode_filled_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Numeric feature columns: use the median.
for col in median_filled_cols:
    data[col] = data[col].fillna(data[col].median())

# Target column: mean imputation, as before.
# NOTE(review): imputing the target fabricates labels; dropping rows with a
# missing selling price may be preferable — confirm with the analysis owner.
data["sellingprice"] = data["sellingprice"].fillna(data["sellingprice"].mean())

# Dropping unnecessary columns
data.drop(["vin", "seller"], axis=1, inplace=True)

# Convert 'saledate' to datetime format; values that cannot be parsed become NaT
data["saledate"] = pd.to_datetime(data["saledate"], errors="coerce")


# ***4. Exploratory Data Analysis (EDA)***

In [None]:
# Selling-price distribution: 50-bin histogram with a KDE curve on top.
plt.figure(figsize=(8, 6))
price_ax = sns.histplot(data["sellingprice"], bins=50, kde=True, color="skyblue")
price_ax.set_title("Distribution of Selling Price")
price_ax.set_xlabel("Selling Price ($)")
price_ax.set_ylabel("Frequency")
plt.show()

# Number of rows per transmission type.
plt.figure(figsize=(8, 6))
transmission_ax = sns.countplot(x="transmission", data=data, palette="pastel")
transmission_ax.set_title("Count of Transmission Types")
transmission_ax.set_xlabel("Transmission")
transmission_ax.set_ylabel("Count")
plt.show()

# One histogram per column that DataFrame.hist can plot, on a shared figure.
data.hist(figsize=(20, 14), edgecolor="black")
plt.suptitle("Histogram of Car Dataset Features", fontsize=20)
plt.show()


# ***5. Encode Categorical Variables***

In [None]:
# Encoding categorical variables using LabelEncoder.
# "mmr" is intentionally not listed: it is a numeric price, and label-encoding
# it would replace every price by its rank among the unique values, discarding
# the actual magnitudes.
categorical_cols = [
    "make",
    "trim",
    "transmission",
    "color",
    "model",
    "body",
    "interior",
    "state",
]

# One fitted encoder per column, so integer codes can be mapped back to the
# original labels with label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) hands the encoder uniformly typed input; a column that still
    # contains NaN next to strings (e.g. one that was never imputed) would
    # otherwise make LabelEncoder raise a TypeError.
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Convert 'odometer' to numeric (unparseable entries become missing), then to
# the nullable integer dtype. Rounding first keeps the cast from failing on a
# fractional value such as a ".5" median used to fill gaps.
odometer_numeric = pd.to_numeric(data["odometer"], errors="coerce")
data["odometer"] = odometer_numeric.round().astype("Int64")

# Check data types
data.dtypes


In [None]:
# Show the dtype of every column again.
data.dtypes

In [None]:
# Display the current DataFrame (rendered as the cell output in a notebook).
data

# ***6. Outlier Detection and Removal***

In [None]:

# Define a function to remove outliers using IQR
def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.
    Rows whose value is missing fail the comparison and are dropped as well.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the numeric column to test.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5, the conventional box-plot whisker length.

    Returns:
        A new DataFrame (an independent copy, original index labels kept)
        containing only the rows inside the fences.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    inside = df[column].between(lower_fence, upper_fence)
    # Copy so that later column assignments on the result never target a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    return df[inside].copy()

# Columns filtered with the IQR rule. Only genuinely numeric columns are
# listed. The label-encoded categoricals (body, make, transmission, color,
# interior) hold arbitrary integer codes, so quartiles of those codes carry no
# meaning — and whenever one category dominates, the IQR collapses to 0 and
# every row of every other category would be thrown away.
outlier_cols = ["year", "odometer", "mmr"]

# Apply the filter column by column; each pass works on the rows that
# survived the previous one.
for col in outlier_cols:
    data = remove_outliers(data, col)

# Reset index after removing outliers
data.reset_index(drop=True, inplace=True)

# Check updated data
data.info()


# ***7. Correlation Matrix***

In [None]:
# Count the missing values remaining in each column.
data.isnull().sum()

In [None]:
# Correlation matrix over the numeric columns
correlation_matrix = data.corr(numeric_only=True)

# Correlation heatmap visualization
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap of Features")
plt.show()

# Drop highly correlated and less relevant features
# (a list is the documented list-like form for `columns`; order is irrelevant).
data.drop(columns=["interior", "color", "transmission", "state", "trim"], inplace=True)

# Correlation of every numeric column with the target, highest first.
target_corr = correlation_matrix["sellingprice"].sort_values(ascending=False)

# Top 5 features by strength of correlation with the selling price.
# The target is removed by label rather than by assuming it sorts first, and
# the ranking uses the absolute value so that strongly negative correlations
# are not pushed to the bottom. The plotted bars keep their sign.
# Note: correlation_matrix was computed before the drop above, so columns that
# were just dropped can still appear here.
top_corr_features = (
    target_corr.drop("sellingprice")
    .sort_values(key=lambda s: s.abs(), ascending=False)
    .head(5)
)

plt.figure(figsize=(8, 5))
top_corr_features.plot(kind="bar", color="skyblue")
plt.title("Top 5 Features Correlated with Selling Price")
plt.ylabel("Correlation Coefficient")
plt.xlabel("Features")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Display the current DataFrame (rendered as the cell output in a notebook).
data

In [None]:
# Removing features based on correlation (kept for reference only: the same drop already runs in the correlation cell above)
#data.drop(columns={'interior','color','transmission','state','trim'},inplace=True)

# ***8. Log Transformation (Optional for Skewed Data)***

In [None]:
# Replace the target with log(1 + price) to reduce its skewness.
log_price = np.log1p(data["sellingprice"])
data["sellingprice"] = log_price

# Histogram (with KDE curve) of the transformed target.
plt.figure(figsize=(8, 6))
log_ax = sns.histplot(data["sellingprice"], bins=50, kde=True, color="lightcoral")
log_ax.set_title("Log-Transformed Selling Price Distribution")
log_ax.set_xlabel("Log of Selling Price")
log_ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Show the dtype of every remaining column.
data.dtypes

# ***9. StandardScaler for Feature Scaling***

In [None]:
# Feature Scaling
# Only the model inputs are standardised. The target must stay on the log1p
# scale: the evaluation step applies np.expm1 directly to the predictions and
# to y_test, which only yields prices if "sellingprice" has not been
# z-scored on top of the log transform.
feature_cols = ["year", "make", "model", "body", "condition", "odometer", "mmr"]

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaled features; fit on the training
# rows only if a strictly unbiased estimate is required.
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Define features and target variable
X = data.drop(["saledate", "sellingprice"], axis=1)
y = data["sellingprice"]

# Split data into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# ***10. Model Training and Evaluation***

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred_log = rf_model.predict(X_test)

# Apply expm1, the inverse of log1p, to predictions and ground truth alike.
# NOTE(review): this assumes both are on the log1p scale at this point.
y_pred_actual = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

# Error metrics on the back-transformed values.
mae = mean_absolute_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
r2 = r2_score(y_test_actual, y_pred_actual)

# Report each metric with two decimals.
print("Model Evaluation Results:")
for metric_label, metric_value in (("MAE", mae), ("RMSE", rmse), ("R2 Score", r2)):
    print(f"{metric_label}: {metric_value:.2f}")



In [None]:
# Write the current state of the DataFrame to "preprocessed_data.csv",
# omitting the index column.
data.to_csv("preprocessed_data.csv",index=False)