# 🚗 Car Data Analysis and Preprocessing
This notebook contains complete solutions for analyzing and preprocessing a car dataset using pandas, matplotlib, seaborn, and scikit-learn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the car dataset from a CSV file; the relative path is resolved against
# the current working directory.
df = pd.read_csv("cars.csv")  # Replace with your actual file
# Preview the first rows (DataFrame.head() returns five rows by default).
df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the frame before duplicate rows are removed.
df.shape

In [None]:
# Remove rows that are exact duplicates across all columns; with the default
# settings the first occurrence of each duplicate group is kept.
df = df.drop_duplicates()

In [None]:
# (rows, columns) again, to compare against the shape before de-duplication.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments only the numeric columns are summarised.
df.describe()

In [None]:
# Tally the missing entries in every column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Drop the column that has the most missing values, but only when something
# is actually missing. idxmax() on an all-zero count still returns the first
# column label, so without the guard a fully populated column would be
# discarded from a dataset that has no gaps at all.
missing_counts = df.isnull().sum()
most_missing = missing_counts.idxmax()
if missing_counts[most_missing] > 0:
    # Rebind instead of mutating in place, like the other cleaning steps.
    df = df.drop(columns=[most_missing])

In [None]:
# Collect the labels of the object-dtype columns, then discard every row that
# has a missing value in at least one of them.
object_frame = df.select_dtypes(include=['object'])
categorical = object_frame.columns
df = df.dropna(axis=0, subset=categorical)

In [None]:
# Labels of the numeric columns.
numerical = df.select_dtypes(include=['number']).columns
# Boolean row mask: True where at least one numeric column is missing.
rows_with_gaps = df[numerical].isnull().any(axis=1)
# Show only the affected rows; nothing is removed from `df` here.
df[rows_with_gaps]

In [None]:
# Rank the cars by price, highest first.
df_sorted = df.sort_values(by='price', ascending=False)
# sort_values() places NaN prices at the end by default, so taking tail(1) of
# the raw ranking could report a car with an unknown price as the cheapest.
# The extremes are therefore read from the rows whose price is known;
# `df_sorted` itself still holds the complete ranking.
priced = df_sorted.dropna(subset=['price'])
print("Most expensive:\n", priced.head(1))
print("Least expensive:\n", priced.tail(1))

In [None]:
def min_max(column):
    """Return the smallest and largest value of `column` as a (min, max) tuple.

    `column` may be any object exposing `.min()` and `.max()` methods, such
    as a pandas Series.
    """
    lowest = column.min()
    highest = column.max()
    return lowest, highest

In [None]:
# Report the (min, max) range of selected columns, one line per column.
range_reports = (
    ("Horsepower:", 'horsepower'),
    ("Length:", 'length'),
    ("Fuel Efficiency:", 'fuelefficiency'),
)
for heading, column_name in range_reports:
    print(heading, min_max(df[column_name]))

In [None]:
# Draw one histogram per selected column on a 10x8-inch figure.
hist_columns = ['price', 'sales', 'horsepower', 'fuelefficiency']
selected = df[hist_columns]
selected.hist(figsize=(10, 8))
plt.tight_layout()  # keep neighbouring subplots from overlapping
plt.show()

In [None]:
# Kernel density estimate of the `length` column, drawn as a filled curve.
# `fill=True` replaces the `shade=True` keyword, which seaborn deprecated in
# favour of `fill` (it emits a FutureWarning and is slated to become an error).
sns.kdeplot(df['length'], fill=True)
plt.title("Density of Length")
plt.show()

In [None]:
# Number of rows per manufacturer.
df.groupby('manufacturer').size()

In [None]:
# Preview the first rows of the numeric columns only.
df.select_dtypes(include=['number']).head()

In [None]:
# Correlation coefficient between price and sales (Series.corr uses the
# Pearson method by default).
df['price'].corr(df['sales'])

In [None]:
# Scatter plot of sales (y-axis) against price (x-axis).
sns.scatterplot(data=df, x='price', y='sales')
plt.show()

In [None]:
# Grid of pairwise relationships between the selected columns.
pair_columns = ['price', 'sales', 'horsepower', 'fuelefficiency']
sns.pairplot(df[pair_columns])
plt.show()

In [None]:
# Distribution of sales for each manufacturer, one box per manufacturer.
sns.boxplot(data=df, x='manufacturer', y='sales')
plt.xticks(rotation=90)  # turn the x tick labels vertical
plt.show()

In [None]:
# One box plot per metric, each showing its distribution per manufacturer.
metrics = ('price', 'horsepower', 'fuelefficiency')
for metric in metrics:
    sns.boxplot(data=df, x='manufacturer', y=metric)
    plt.title(f'{metric} vs Manufacturer')
    plt.xticks(rotation=90)  # turn the x tick labels vertical
    plt.show()

In [None]:
# Split the frame into the `sales` column (y) and all remaining columns (X).
y = df['sales']
X = df.drop(columns='sales')

In [None]:
# Integer-encode the object-dtype feature columns.
# `vehicletype` is skipped on purpose: the following step one-hot encodes it
# with pd.get_dummies, and label-encoding it first would only replace its
# category names with opaque integers in the resulting dummy-column names
# (the 0/1 values themselves come out the same either way).
le = LabelEncoder()
for col in X.select_dtypes(include='object').columns:
    if col == 'vehicletype':
        continue
    # fit_transform refits the encoder, so each column gets its own mapping.
    X[col] = le.fit_transform(X[col])

In [None]:
# One-hot encode `vehicletype`; drop_first=True omits the indicator column of
# the first category so the remaining indicators are not redundant.
X = pd.get_dummies(X, columns=['vehicletype'], drop_first=True)

In [None]:
# Hold out 30% of the rows as a test set; random_state=42 fixes the shuffle
# so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Standardise the features. The scaler learns its mean and standard deviation
# from the training split only, and the same statistics are then applied to
# both splits so no information from the test set is used for fitting.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)