# Used Car Price Analysis
## Analyzing Selling Price of Used Cars using Python

## Step 1: Install Required Libraries
Run the cell below to install the required libraries; uncomment the `pip` line first if they are not already installed.

In [None]:
# !pip install pandas numpy matplotlib seaborn scipy

## Step 2: Import Required Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp

# Global plot defaults: seaborn's "whitegrid" theme and a wide default canvas
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## Step 3: Load the Dataset
**Note:** Make sure the `imports-85.data` file has been downloaded into the same directory as this notebook.

In [None]:
# Load the dataset; header=None because the file has no header row, so the
# columns are numbered positionally until names are assigned in Step 4.
df = pd.read_csv('imports-85.data', header=None)

# The dataset has 26 attributes (one per name assigned in Step 4). Some
# re-exported copies carry an extra leading index column; drop it only when
# it is actually present. Slicing unconditionally would discard the first
# real attribute and leave too few columns for the header assignment.
expected_columns = 26
if df.shape[1] == expected_columns + 1:
    df = df.iloc[:, 1:]

# Display first 5 rows
print("First 5 rows of the dataset:")
df.head()

## Step 4: Assign Column Headers

In [None]:
# Column names, listed in the same left-to-right order as the columns of df
headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

# Replace the positional column labels with the descriptive names
df.columns = headers

print("Dataset with column headers:")
df.head()

## Step 5: Check Dataset Information

In [None]:
# Quick overview of the raw frame: its dimensions, the per-column summary
# produced by info(), and descriptive statistics from describe().
print("Dataset shape:", df.shape)
print("\nDataset info:")
# info() writes its report to stdout itself, so it is not wrapped in print()
df.info()
print("\nBasic statistics:")
# Left as the final bare expression so the notebook displays the result
df.describe()

## Step 6: Check for Missing Values

In [None]:
# Work on a copy so the raw frame `df` is left untouched by later steps
data = df.copy()

# Missing entries are encoded as the literal string '?', so only the
# object-typed columns are scanned for it.
print("Columns with missing values ('?'):")
text_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in text_columns:
    n_missing = (data[name] == '?').sum()
    if n_missing:
        print(f"{name}: {n_missing} missing values")

## Step 7: Convert MPG to L/100km

In [None]:
# Fuel economy: miles per gallon -> litres per 100 km, using
# L/100km = 235 / mpg (235 is the conversion factor).
mpg_renames = {
    'city-mpg': 'city-L/100km',
    'highway-mpg': 'highway-L/100km',
}

# Convert the values in place first, then rename both columns in one call
for mpg_column in mpg_renames:
    data[mpg_column] = 235 / data[mpg_column]
data.rename(columns=mpg_renames, inplace=True)

print("Updated columns:")
print(data.columns.tolist())
print("\nData types after conversion:")
print(data.dtypes)

## Step 8: Clean and Convert Price Column

In [None]:
# Check unique values in price column
print("Unique values in price column:")
print(data.price.unique())

# Price is the quantity being analysed, so rows without one ('?') are removed.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of the previous frame.
data = data[data['price'] != '?'].copy()

# Convert price to integer
data['price'] = data['price'].astype(int)

# Convert the other numeric columns that may contain '?'.
# A '?' is turned into NaN instead of discarding the whole row, so every car
# with a known price stays in the analysis; pandas statistics such as mean()
# and corr() skip NaN. Any other non-numeric text still raises in
# astype(float) rather than being silently hidden.
numeric_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']
for col in numeric_columns:
    data[col] = data[col].mask(data[col] == '?').astype(float)

print("\nData types after cleaning:")
print(data.dtypes)
print(f"\nDataset shape after cleaning: {data.shape}")
print(f"\nDataset shape after cleaning: {data.shape}")

## Step 9: Normalize Features and Create Price Categories

In [None]:
# Scale each body dimension by dividing it by its own column maximum
for dimension in ('length', 'width', 'height'):
    data[dimension] = data[dimension] / data[dimension].max()

# Binning: three equal-width price bands between the lowest and highest price
# (4 evenly spaced edges -> 3 intervals)
lowest_price = data['price'].min()
highest_price = data['price'].max()
bins = np.linspace(lowest_price, highest_price, 4)
group_names = ['Low', 'Medium', 'High']
data['price-binned'] = pd.cut(
    data['price'],
    bins,
    labels=group_names,
    include_lowest=True,  # keep the cheapest car inside the first band
)

# Count the cars per band once and reuse it for the printout and the chart
band_counts = data['price-binned'].value_counts()

print("Price categories distribution:")
print(band_counts)

# Bar chart of the same counts
plt.figure(figsize=(8, 5))
band_counts.plot(kind='bar')
plt.title('Distribution of Price Categories')
plt.xlabel('Price Category')
plt.ylabel('Count')
plt.show()

## Step 10: Convert Categorical Data to Numerical (One-Hot Encoding)

In [None]:
# Example of one-hot encoding for fuel-type: one indicator column per
# distinct fuel type. dtype=int forces numeric 0/1 indicators; without it,
# pandas 2.0 and later return True/False columns, which is not the numerical
# encoding this step is meant to demonstrate.
fuel_dummies = pd.get_dummies(data['fuel-type'], dtype=int)
print("One-hot encoding for fuel-type (first 5 rows):")
print(fuel_dummies.head())

# Display dataset statistics (left as the final bare expression so the
# notebook displays the resulting table)
print("\nDataset statistics after processing:")
data.describe()

## Step 11: Data Visualization

In [None]:
# Box plot of prices, followed by the headline statistics of the same column
price_column = data['price']

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(price_column)
ax.set_title('Box Plot of Car Prices')
ax.set_ylabel('Price ($)')
ax.grid(True)
plt.show()

print(f"Price Statistics:")
print(f"Minimum: ${price_column.min():,.2f}")
print(f"Maximum: ${price_column.max():,.2f}")
print(f"Mean: ${price_column.mean():,.2f}")
print(f"Median: ${price_column.median():,.2f}")

In [None]:
# One price box per drive-wheels category, drawn on a 10x6 figure
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=data, x='drive-wheels', y='price')
ax.set_title('Price Distribution by Drive Wheels Type')
ax.set_xlabel('Drive Wheels')
ax.set_ylabel('Price ($)')
plt.show()

In [None]:
# Scatter plot of engine-size vs price; both columns are reused below for
# the correlation coefficient.
x_values = data['engine-size']
y_values = data['price']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x_values, y_values, alpha=0.6)
ax.set_title('Scatterplot of Engine Size vs Price')
ax.set_xlabel('Engine Size')
ax.set_ylabel('Price ($)')
ax.grid(True)
plt.show()

# Correlation between the two columns (Series.corr default method)
correlation = x_values.corr(y_values)
print(f"Correlation between engine size and price: {correlation:.3f}")

## Step 12: Group Data by Drive-Wheels and Body-Style

In [None]:
# Mean price for every (drive-wheels, body-style) combination present in the
# data. as_index=False keeps the two keys as ordinary columns.
group_keys = ['drive-wheels', 'body-style']
test = data[group_keys + ['price']]
data_grp = test.groupby(group_keys, as_index=False).mean()

print("Average price by drive-wheels and body-style:")
data_grp

## Step 13: Create Pivot Table and Heatmap

In [None]:
# Reshape the grouped means into a grid: one row per drive-wheels value,
# one column per body-style, cells holding the average price.
data_pivot = data_grp.pivot(
    index='drive-wheels',
    columns='body-style',
    values='price',
)

print("Pivot Table - Average Prices:")
print(data_pivot)

# Heatmap of the grid; the diverging colour map is centred on the overall
# mean price of the cleaned data.
overall_mean_price = data['price'].mean()

plt.figure(figsize=(10, 6))
sns.heatmap(
    data_pivot,
    annot=True,
    fmt='.0f',
    cmap='RdBu_r',
    center=overall_mean_price,
)
plt.title('Average Car Prices by Drive Wheels and Body Style')
plt.tight_layout()
plt.show()

## Step 14: Perform ANOVA Test

In [None]:
# Imported explicitly: `import scipy as sp` on its own does not guarantee that
# the `stats` submodule is loaded on every SciPy version.
from scipy import stats

# Prepare data for ANOVA. Grouping by the bare column name (not a one-element
# list) keeps the group keys plain strings, so get_group('honda') works
# without the length-1-tuple deprecation that newer pandas applies to
# list-style grouping.
data_annova = data[['make', 'price']]
grouped_annova = data_annova.groupby('make')

# Compare Honda and Subaru prices
honda_prices = grouped_annova.get_group('honda')['price']
subaru_prices = grouped_annova.get_group('subaru')['price']

# One-way ANOVA across the two price samples
annova_results = stats.f_oneway(honda_prices, subaru_prices)

print("ANOVA Test Results - Honda vs Subaru:")
print(f"F-statistic: {annova_results.statistic:.4f}")
print(f"P-value: {annova_results.pvalue:.4f}")

# 0.05 is the significance threshold used for the conclusion
if annova_results.pvalue < 0.05:
    print("Conclusion: There is a significant difference in prices between Honda and Subaru")
else:
    print("Conclusion: There is no significant difference in prices between Honda and Subaru")

In [None]:
# Engine size vs price as a scatter with seaborn's fitted regression line
plt.figure(figsize=(10, 6))
ax = sns.regplot(
    data=data,
    x='engine-size',
    y='price',
    scatter_kws={'alpha': 0.5},
)
ax.set_title('Engine Size vs Price with Regression Line')
ax.set_xlabel('Engine Size')
ax.set_ylabel('Price ($)')
# Pin the lower y-limit at zero and leave the upper limit automatic
ax.set_ylim(bottom=0)
plt.show()

## Summary Statistics and Insights

In [None]:
# Final summary
print("=" * 50)
print("ANALYSIS SUMMARY")
print("=" * 50)

# Top factors affecting price: correlation of every numeric column with price.
# The price/price self-correlation is removed by label rather than by
# position, and columns are ranked by the magnitude of the correlation so a
# strong negative relationship counts as much as a strong positive one.
print("\nTop Factors Affecting Car Prices:")
numeric_cols = data.select_dtypes(include=[np.number]).columns
price_correlations = data[numeric_cols].corr()['price'].drop('price')
ranking = price_correlations.abs().sort_values(ascending=False).index
correlations = price_correlations.loc[ranking]
for factor, corr in correlations.iloc[:5].items():
    print(f"- {factor}: {corr:.3f} correlation with price")

# Average price per make, computed once and reused for both rankings
mean_price_by_make = data.groupby('make')['price'].mean()

# Most expensive makes
print("\nTop 5 Most Expensive Car Makes (Average):")
expensive_makes = mean_price_by_make.sort_values(ascending=False).head()
for make, price in expensive_makes.items():
    print(f"- {make}: ${price:,.2f}")

# Most affordable makes
print("\nTop 5 Most Affordable Car Makes (Average):")
affordable_makes = mean_price_by_make.sort_values(ascending=True).head()
for make, price in affordable_makes.items():
    print(f"- {make}: ${price:,.2f}")