<a href="https://colab.research.google.com/github/nagayahita/ml-data-science-portfolio/blob/main/car_predict_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Car Market Analysis and Price Prediction**

In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from typing import Tuple, Dict, List
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [9]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print("=== Training Data Info ===")
print(f"Shape: {train_data.shape}")
print("\nColumns:")
for col in train_data.columns:
    print(f"- {col}: {train_data[col].dtype}")

print("\n=== Missing Values ===")
print(train_data.isnull().sum())

=== Training Data Info ===
Shape: (19237, 18)

Columns:
- ID: int64
- Price: int64
- Levy: object
- Manufacturer: object
- Model: object
- Prod. year: int64
- Category: object
- Leather interior: object
- Fuel type: object
- Engine volume: object
- Mileage: object
- Cylinders: float64
- Gear box type: object
- Drive wheels: object
- Doors: object
- Wheel: object
- Color: object
- Airbags: int64

=== Missing Values ===
ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64


In [10]:
class DataPreprocessor:
    """
    Class for cleaning and preprocessing car data with robust error handling.
    """
    def __init__(self):
        self.errors = []
        self.warnings = []

    def clean_numeric_field(self, value: str, field_type: str) -> float:
        """Clean numeric fields with specific handling for different types."""
        try:
            if pd.isna(value):
                return 0.0

            if field_type == 'mileage':
                return float(str(value).replace('km', '').strip())

            elif field_type == 'engine':
                # Handle cases like "2.0 Turbo"
                numeric_part = ''.join(char for char in str(value) if char.isdigit() or char == '.')
                return float(numeric_part) if numeric_part else 0.0

            elif field_type == 'price':
                return float(value)

        except Exception as e:
            self.errors.append(f"Error cleaning {field_type} value '{value}': {str(e)}")
            return 0.0

    def clean_categorical_field(self, value: str, field_type: str) -> str:
        """Clean and standardize categorical fields."""
        try:
            if pd.isna(value):
                return 'Unknown'

            value = str(value).strip().upper()

            if field_type == 'manufacturer':
                # Standardize manufacturer names
                manufacturer_map = {
                    'VW': 'VOLKSWAGEN',
                    'MERCEDES': 'MERCEDES-BENZ'
                }
                return manufacturer_map.get(value, value)

            elif field_type == 'fuel_type':
                # Standardize fuel types
                fuel_map = {
                    'PETROL': 'GASOLINE',
                    'DIESEL': 'DIESEL',
                    'HYBRID': 'HYBRID',
                    'ELECTRIC': 'ELECTRIC'
                }
                return fuel_map.get(value, value)

            return value

        except Exception as e:
            self.errors.append(f"Error cleaning {field_type} value '{value}': {str(e)}")
            return 'Unknown'

    def process_dataset(self, df: pd.DataFrame, is_training: bool = True) -> pd.DataFrame:
        """Process entire dataset with comprehensive cleaning and feature engineering."""
        try:
            processed_df = df.copy()

            # Clean numeric fields
            processed_df['Mileage_Clean'] = processed_df['Mileage'].apply(
                lambda x: self.clean_numeric_field(x, 'mileage')
            )
            processed_df['Engine_Volume_Clean'] = processed_df['Engine volume'].apply(
                lambda x: self.clean_numeric_field(x, 'engine')
            )

            # Clean categorical fields
            processed_df['Manufacturer_Clean'] = processed_df['Manufacturer'].apply(
                lambda x: self.clean_categorical_field(x, 'manufacturer')
            )
            processed_df['Fuel_Type_Clean'] = processed_df['Fuel type'].apply(
                lambda x: self.clean_categorical_field(x, 'fuel_type')
            )

            # Feature engineering
            current_year = datetime.now().year
            processed_df['Vehicle_Age'] = current_year - processed_df['Prod. year']
            processed_df['Is_Luxury'] = processed_df['Leather interior'].map({'Yes': 1, 'No': 0})

            if is_training:
                # Additional features for training data
                processed_df['Price_Per_Year'] = processed_df['Price'] / processed_df['Vehicle_Age']
                processed_df['Price_Per_Mile'] = processed_df['Price'] / processed_df['Mileage_Clean']

            return processed_df

        except Exception as e:
            logger.error(f"Error processing dataset: {str(e)}")
            raise

In [40]:
class CarMarketAnalyzer:
    """
    Advanced car market analysis with interactive visualizations and insights.
    """
    def __init__(self, data: pd.DataFrame):
        self.data = data
        self.market_insights = {}
        self._calculate_market_metrics()

    def _calculate_market_metrics(self):
        """Calculate key market metrics and store them."""
        try:
            # Price metrics
            self.market_insights['price_metrics'] = {
                'avg_price': self.data['Price'].mean(),
                'median_price': self.data['Price'].median(),
                'min_price': self.data['Price'].min(),
                'max_price': self.data['Price'].max(),
                'price_std': self.data['Price'].std()
            }

            # Manufacturer metrics
            manufacturer_stats = self.data.groupby('Manufacturer_Clean').agg({
                'Price': ['mean', 'count', 'std'],
                'Mileage_Clean': 'mean',
                'Vehicle_Age': 'mean'
            }).round(2)

            self.market_insights['manufacturer_stats'] = manufacturer_stats

        except Exception as e:
            logger.error(f"Error calculating market metrics: {str(e)}")
            raise

    def create_market_overview(self) -> Dict:
        """Generate comprehensive market overview."""
        try:
            overview = {
                "Market Size": {
                    "Total Vehicles": len(self.data),
                    "Total Manufacturers": self.data['Manufacturer_Clean'].nunique(),
                    "Total Categories": self.data['Category'].nunique()
                },
                "Price Overview": {
                    "Average Price": f"${self.market_insights['price_metrics']['avg_price']:,.2f}",
                    "Median Price": f"${self.market_insights['price_metrics']['median_price']:,.2f}",
                    "Price Range": f"${self.market_insights['price_metrics']['min_price']:,.2f} - "
                                 f"${self.market_insights['price_metrics']['max_price']:,.2f}"
                },
                "Top Manufacturers": self._get_top_manufacturers(),
                "Popular Categories": self._get_category_insights()
            }
            return overview

        except Exception as e:
            logger.error(f"Error creating market overview: {str(e)}")
            raise

    def _get_top_manufacturers(self, top_n: int = 5) -> Dict:
        """Get insights about top manufacturers."""
        try:
            top_by_price = self.data.groupby('Manufacturer_Clean')['Price'].mean().nlargest(top_n)
            top_by_volume = self.data.groupby('Manufacturer_Clean').size().nlargest(top_n)

            return {
                "Most Expensive": {name: f"${price:,.2f}" for name, price in top_by_price.items()},
                "Most Popular": {name: count for name, count in top_by_volume.items()}
            }

        except Exception as e:
            logger.error(f"Error getting top manufacturers: {str(e)}")
            raise

    def _get_category_insights(self) -> Dict:
        """Get insights about vehicle categories."""
        try:
            category_stats = self.data.groupby('Category').agg({
                'Price': ['mean', 'count']
            })

            return {
                "By Average Price": {
                    cat: price for cat, price in
                    category_stats['Price']['mean'].nlargest(5).round(2).items()
                },
                "By Popularity": {
                    cat: int(count) for cat, count in
                    category_stats['Price']['count'].nlargest(5).items()
                }
            }

        except Exception as e:
            logger.error(f"Error getting category insights: {str(e)}")
            raise

    def create_interactive_visualizations(self):
        """Create interactive visualizations using plotly."""
        try:
            # Price Distribution
            fig1 = px.histogram(self.data, x='Price',
                              title='Price Distribution',
                              template='plotly_white')
            fig1.show()

            # Price vs Year
            fig2 = px.scatter(self.data, x='Prod. year', y='Price',
                             color='Manufacturer_Clean',
                             title='Price vs Production Year',
                             template='plotly_white')
            fig2.show()

            # Price by Category
            fig3 = px.box(self.data, x='Category', y='Price',
                          title='Price Distribution by Category',
                          template='plotly_white')
            fig3.show()

        except Exception as e:
            logger.error(f"Error creating visualizations: {str(e)}")
            raise

    def generate_price_recommendations(self, category: str = None, budget: float = None) -> pd.DataFrame: # Corrected indentation
        """Generate price recommendations based on category and budget."""
        try:
            recommendations = self.data.copy()

            if category:
                recommendations = recommendations[recommendations['Category'] == category]
            if budget:
                recommendations = recommendations[recommendations['Price'] <= budget]

            if len(recommendations) == 0:
                logger.warning(f"No vehicles found matching criteria: category={category}, budget=${budget:,}")
                return pd.DataFrame()

            recommendations['Value_Score'] = (
                (1 - recommendations['Price'] / recommendations['Price'].max()) * 0.3 +  # Lower price is better
                (1 - recommendations['Mileage_Clean'] / recommendations['Mileage_Clean'].max()) * 0.3 +  # Lower mileage is better
                (1 - recommendations['Vehicle_Age'] / recommendations['Vehicle_Age'].max()) * 0.2 +  # Newer is better
                (recommendations['Is_Luxury'].fillna(0) * 0.2)

            )

            top_picks = recommendations.nlargest(5, 'Value_Score')[[
                'Manufacturer_Clean', 'Model', 'Price', 'Prod. year',
                'Mileage_Clean', 'Category', 'Fuel_Type_Clean', 'Value_Score'
            ]]

            return top_picks

        except Exception as e:
            logger.error(f"Error generating recommendations: {str(e)}")
            raise

In [41]:
preprocessor = DataPreprocessor()

# Process data
processed_train = preprocessor.process_dataset(train_data, is_training=True)
processed_test = preprocessor.process_dataset(test_data, is_training=False)

# Show sample of processed data
print("Sample of processed training data:")
display(processed_train.head())

Sample of processed training data:


Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,...,Color,Airbags,Mileage_Clean,Engine_Volume_Clean,Manufacturer_Clean,Fuel_Type_Clean,Vehicle_Age,Is_Luxury,Price_Per_Year,Price_Per_Mile
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,...,Silver,12,186005.0,3.5,LEXUS,HYBRID,15,1,888.533333,0.071654
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,...,Black,8,192000.0,3.0,CHEVROLET,GASOLINE,14,0,1187.214286,0.086568
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,...,Black,2,200000.0,1.3,HONDA,GASOLINE,19,0,445.631579,0.042335
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,...,White,0,168966.0,2.5,FORD,HYBRID,14,1,257.642857,0.021347
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,...,Silver,4,91901.0,1.3,HONDA,GASOLINE,11,1,1066.0,0.127594


In [42]:
analyzer = CarMarketAnalyzer(processed_train)

# Get market overview
overview = analyzer.create_market_overview()

# Display results with emojis and formatting
print("=== 🚗 MARKET ANALYSIS REPORT ===")
print("\n📊 MARKET SIZE:")
for key, value in overview["Market Size"].items():
    print(f"• {key}: {value:,}")

print("\n💰 PRICE OVERVIEW:")
for key, value in overview["Price Overview"].items():
    print(f"• {key}: {value}")

print("\n🏆 TOP MANUFACTURERS BY PRICE:")
for name, price in overview["Top Manufacturers"]["Most Expensive"].items():
    print(f"• {name}: {price}")

print("\n📈 POPULAR MANUFACTURERS BY VOLUME:")
for name, count in overview["Top Manufacturers"]["Most Popular"].items():
    print(f"• {name}: {count:,} units")

print("\n🚙 TOP CATEGORIES BY PRICE:")
for category, price in overview["Popular Categories"]["By Average Price"].items():
    print(f"• {category}: ${price:,.2f}")

=== 🚗 MARKET ANALYSIS REPORT ===

📊 MARKET SIZE:
• Total Vehicles: 19,237
• Total Manufacturers: 65
• Total Categories: 11

💰 PRICE OVERVIEW:
• Average Price: $18,555.93
• Median Price: $13,172.00
• Price Range: $1.00 - $26,307,500.00

🏆 TOP MANUFACTURERS BY PRICE:
• LAMBORGHINI: $872,946.00
• BENTLEY: $197,574.50
• OPEL: $73,305.62
• FERRARI: $66,955.50
• LAND ROVER: $54,053.49

📈 POPULAR MANUFACTURERS BY VOLUME:
• HYUNDAI: 3,769 units
• TOYOTA: 3,662 units
• MERCEDES-BENZ: 2,076 units
• FORD: 1,111 units
• CHEVROLET: 1,069 units

🚙 TOP CATEGORIES BY PRICE:
• Goods wagon: $122,916.18
• Pickup: $28,805.27
• Universal: $25,253.70
• Cabriolet: $24,583.69
• Jeep: $23,684.29


In [43]:
analyzer.create_interactive_visualizations()

In [45]:
print("=== PRICE RECOMMENDATIONS ===\n")

def display_recommendations(recommendations: pd.DataFrame, scenario: str):
    """Helper function to display recommendations in a formatted way"""
    print(f"\n{scenario}:")
    if recommendations.empty:
        print("No recommendations found matching criteria")
    else:
        for idx, car in recommendations.iterrows():
            print(f"\n{idx+1}. {car['Manufacturer_Clean']} {car['Model']}")
            print(f"   💰 Price: ${car['Price']:,.2f}")
            print(f"   📅 Year: {car['Prod. year']}")
            print(f"   🚗 Category: {car['Category']}")
            print(f"   ⛽ Fuel Type: {car['Fuel_Type_Clean']}")
            print(f"   📊 Value Score: {car['Value_Score']:.2f}")

# Luxury vehicles under 50k
luxury_recs = analyzer.generate_price_recommendations(
    category="Luxury",
    budget=50000
)
display_recommendations(luxury_recs, "Luxury Vehicles Under $50,000")

# Economic vehicles under 15k
economic_recs = analyzer.generate_price_recommendations(
    budget=15000
)
display_recommendations(economic_recs, "Economic Vehicles Under $15,000")

# SUVs under 30k
suv_recs = analyzer.generate_price_recommendations(
    category="SUV",
    budget=30000
)
display_recommendations(suv_recs, "SUVs Under $30,000")



=== PRICE RECOMMENDATIONS ===


Luxury Vehicles Under $50,000:
No recommendations found matching criteria

Economic Vehicles Under $15,000:

7277. MERCEDES-BENZ G 55 AMG
   💰 Price: $3.00
   📅 Year: 2020
   🚗 Category: Jeep
   ⛽ Fuel Type: LPG
   📊 Value Score: 0.99

13689. HYUNDAI Santa FE
   💰 Price: $110.00
   📅 Year: 2019
   🚗 Category: Jeep
   ⛽ Fuel Type: GASOLINE
   📊 Value Score: 0.98

1249. TOYOTA CHR
   💰 Price: $188.00
   📅 Year: 2019
   🚗 Category: Jeep
   ⛽ Fuel Type: GASOLINE
   📊 Value Score: 0.98

891. TOYOTA Camry
   💰 Price: $251.00
   📅 Year: 2019
   🚗 Category: Sedan
   ⛽ Fuel Type: HYBRID
   📊 Value Score: 0.98

915. TOYOTA Camry
   💰 Price: $251.00
   📅 Year: 2019
   🚗 Category: Sedan
   ⛽ Fuel Type: HYBRID
   📊 Value Score: 0.98

SUVs Under $30,000:
No recommendations found matching criteria
