In [1]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib
import os
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

# Load Data
file_path = '../data/final_data.csv'
data = pd.read_csv(file_path)

# Encode Categorical Variables
categorical_cols = ['State', 'Make', 'Model', 'Exterior color', 'Interior color', 'Drivetrain', 'Fuel type', 'Transmission', 'Engine']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Normalize Numeric Features
scaler = StandardScaler()
data[['Mileage', 'Year', 'Price(USD)']] = scaler.fit_transform(data[['Mileage', 'Year', 'Price(USD)']])

# Save Label Encoders and Scaler
if not os.path.exists('models'):
    os.makedirs('models')
for col, le in label_encoders.items():
    joblib.dump(le, f'models/label_encoder_{col}.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

# Feature Selection for Model Training
features = ['State', 'Make', 'Model', 'Exterior color', 'Interior color', 'Drivetrain', 'Fuel type', 'Transmission', 'Engine', 'Mileage', 'Year', 'Price(USD)']
X = data[features]

# Train Nearest Neighbors Model
model = NearestNeighbors(n_neighbors=5, algorithm='auto')
model.fit(X)
joblib.dump(model, 'models/nearest_neighbors_model.pkl')

# Streamlit User Inputs for Car Details
st.title('Car Recommendation System')
st.sidebar.header('Car Details')

def get_user_input():
    user_data = {}
    for col in ['State', 'Exterior color', 'Interior color', 'Drivetrain', 'Fuel type', 'Transmission', 'Engine']:
        unique_values = list(label_encoders[col].classes_)
        user_value = st.sidebar.selectbox(f"Select {col}", options=["Skip"] + unique_values, index=0)
        if user_value != "Skip":
            user_data[col] = label_encoders[col].transform([user_value])[0]
        else:
            user_data[col] = np.nan

    user_data['Mileage'] = st.sidebar.number_input("Enter Mileage", min_value=0, value=10000, step=1000)
    user_data['Year'] = st.sidebar.number_input("Enter Year", min_value=1920, max_value=2025, value=2020, step=1)
    user_data['Price(USD)'] = st.sidebar.number_input("Enter Price (in USD)", min_value=0, value=20000, step=1000)

    return pd.DataFrame([user_data], columns=user_input_cols)

# Get User Input
df_user_input = get_user_input()

# Drop columns with all missing values
df_user_input = df_user_input.dropna(axis=1, how='all')

# Ensure at least one feature is provided
if df_user_input.empty:
    st.write('No valid inputs provided. Please provide at least one input for recommendation.')
else:
    # Fill missing values with column means or most frequent values if there are still inputs left
    df_user_input = df_user_input.apply(lambda x: x.fillna(data[x.name].mean()) if x.name in ['Mileage', 'Year', 'Price(USD)'] else x.fillna(data[x.name].mode()[0]))

    # Normalize User Input if numeric columns are present
    numeric_cols = ['Mileage', 'Year', 'Price(USD)']
    cols_to_normalize = [col for col in numeric_cols if col in df_user_input.columns]
    if cols_to_normalize:
        df_user_input[cols_to_normalize] = scaler.transform(df_user_input[cols_to_normalize])

    # Load Nearest Neighbors Model
    model = joblib.load('models/nearest_neighbors_model.pkl')

    # Add Make and Model columns with placeholder values if missing
    if 'Make' not in df_user_input.columns:
        df_user_input['Make'] = data['Make'].mode()[0]  # Use most frequent value
    if 'Model' not in df_user_input.columns:
        df_user_input['Model'] = data['Model'].mode()[0]  # Use most frequent value

    # Reorder columns to match training data
    missing_cols = set(features) - set(df_user_input.columns)
    for col in missing_cols:
        df_user_input[col] = 0  # Add missing columns with placeholder values

    df_user_input = df_user_input[features]

    # Find Similar Cars
    distances, indices = model.kneighbors(df_user_input)
    recommended_cars = data.iloc[indices[0]]
    recommended_cars['Similarity'] = 1 - distances[0]  # Similarity score (1 - distance)
    recommended_cars = recommended_cars.sort_values(by='Similarity', ascending=False)

    # Check if there are any recommended cars
    if recommended_cars.empty:
        st.write('No recommendations found within the specified threshold.')
    else:
        # Inverse Transform Categorical Columns
        for col in categorical_cols:
            recommended_cars[col] = label_encoders[col].inverse_transform(recommended_cars[col])

        # Inverse Transform Numeric Columns
        recommended_cars[['Mileage', 'Year', 'Price(USD)']] = scaler.inverse_transform(recommended_cars[['Mileage', 'Year', 'Price(USD)']])

        # Display Recommendations
        st.write('Top 5 Recommended Cars:')
        st.write(recommended_cars[['Make', 'Model', 'Year', 'Mileage', 'Price(USD)', 'Engine', 'Similarity']])


Enter State ['certified', 'new', 'used'] (or press Enter to skip):  
Enter Exterior color ['Black', 'Blue', 'Brown', 'Green', 'Orange', 'Other', 'Pink', 'Purple', 'Red', 'Silver', 'White', 'Yellow'] (or press Enter to skip):  
Enter Interior color ['Black', 'Blue', 'Brown', 'Green', 'Orange', 'Other', 'Red', 'Silver', 'White', 'Yellow'] (or press Enter to skip):  
Enter Drivetrain ['4WD', 'AWD', 'FWD', 'RWD', 'Unknown'] (or press Enter to skip):  
Enter Fuel type ['CNG', 'Diesel', 'Electric', 'Flex Fuel', 'Gasoline', 'Hybrid', 'Other', 'Premium'] (or press Enter to skip):  
Enter Transmission ['Automatic', 'CVT', 'Manual', 'Other'] (or press Enter to skip):  
Enter Engine ['1.8L', '1.9L', '1L', '2.2L', '2.3L', '2.7L', '2.8L', '2.9L', '2L', '3-Cylinder', '3.1L', '3.3L', '3.4L', '3.7L', '3.8L', '3.9L', '3L', '4-Cylinder', '4.0L', '4.1L', '4.2L', '4.3L', '4.4L', '4.5L', '4.6L', '4.7L', '4.8L', '4.9L', '4L', '5-Cylinder', '5.0L', '5.2L', '5.3L', '5.4L', '5.5L', '5.6L', '5.7L', '5.8L', '5.9

Top 5 Recommended Cars:
            Make                Model    Year   Mileage  Price(USD) Engine  \
7929        Jeep            CJ-7 Base  1981.0   22887.0     16500.0   4.2L   
6583   Chevrolet      C20/K20 3/4 Ton  1969.0   78697.0     19999.0  Other   
13286    Porsche         Boxster Base  2000.0   70620.0     12995.0   2.7L   
8824   Chevrolet  Bel Air Convertible  1955.0   55053.0     99900.0  Other   
3654         Kia        Borrego LX V6  2009.0  100170.0      5450.0   3.8L   

       Similarity  
7929  -293.797887  
6583  -294.172262  
13286 -294.632641  
8824  -294.672142  
3654  -294.700711  
