In [1]:
import pandas as pd
import numpy as np

clean_car_features = pd.read_csv('../datasets/clean_car_features.csv')

In [2]:
# Feature Selection

useful_features = ["Make", "Model", "Year", "Engine Fuel Type", "Transmission Type", "Driven_Wheels", "Market Category", "Vehicle Size", "Vehicle Style","Engine HP", "Engine Cylinders", "Number of Doors", "MSRP"]

# Defining all Numerical Features
numerical_features = ["Year", "Engine HP", "Engine Cylinders", "Number of Doors", "MSRP"]


In [3]:
# Text Data Processing
import string

# Function to clean and preprocess a text
def preprocess_text(text):
    # Step 1: Convert to lowercase
    text = text.lower()
    
    # Step 2: Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Step 3: Remove words containing numbers
    text = ' '.join([word for word in text.split() if not any(char.isdigit() for char in word)])
    
    # Step 4: Remove stop words (You would need a list of stop words for this step)
    # Example: text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

processed_car_features = clean_car_features.copy()  # Copy the original DataFrame

# Replace the ',' with a space in the Market Category column
processed_car_features['Market Category'] = processed_car_features['Market Category'].str.replace(',', ' ')

# Apply the preprocessing function to each row of the DataFrame
processed_car_features['combined_text'] = processed_car_features[useful_features].apply(lambda x: ' '.join(x.astype(str)), axis=1)
processed_car_features['combined_text'] = processed_car_features['combined_text'].apply(preprocess_text)


In [4]:
processed_car_features['combined_text'].head().values

array(['bmw series m premium unleaded required manual rear wheel drive factory tuner luxury highperformance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury performance compact convertible',
       'bmw series premium unleaded required manual rear wheel drive luxury highperformance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury performance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury compact convertible'],
      dtype=object)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tf_idf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to get the TF-IDF matrix
tf_idf_matrix = tf_idf_vectorizer.fit_transform(processed_car_features['combined_text'])

# The resulting tfidf_matrix is a sparse matrix containing the TF-IDF values for each term in the text.


In [6]:
from scipy.sparse import hstack

# Combine numerical features and TF-IDF vectors
combined_features = pd.concat([processed_car_features[numerical_features], pd.DataFrame(tf_idf_matrix.toarray())], axis=1)

# If you want to stack sparse matrices horizontally, you can use hstack
# combined_features = hstack((processed_car_features[numerical_features], tf_idf_matrix))

# Now, combined_features contains both the numerical features and TF-IDF vectors.
combined_features


Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,MSRP,0,1,2,3,4,...,646,647,648,649,650,651,652,653,654,655
0,2011,335.0,6.0,2.0,46135,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,2011,300.0,6.0,2.0,40650,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
2,2011,300.0,6.0,2.0,36350,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,2011,230.0,6.0,2.0,29450,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,2011,230.0,6.0,2.0,34500,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,2012,300.0,6.0,4.0,46120,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11910,2012,300.0,6.0,4.0,56670,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11911,2012,300.0,6.0,4.0,50620,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11912,2013,300.0,6.0,4.0,50920,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.621218,0.000000


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
similarity_matrix = cosine_similarity(combined_features)

# The resulting similarity_matrix will contain pairwise cosine similarity scores for all car items.

# convert the similarity matrix to a DataFrame for better readability(optional)
similarity_df = pd.DataFrame(similarity_matrix, columns=range(len(combined_features)), index=range(len(combined_features)))
#  Now, you can use similarity_matrix or similarity_df for recommendations or other similarity-based tasks


In [8]:
# Define a function to generate recommendations for a given car
def generate_recommendations(car_id, combined_features, num_recommendations=5):
    # Calculate similarity to all other cars
    similarity_scores = cosine_similarity(combined_features)
    
    # Get the similarity scores for the given car
    car_similarity_scores = similarity_scores[car_id]
    
    # Sort cars based on similarity scores (in descending order)
    sorted_indices = car_similarity_scores.argsort()[::-1]
    
    # Select the top N cars as recommendations
    top_recommendations = sorted_indices[1:num_recommendations + 1]  # Exclude the given car
    
    return top_recommendations

# Example: Generate recommendations for a specific car (assuming car_id is the index of the car)
car_id = 0  # index of the car you want recommendations for
num_recommendations = 5  # Number of recommendations to generate

# Generate recommendations for the given car
recommendations = generate_recommendations(car_id, combined_features, num_recommendations)

# Get the details of recommended cars from the original dataset
car_features = pd.read_csv('../datasets/clean_car_features.csv')
recommended_cars = car_features.iloc[recommendations]
recommended_cars


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
50,BMW,2 Series,2017,premium unleaded (recommended),335.0,6.0,AUTOMATIC,all wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,31,21,3916,46450
7835,Infiniti,Q60 Coupe,2014,premium unleaded (recommended),330.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Midsize,Coupe,25,17,190,46050
5114,Infiniti,G Sedan,2012,premium unleaded (recommended),328.0,6.0,AUTOMATIC,all wheel drive,4.0,Luxury,Midsize,Sedan,25,18,190,46050
7838,Infiniti,Q60 Coupe,2015,premium unleaded (recommended),330.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Midsize,Coupe,25,17,190,46050
1562,Cadillac,ATS,2017,regular unleaded,335.0,6.0,AUTOMATIC,all wheel drive,4.0,"Luxury,High-Performance",Midsize,Sedan,27,19,1624,45995


In [9]:
# Define a function to generate recommendations based on keywords
def generate_recommendations_based_on_keywords(keywords, combined_features, num_recommendations=5):
    # Calculate similarity to all other cars
    similarity_scores = cosine_similarity(combined_features)
    
    # Create a mapping from keywords to feature indices
    feature_columns = combined_features.columns
    feature_to_index = {feature: index for index, feature in enumerate(feature_columns)}
    
    # Create a dummy car feature vector based on the provided keywords
    dummy_car_feature = [0] * len(combined_features.columns)
    for keyword in keywords:
        # Assign a value to the corresponding feature based on the keyword
        if keyword in feature_to_index:
            index = feature_to_index[keyword]
            dummy_car_feature[index] = 1  # You can use a different value if needed
    
    # Calculate similarity between the dummy car feature and all other cars
    dummy_car_similarity_scores = cosine_similarity([dummy_car_feature], combined_features)
    
    # Get the similarity scores for the dummy car
    dummy_car_similarity_scores = dummy_car_similarity_scores[0]
    
    # Sort cars based on similarity scores (in descending order)
    sorted_indices = dummy_car_similarity_scores.argsort()[::-1]
    
    # Select the top N cars as recommendations
    top_recommendations = sorted_indices[:num_recommendations]
    
    return top_recommendations

# Example: Generate recommendations based on keywords
keywords = ['luxury', 'high-performance', 'compact']
num_recommendations = 5  # Number of recommendations to generate

# Generate recommendations based on keywords
recommendations = generate_recommendations_based_on_keywords(keywords, combined_features, num_recommendations)

# Get the details of recommended cars from the original dataset
car_features = pd.read_csv('../datasets/clean_car_features.csv')
recommended_cars = car_features.iloc[recommendations]
recommended_cars


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
11913,Lincoln,Zephyr,2006,regular unleaded,221.0,6.0,AUTOMATIC,front wheel drive,4.0,Luxury,Midsize,Sedan,26,17,61,28995
3968,Mitsubishi,Endeavor,2010,premium unleaded (recommended),225.0,6.0,AUTOMATIC,all wheel drive,4.0,Crossover,Midsize,4dr SUV,19,15,436,31499
3977,Hyundai,Entourage,2008,regular unleaded,250.0,6.0,AUTOMATIC,front wheel drive,4.0,UNKNOWN,Midsize,Passenger Minivan,23,16,1439,23995
3976,Hyundai,Entourage,2008,regular unleaded,250.0,6.0,AUTOMATIC,front wheel drive,4.0,UNKNOWN,Midsize,Passenger Minivan,23,16,1439,29895
3975,Hyundai,Entourage,2007,regular unleaded,242.0,6.0,AUTOMATIC,front wheel drive,4.0,UNKNOWN,Midsize,Passenger Minivan,23,16,1439,23895


# Summary

I noticed the results aren't very good, however they are passable. Key words are missing, but the recommendations are comparable to the car that was inputted. I think this is a good start, but there is a lot of room for improvement. I think the next step would be to add more features to the dataset and then try to recommend based on those features, or to be more strict on the columns I should use.
