In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer using scikit-learn's default settings.
tf_idf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the 'combined_text' column and encode that same column
# in one call.
tf_idf_matrix = tf_idf_vectorizer.fit_transform(
    processed_car_features['combined_text']
)

# tf_idf_matrix is a sparse matrix holding the TF-IDF weight of each term for
# each text entry. Call tf_idf_matrix.toarray() when a dense array is needed.

# tf_idf_matrix can be fed to machine learning models, clustering, or other
# text analysis steps.


In [None]:
from scipy.sparse import hstack

# Combine numerical features and TF-IDF vectors into one dense feature table.
#
# pd.concat(axis=1) aligns rows by index label, not by position. The TF-IDF
# frame is therefore built on the same index as processed_car_features;
# with a fresh 0..n-1 index, any non-default index on processed_car_features
# (e.g. after rows were filtered without reset_index) would pair rows with the
# wrong text vector and pad the result with NaN.
tf_idf_features = pd.DataFrame(
    tf_idf_matrix.toarray(),
    index=processed_car_features.index,
)
combined_features = pd.concat(
    [processed_car_features[numerical_features], tf_idf_features],
    axis=1,
)

# Sparse alternative: stack the matrices horizontally with hstack instead of
# densifying the TF-IDF matrix.
# combined_features = hstack((processed_car_features[numerical_features], tf_idf_matrix))

# combined_features now holds the numerical columns followed by the TF-IDF columns.
combined_features


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in combined_features.
# Entry [i, j] is the similarity between row i and row j.
similarity_matrix = cosine_similarity(combined_features)

# Optional DataFrame view of the same scores for easier inspection.
# Rows and columns get pandas' default 0..n-1 labels, i.e. the row positions
# in combined_features.
similarity_df = pd.DataFrame(similarity_matrix)
# Either similarity_matrix or similarity_df can be used for recommendations or
# other similarity-based tasks.


In [None]:
# Define a function to generate recommendations for a given car
def generate_recommendations(car_id, combined_features, num_recommendations=5):
    """Return the row positions of the cars most similar to one given car.

    Similarity is the cosine similarity between the feature row of ``car_id``
    and every row of ``combined_features``.

    Parameters
    ----------
    car_id : int
        Row position of the query car in ``combined_features``. Negative
        values count from the end.
    combined_features : DataFrame, ndarray, sparse matrix or list of rows
        Feature table with one row per car; anything accepted by
        ``cosine_similarity``.
    num_recommendations : int, default 5
        Maximum number of cars to return. Values <= 0 yield an empty result.

    Returns
    -------
    numpy.ndarray
        Row positions ordered from most to least similar. The query car is
        never included. Fewer than ``num_recommendations`` positions are
        returned when there are not enough other cars.

    Raises
    ------
    IndexError
        If ``car_id`` is outside the range of available rows.
    """
    shape = getattr(combined_features, "shape", None)
    n_cars = shape[0] if shape is not None else len(combined_features)
    if not -n_cars <= car_id < n_cars:
        raise IndexError(f"car_id {car_id} is out of bounds for {n_cars} cars")
    # Normalise negative positions so the query car can be filtered out below.
    car_position = car_id % n_cars

    # Take the query row as a one-row 2-D slice (positional for DataFrames).
    row_indexer = getattr(combined_features, "iloc", combined_features)
    query_row = row_indexer[car_position:car_position + 1]

    # Score only the query car against all cars instead of building the full
    # n x n similarity matrix just to read one row from it.
    car_similarity_scores = cosine_similarity(query_row, combined_features)[0]

    # Descending order; the stable sort keeps tied cars in ascending position
    # order so results are deterministic.
    sorted_indices = (-car_similarity_scores).argsort(kind="stable")

    # Drop the query car explicitly: with duplicate or tied rows it is not
    # guaranteed to be the first entry of the ranking.
    candidates = sorted_indices[sorted_indices != car_position]

    return candidates[:max(num_recommendations, 0)]

# Example usage: recommend cars similar to one chosen car.
# car_id is used as a row position into combined_features.
car_id = 0  # Position of the car to get recommendations for
num_recommendations = 5  # How many recommendations to generate

# Row positions of the recommended cars
recommendations = generate_recommendations(
    car_id=car_id,
    combined_features=combined_features,
    num_recommendations=num_recommendations,
)

# Look up the recommended rows by position in the cleaned car dataset.
# NOTE(review): assumes the CSV rows are in the same order as the rows of
# combined_features — confirm.
car_features = pd.read_csv('../datasets/clean_car_features.csv')
recommended_cars = car_features.iloc[recommendations]
