In [31]:
import pandas as pd
import numpy as np

clean_car_features = pd.read_csv('../datasets/clean_car_features.csv')

In [32]:
# Feature Selection
#  "Make," "Model," "Year," "Engine Fuel Type," "Transmission Type," "Driven_Wheels," "Market Category," "Vehicle Size," and "Vehicle Style"

useful_features = ["Make", "Model", "Year", "Engine Fuel Type", "Transmission Type", "Driven_Wheels", "Market Category", "Vehicle Size", "Vehicle Style","Engine HP", "Engine Cylinders", "Number of Doors", "MSRP"]
numerical_features = ["Year", "Engine HP", "Engine Cylinders", "Number of Doors", "MSRP"]


In [33]:
# Text Data Processing
import string

# Function to clean and preprocess a text
def preprocess_text(text):
    # Step 1: Convert to lowercase
    text = text.lower()
    
    # Step 2: Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Step 3: Remove words containing numbers
    text = ' '.join([word for word in text.split() if not any(char.isdigit() for char in word)])
    
    # Step 4: Remove stop words (You would need a list of stop words for this step)
    # Example: text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

processed_car_features = clean_car_features.copy()  # Copy the original DataFrame

# Replace the ',' with a space in the Market Category column
processed_car_features['Market Category'] = processed_car_features['Market Category'].str.replace(',', ' ')

# Apply the preprocessing function to each row of the DataFrame
processed_car_features['combined_text'] = processed_car_features[useful_features].apply(lambda x: ' '.join(x.astype(str)), axis=1)
processed_car_features['combined_text'] = processed_car_features['combined_text'].apply(preprocess_text)


In [34]:
processed_car_features['combined_text'].head().values

array(['bmw series m premium unleaded required manual rear wheel drive factory tuner luxury highperformance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury performance compact convertible',
       'bmw series premium unleaded required manual rear wheel drive luxury highperformance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury performance compact coupe',
       'bmw series premium unleaded required manual rear wheel drive luxury compact convertible'],
      dtype=object)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tf_idf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to get the TF-IDF matrix
tf_idf_matrix = tf_idf_vectorizer.fit_transform(processed_car_features['combined_text'])

# The resulting tfidf_matrix is a sparse matrix containing the TF-IDF values for each term in the text.
# You can convert it to a dense array if needed: tfidf_matrix.toarray()

# Now, you can use tfidf_matrix as input for machine learning models, clustering, or other text analysis tasks.


In [36]:
from scipy.sparse import hstack

# Combine numerical features and TF-IDF vectors
combined_features = pd.concat([processed_car_features[numerical_features], pd.DataFrame(tf_idf_matrix.toarray())], axis=1)

# If you want to stack sparse matrices horizontally, you can use hstack
# combined_features = hstack((processed_car_features[numerical_features], tf_idf_matrix))

# Now, combined_features contains both the numerical features and TF-IDF vectors.
combined_features


Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,MSRP,0,1,2,3,4,...,646,647,648,649,650,651,652,653,654,655
0,2011,335.0,6.0,2.0,46135,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,2011,300.0,6.0,2.0,40650,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
2,2011,300.0,6.0,2.0,36350,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,2011,230.0,6.0,2.0,29450,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,2011,230.0,6.0,2.0,34500,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,2012,300.0,6.0,4.0,46120,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11910,2012,300.0,6.0,4.0,56670,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11911,2012,300.0,6.0,4.0,50620,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624043,0.000000
11912,2013,300.0,6.0,4.0,50920,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.621218,0.000000
