In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
#Read the full CSV file as a Pandas DataFrame
vehicle_df_clean = pd.read_csv('vehicles_clean_description.csv')
vehicle_df_clean.head()

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description
0,Dieppe,NB,2015,Honda,Civic,14000,170000,facebook.com/marketplace/item/3549035535373825...,***PREFERABLY TRADES***(would want something s...
1,Riverview,NB,2019,Honda,Civic,24999,61000,facebook.com/marketplace/item/255560547337016/...,"2019 Honda Civic EX Great fuel efficient car, ..."
2,Moncton,NB,2018,Honda,Civic,20495,149000,facebook.com/marketplace/item/263120729766969/...,"LOADED CIVIC EX! One year warranty at no cost,..."
3,Charlottetown,PE,2010,Honda,Civic,6500,159000,facebook.com/marketplace/item/249495091262857/...,New brake rotors and pad New battery .Winter t...
4,Halifax,NS,2011,Honda,Civic,9000,125000,facebook.com/marketplace/item/183227231277501/...,-Honda civic in amazing physical condition -A...


In [3]:
# Convert the 'Description' column to strings and replace NaN values with empty strings
vehicle_df_clean['Description'] = vehicle_df_clean['Description'].astype(str).fillna('')

In [4]:
# Tokenize the sentences using word_tokenize and then check for each word in the sentence if it matches any of 
# The target terms. If a match is found, we collect the 4 words before and after the matched word to form the matching_sentence.
# Only these sentences with a match and limited context are stored in the matching_sentences_dict dictionary.

#Create an empty list to store the matching results
matching_results_list = []

#Compile a regex pattern to match any of the target terms
target_terms = ['rebuilt', 'accident', 'clean', 'carfax', 'damage', 'car fax', 'car-fax', 'recertified']
pattern = re.compile(r'\b(?:' + '|'.join(target_terms) + r')\b', flags=re.IGNORECASE)

#Iterate through the DataFrame and find rows with the specified terms in the description
for index, row in vehicle_df_clean.iterrows():
    description = row['Description'].lower()
    url = row['URL']
    
    if pattern.search(description):
        sentences = sent_tokenize(description)
        matching_sentences = []
        for sentence in sentences:
            words = word_tokenize(sentence)
            for i, word in enumerate(words):
                if pattern.search(word):
                    start_index = max(0, i - 4)
                    end_index = min(i + 5, len(words))
                    matching_sentence = ' '.join(words[start_index:end_index])
                    matching_sentences.append(matching_sentence)
        if matching_sentences:
            matching_result = {'Index': index, 'Sentences': matching_sentences, 'URL': url}
            matching_results_list.append(matching_result)

#Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(matching_results_list)

#Save the DataFrame as a CSV file
result_df.to_csv('matching_results2.csv', index=False)

# Manually go through the descriptions in the 'matching_results2.csv' one by one and assign a label of 0 or 1 to indicate whether the entry has the "rebuilt" status or not.

# Save the updated DataFrame to a new CSV file, which will serve as our training dataset.

In [5]:
#Read the CSV file as a Pandas DataFrame
df_filtered_matching_results = pd.read_csv('matching_results_filtered.csv', index_col='Index')
df_filtered_matching_results.head()

Unnamed: 0_level_0,Description,URL,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,['driven ( approximately ) clean title comes w...,facebook.com/marketplace/item/306105255150907/...,0
19,"['clean car fax , fresh']",facebook.com/marketplace/item/1545149072689077...,0
21,"['mag , bluetooth , carfax without accident.19...",facebook.com/marketplace/item/330387509316880/...,0
27,['but has never been accident.the interviews h...,facebook.com/marketplace/item/821401316043008/...,0
31,['4cyl | fwd | rebuilt title 197.000km 1.8l 4c...,facebook.com/marketplace/item/1475916109821523...,1


In [6]:
#Filter the entries in vehicle_df_clean based on the common index values
common_indexes = df_filtered_matching_results.index
filtered_vehicle_df_clean = vehicle_df_clean.loc[common_indexes]

#Merge the "Rebuilt" column from df_filtered_matching_results to filtered_vehicle_df_clean
filtered_vehicle_df_clean = filtered_vehicle_df_clean.merge(df_filtered_matching_results[['Rebuilt']], left_index=True, right_index=True, how='left')

filtered_vehicle_df_clean.head()

Unnamed: 0_level_0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11,Halifax,NS,2010,Honda,Civic,10800,100000,facebook.com/marketplace/item/306105255150907/...,HONDA CIVIC 2010 Only 105895 kms driven (appro...,0
19,Lunenburg Municipal District,NS,2019,Honda,Civic,29000,65000,facebook.com/marketplace/item/1545149072689077...,2019 Honda Civic Sport Hatchback New Condition...,0
21,Quebec,QC,2012,Honda,Civic,9991,194000,facebook.com/marketplace/item/330387509316880/...,"Honda Civic Ex, 2012, 1.8L automatic, sunroof,...",0
27,Sherbrooke,QC,2004,Honda,Civic,2500,175000,facebook.com/marketplace/item/821401316043008/...,Honda Civic 2004 Read well before Mercrien Sun...,0
31,Halifax,NS,2010,Honda,Civic,6990,197000,facebook.com/marketplace/item/1475916109821523...,2010 Honda Civic Sdn DX-G | 1.8L 4Cyl | FWD | ...,1


# Training a Natural Language Processing (NLP) model to predict whether a car is rebuilt or not based on the listing description

In [7]:
# Data Preprocessing
X = filtered_vehicle_df_clean['Description']
y = filtered_vehicle_df_clean['Rebuilt']

In [8]:
#Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000) # You can adjust max_features if needed
X_tfidf = vectorizer.fit_transform(X)

In [9]:
#Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [10]:
#Model Training (Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [11]:
#Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5925925925925926


In [12]:
# Define hyperparameter grid for Logistic Regression
param_grid = {
    'C': [10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize the Logistic Regression model
model = LogisticRegression()

# Perform Grid Search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Train the final model with the best hyperparameters on the full dataset
final_model = best_model.fit(X_tfidf, y)

5 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



In [13]:
# Make predictions on the test set
y_test_pred = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy of the best model: {accuracy}")

Accuracy of the best model: 1.0


In [14]:
# Step 6: Create a DataFrame with Predicted and Actual Values
df_predictions = pd.DataFrame({'Description': X_test, 'Actual': y_test, 'Predicted': y_test_pred})

df_predictions.sample(20)

Unnamed: 0_level_0,Description,Actual,Predicted
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
114,"(0, 763)\t0.22776178902354552\n (0, 1426)\t...",1,1
229,"(0, 436)\t0.5142302409478362\n (0, 899)\t0....",0,0
319,"(0, 976)\t0.44769656250903483\n (0, 952)\t0...",1,1
178,"(0, 561)\t0.294323572750863\n (0, 484)\t0.2...",0,0
331,"(0, 1071)\t0.07958366482269486\n (0, 253)\t...",1,1
201,"(0, 420)\t0.20665864462184405\n (0, 1095)\t...",0,0
154,"(0, 1313)\t0.127788524616276\n (0, 677)\t0....",0,0
263,"(0, 541)\t0.27396017341014417\n (0, 191)\t0...",1,1
90,"(0, 1202)\t0.11089037058577687\n (0, 994)\t...",0,0
146,"(0, 637)\t0.17562914123159465\n (0, 838)\t0...",1,1


In [15]:
# Step 1: Extract the 'Description' column from vehicle_df_clean
X = vehicle_df_clean['Description']

# Step 2: Feature Extraction (TF-IDF)
# Use the same vectorizer used during training to convert the text data into TF-IDF features
X_tfidf = vectorizer.transform(X)

# Step 3: Make Predictions on the new data
y_pred = final_model.predict(X_tfidf)

# Step 4: Add the predictions to the vehicle_df_clean DataFrame
vehicle_df_clean['Rebuilt_Predicted'] = y_pred

# Display the DataFrame with the added 'Rebuilt_Predicted' column
vehicle_df_clean

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt_Predicted
0,Dieppe,NB,2015,Honda,Civic,14000,170000,facebook.com/marketplace/item/3549035535373825...,***PREFERABLY TRADES***(would want something s...,0
1,Riverview,NB,2019,Honda,Civic,24999,61000,facebook.com/marketplace/item/255560547337016/...,"2019 Honda Civic EX Great fuel efficient car, ...",0
2,Moncton,NB,2018,Honda,Civic,20495,149000,facebook.com/marketplace/item/263120729766969/...,"LOADED CIVIC EX! One year warranty at no cost,...",0
3,Charlottetown,PE,2010,Honda,Civic,6500,159000,facebook.com/marketplace/item/249495091262857/...,New brake rotors and pad New battery .Winter t...,0
4,Halifax,NS,2011,Honda,Civic,9000,125000,facebook.com/marketplace/item/183227231277501/...,-Honda civic in amazing physical condition -A...,0
...,...,...,...,...,...,...,...,...,...,...
346,Vancouver,BC,2016,Honda,Civic,23250,74000,facebook.com/marketplace/item/1051329789364537...,2016 Blue Honda Civic EXT trim (turbo) Used wi...,0
347,Surrey,BC,2017,Honda,Civic,23299,61000,facebook.com/marketplace/item/280979407867602/...,CALL Abhay @ 604xxx657xxxx2414 TO CONFIRM AVA...,0
348,Surrey,BC,2007,Honda,Civic,5900,103000,facebook.com/marketplace/item/822504366053233/...,Clean title Runs strong and smooth Ac cold ver...,0
349,New Westminster,BC,2014,Honda,Civic,14990,93000,facebook.com/marketplace/item/941618010468070/...,"1.8L 4cyl 93,000 km only, some of the optio...",1


In [16]:
# Set the display option to show full URLs without truncation
pd.set_option('display.max_colwidth', None)

# Filter the DataFrame to show only entries where 'Rebuilt_Predicted' is 1
rebuilt_predicted_df = vehicle_df_clean[vehicle_df_clean['Rebuilt_Predicted'] == 1]

# Display the filtered DataFrame with full URLs
print(rebuilt_predicted_df["URL"])

31     facebook.com/marketplace/item/1475916109821523/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
38      facebook.com/marketplace/item/115076404993959/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
58     facebook.com/marketplace/item/9671371072904445/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
81      facebook.com/marketplace/item/296297749556678/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
95     facebook.com/marketplace/item/3363965300581410/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
96      facebook.com/marketplace/item/297497582786661/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
114    facebook.com/marketplace/item/1477491013024360/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
119    facebook.com/marketplace/item/7586079374772761/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
129     facebook

In [26]:
# Filter out the rows where 'Rebuilt_Predicted' is 1 and 'Rebuilt' is NaN
filtered_df = merged_df[(merged_df['Rebuilt_Predicted'] == 1) & (merged_df['Rebuilt'].isnull())]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt_Predicted,Rebuilt
38,Edmundston,NB,2017,Honda,Civic,21297,82000,facebook.com/marketplace/item/115076404993959/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD,"Civic ex 2017 with only 82,952km!7 -year or 160,000km warranty.Funding available up to 72 months.Good as bad credit do not worry you we will have you approved.Good or bad credit we will have a solution for your approval.If you have any questions don't hesitate to contact me.I give my number via Messenger.",1,
236,Moose Jaw,SK,2018,Honda,Civic,24500,78000,facebook.com/marketplace/item/249218911382261/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD,Selling my 2018 Honda Civic LX since I have another vehicle to drive. Currently has 78060 km on it. Excellent fuel economy (my avg is about 5.9L/100 km as seen in the photo). I purchased the vehicle from SGI due to some hail damange- it is NOT branded (NOT a rebuilt). The vehicle runs perfectly - has some hail damage on the hood which does not affect it's performance. Mechanical inspection completed on May 17/23. The vehicle has the following: 4 tires replaced May 21/23 Replaced both front windshield wipers with new ones May 12/23 2.0L I4 FI DOHC 16V NF4 automatic transmission FWD Remote starter Cruise Control Air Conditioning Back-up camera Heated seats Power and heated mirrors Power locks Power windows (one-touch up and down) Bluetooth Apple Carplay 2 USB ports $24500 OBO,1,
