In [24]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [7]:
#Read the full CSV file as a Pandas DataFrame
vehicle_df_clean = pd.read_csv('vehicles_clean_description.csv')
vehicle_df_clean.head()

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description
0,Dieppe,NB,2015,Honda,Civic,14000,170000,facebook.com/marketplace/item/3549035535373825...,***PREFERABLY TRADES***(would want something s...
1,Riverview,NB,2019,Honda,Civic,24999,61000,facebook.com/marketplace/item/255560547337016/...,"2019 Honda Civic EX Great fuel efficient car, ..."
2,Moncton,NB,2018,Honda,Civic,20495,149000,facebook.com/marketplace/item/263120729766969/...,"LOADED CIVIC EX! One year warranty at no cost,..."
3,Charlottetown,PE,2010,Honda,Civic,6500,159000,facebook.com/marketplace/item/249495091262857/...,New brake rotors and pad New battery .Winter t...
4,Halifax,NS,2011,Honda,Civic,9000,125000,facebook.com/marketplace/item/183227231277501/...,-Honda civic in amazing physical condition -A...


In [10]:
# Convert the 'Description' column to strings and replace NaN values with empty strings
vehicle_df_clean['Description'] = vehicle_df_clean['Description'].astype(str).fillna('')

In [11]:
# Tokenize the sentences using word_tokenize and then check for each word in the sentence if it matches any of 
# The target terms. If a match is found, we collect the 4 words before and after the matched word to form the matching_sentence.
# Only these sentences with a match and limited context are stored in the matching_sentences_dict dictionary.

#Create an empty list to store the matching results
matching_results_list = []

#Compile a regex pattern to match any of the target terms
target_terms = ['rebuilt', 'accident', 'clean', 'carfax', 'damage', 'car fax', 'car-fax', 'recertified']
pattern = re.compile(r'\b(?:' + '|'.join(target_terms) + r')\b', flags=re.IGNORECASE)

#Iterate through the DataFrame and find rows with the specified terms in the description
for index, row in vehicle_df_clean.iterrows():
    description = row['Description'].lower()
    url = row['URL']
    
    if pattern.search(description):
        sentences = sent_tokenize(description)
        matching_sentences = []
        for sentence in sentences:
            words = word_tokenize(sentence)
            for i, word in enumerate(words):
                if pattern.search(word):
                    start_index = max(0, i - 4)
                    end_index = min(i + 5, len(words))
                    matching_sentence = ' '.join(words[start_index:end_index])
                    matching_sentences.append(matching_sentence)
        if matching_sentences:
            matching_result = {'Index': index, 'Sentences': matching_sentences, 'URL': url}
            matching_results_list.append(matching_result)

#Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(matching_results_list)

#Save the DataFrame as a CSV file
result_df.to_csv('matching_results2.csv', index=False)

# Manually go through the descriptions in the 'matching_results2.csv' one by one and assign a label of 0 or 1 to indicate whether the entry has the "rebuilt" status or not.

# Save the updated DataFrame to a new CSV file, which will serve as our training dataset.

In [13]:
#Read the CSV file as a Pandas DataFrame
df_filtered_matching_results = pd.read_csv('matching_results_filtered.csv', index_col='Index')
df_filtered_matching_results.head()

Unnamed: 0_level_0,Description,URL,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,['driven ( approximately ) clean title comes w...,facebook.com/marketplace/item/306105255150907/...,0
19,"['clean car fax , fresh']",facebook.com/marketplace/item/1545149072689077...,0
21,"['mag , bluetooth , carfax without accident.19...",facebook.com/marketplace/item/330387509316880/...,0
27,['but has never been accident.the interviews h...,facebook.com/marketplace/item/821401316043008/...,0
31,['4cyl | fwd | rebuilt title 197.000km 1.8l 4c...,facebook.com/marketplace/item/1475916109821523...,1


In [23]:
#Filter the entries in vehicle_df_clean based on the common index values
common_indexes = df_filtered_matching_results.index
filtered_vehicle_df_clean = vehicle_df_clean.loc[common_indexes]

#Merge the "Rebuilt" column from df_filtered_matching_results to filtered_vehicle_df_clean
filtered_vehicle_df_clean = filtered_vehicle_df_clean.merge(df_filtered_matching_results[['Rebuilt']], left_index=True, right_index=True, how='left')

filtered_vehicle_df_clean.head()

Unnamed: 0_level_0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11,Halifax,NS,2010,Honda,Civic,10800,100000,facebook.com/marketplace/item/306105255150907/...,HONDA CIVIC 2010 Only 105895 kms driven (appro...,0
19,Lunenburg Municipal District,NS,2019,Honda,Civic,29000,65000,facebook.com/marketplace/item/1545149072689077...,2019 Honda Civic Sport Hatchback New Condition...,0
21,Quebec,QC,2012,Honda,Civic,9991,194000,facebook.com/marketplace/item/330387509316880/...,"Honda Civic Ex, 2012, 1.8L automatic, sunroof,...",0
27,Sherbrooke,QC,2004,Honda,Civic,2500,175000,facebook.com/marketplace/item/821401316043008/...,Honda Civic 2004 Read well before Mercrien Sun...,0
31,Halifax,NS,2010,Honda,Civic,6990,197000,facebook.com/marketplace/item/1475916109821523...,2010 Honda Civic Sdn DX-G | 1.8L 4Cyl | FWD | ...,1


# Training a Natural Language Processing (NLP) model to predict whether a car is rebuilt or not based on the listing description

In [34]:
# Data Preprocessing
X = filtered_vehicle_df_clean['Description']
y = filtered_vehicle_df_clean['Rebuilt']

In [35]:
#Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000) # You can adjust max_features if needed
X_tfidf = vectorizer.fit_transform(X)

In [36]:
#Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [37]:
#Model Training (Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [38]:
#Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5925925925925926
