In [20]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sqlalchemy import create_engine, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import psycopg2
import joblib

In [26]:
# Read the full CSV file as a Pandas DataFrame
vehicle_df_clean = pd.read_csv('vehicles_clean_description_2.csv')
vehicle_df_clean.tail()

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description
479,Burnaby,BC,2020,Honda,Civic,29980,56000,facebook.com/marketplace/item/736089191537875/...,!!! 2020 HONDA CIVIC Sport !!! !! HONDA CERTI...
480,Burnaby,BC,2015,Honda,Civic,17985,116000,facebook.com/marketplace/item/157358407374132/...,Jim Pattison Auto Group 2015 Honda Civic EX ...
481,Langley,BC,2017,Honda,Civic,25841,94000,facebook.com/marketplace/item/805842554363760/...,PLEASE CALL OR TEXT LANGLEY AUTO LOANS FOR ASS...
482,Vancouver,BC,2012,Honda,Civic,14995,102000,facebook.com/marketplace/item/661097089253293/...,2012 Honda Civic Hybrid Sedannazar_amulet BH20...
483,Surrey,BC,2016,Honda,Civic,20980,90000,facebook.com/marketplace/item/4281251528766573...,Used CAR For Sale Clean title No accident Ca...


In [27]:
# Convert the 'Description' column to strings and replace NaN values with empty strings
vehicle_df_clean['Description'] = vehicle_df_clean['Description'].astype(str).fillna('')

In [4]:
# Tokenize the sentences using word_tokenize and then check for each word in the sentence if it matches any of 
# The target terms. If a match is found, we collect the 4 words before and after the matched word to form the matching_sentence.
# Only these sentences with a match and limited context are stored in the matching_sentences_dict dictionary.

#Create an empty list to store the matching results
matching_results_list = []

#Compile a regex pattern to match any of the target terms
target_terms = ['rebuilt', 'accident', 'clean', 'carfax', 'damage', 'car fax', 'car-fax', 'recertified']
pattern = re.compile(r'\b(?:' + '|'.join(target_terms) + r')\b', flags=re.IGNORECASE)

#Iterate through the DataFrame and find rows with the specified terms in the description
for index, row in vehicle_df_clean.iterrows():
    description = row['Description'].lower()
    url = row['URL']
    
    if pattern.search(description):
        sentences = sent_tokenize(description)
        matching_sentences = []
        for sentence in sentences:
            words = word_tokenize(sentence)
            for i, word in enumerate(words):
                if pattern.search(word):
                    start_index = max(0, i - 4)
                    end_index = min(i + 5, len(words))
                    matching_sentence = ' '.join(words[start_index:end_index])
                    matching_sentences.append(matching_sentence)
        if matching_sentences:
            matching_result = {'Index': index, 'Sentences': matching_sentences, 'URL': url}
            matching_results_list.append(matching_result)

#Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(matching_results_list)

#Save the DataFrame as a CSV file
result_df.to_csv('matching_results2.csv', index=False)

# Manually go through the descriptions in the 'matching_results2.csv' one by one and assign a label of 0 or 1 to indicate whether the entry has the "rebuilt" status or not.

# Save the updated DataFrame to a new CSV file, which will serve as our training dataset.

In [5]:
#Read the CSV file as a Pandas DataFrame
df_filtered_matching_results = pd.read_csv('matching_results_filtered.csv', index_col='Index')
df_filtered_matching_results.head()

Unnamed: 0_level_0,Description,URL,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,['driven ( approximately ) clean title comes w...,facebook.com/marketplace/item/306105255150907/...,0
19,"['clean car fax , fresh']",facebook.com/marketplace/item/1545149072689077...,0
21,"['mag , bluetooth , carfax without accident.19...",facebook.com/marketplace/item/330387509316880/...,0
27,['but has never been accident.the interviews h...,facebook.com/marketplace/item/821401316043008/...,0
31,['4cyl | fwd | rebuilt title 197.000km 1.8l 4c...,facebook.com/marketplace/item/1475916109821523...,1


In [6]:
#Filter the entries in vehicle_df_clean based on the common index values
common_indexes = df_filtered_matching_results.index
filtered_vehicle_df_clean = vehicle_df_clean.loc[common_indexes]

#Merge the "Rebuilt" column from df_filtered_matching_results to filtered_vehicle_df_clean
filtered_vehicle_df_clean = filtered_vehicle_df_clean.merge(df_filtered_matching_results[['Rebuilt']], left_index=True, right_index=True, how='left')

filtered_vehicle_df_clean.head()

Unnamed: 0_level_0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11,Halifax,NS,2010,Honda,Civic,10800,100000,facebook.com/marketplace/item/306105255150907/...,HONDA CIVIC 2010 Only 105895 kms driven (appro...,0
19,Lunenburg Municipal District,NS,2019,Honda,Civic,29000,65000,facebook.com/marketplace/item/1545149072689077...,2019 Honda Civic Sport Hatchback New Condition...,0
21,Quebec,QC,2012,Honda,Civic,9991,194000,facebook.com/marketplace/item/330387509316880/...,"Honda Civic Ex, 2012, 1.8L automatic, sunroof,...",0
27,Sherbrooke,QC,2004,Honda,Civic,2500,175000,facebook.com/marketplace/item/821401316043008/...,Honda Civic 2004 Read well before Mercrien Sun...,0
31,Halifax,NS,2010,Honda,Civic,6990,197000,facebook.com/marketplace/item/1475916109821523...,2010 Honda Civic Sdn DX-G | 1.8L 4Cyl | FWD | ...,1


# Training a Natural Language Processing (NLP) model to predict whether a car is rebuilt or not based on the listing description

In [7]:
# Data Preprocessing
X = filtered_vehicle_df_clean['Description']
y = filtered_vehicle_df_clean['Rebuilt']

In [8]:
# Feature Extraction (TF-IDF) (calculate a weight for each term in each document.)
# stop_words='english' argument removes common English stopwords (e.g., 'the', 'and', 'in', etc.)
# max_features=5000 argument limits the number of features to the top 5000 most frequent words
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [9]:
#Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [10]:
#Model Training (Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [11]:
#Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5925925925925926


In [12]:
# Define hyperparameter grid for Logistic Regression tuning
# C controls how much the model should penalize large coefficients (weights i.e. "rebuilt term") of the features 
#( large C = weaker regularization)
# Large C may lead to overfitting if the training data is noisy or has many irrelevant features.
# l1 (Lasso regularization):Useful for feature selection when you suspect that many features are irrelevant to the target 
# l2 (Ridge regularization): Rarely sets coefficients exactly to zero. Useful to avoid overfitting when we have many relevant 
#features.

param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize the Logistic Regression model
model = LogisticRegression()

# Perform Grid Search
# cv=5: data is divided into 5 folds, model will be trained on 4 folds and evaluated on the remaining fold, performance 
# metrics are then averaged
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Train the final model with the best hyperparameters on the full dataset
final_model = best_model.fit(X_tfidf, y)

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Victor\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.67619048 0.66666667 0.

In [13]:
# Make predictions on the test set
y_test_pred = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy of the best model: {accuracy}")

Accuracy of the best model: 1.0


In [14]:
# Create a DataFrame with Predicted and Actual Values
df_predictions = pd.DataFrame({'Description': X_test, 'Actual': y_test, 'Predicted': y_test_pred})

df_predictions.sample(20)

Unnamed: 0_level_0,Description,Actual,Predicted
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
154,"(0, 1313)\t0.127788524616276\n (0, 677)\t0....",0,0
331,"(0, 1071)\t0.07958366482269486\n (0, 253)\t...",1,1
196,"(0, 1270)\t0.16374633012617687\n (0, 996)\t...",1,1
319,"(0, 976)\t0.44769656250903483\n (0, 952)\t0...",1,1
146,"(0, 637)\t0.17562914123159465\n (0, 838)\t0...",1,1
165,"(0, 725)\t0.36899904278921325\n (0, 1017)\t...",0,0
312,"(0, 939)\t0.3933696265073738\n (0, 373)\t0....",1,1
345,"(0, 268)\t0.11728930697314242\n (0, 275)\t0...",0,0
278,"(0, 7)\t0.17044634167501777\n (0, 729)\t0.1...",1,1
86,"(0, 1021)\t0.24590747143122943\n (0, 200)\t...",0,0


In [22]:
# Save the final_model to a file
joblib.dump(final_model, 'rebuilt_final_model.joblib')

['rebuilt_final_model.joblib']

In [23]:
loaded_model = joblib.load('rebuilt_final_model.joblib')

In [28]:
#Make predictions on "vehicle_df_clean"

# Extract the 'Description' column from vehicle_df_clean
X = vehicle_df_clean['Description']

# Feature Extraction (TF-IDF)
# Use the same vectorizer used during training to convert the text data into TF-IDF features
X_tfidf = vectorizer.transform(X)

# Make Predictions on the new data
y_pred = loaded_model.predict(X_tfidf)

# Add the predictions to the vehicle_df_clean DataFrame
vehicle_df_clean['Rebuilt_Predicted'] = y_pred

# Display the DataFrame with the added 'Rebuilt_Predicted' column
vehicle_df_clean

Unnamed: 0,City,Province,Year,Make,Model,Price,Mileage,URL,Description,Rebuilt_Predicted
0,Salisbury Parish,NB,2003,Honda,Civic,1200,127000,facebook.com/marketplace/item/3585311225073988...,No rust no micanical issues inspected till feb...,0
1,Dieppe,NB,2017,Honda,Civic,22000,139000,facebook.com/marketplace/item/776450681149897/...,2017 Honda civic hatchback LX A/C New brakes ...,0
2,Dieppe,NB,2015,Honda,Civic,14000,170000,facebook.com/marketplace/item/3549035535373825...,***PREFERABLY TRADES***(would want something s...,0
3,Riverview,NB,2019,Honda,Civic,24999,61000,facebook.com/marketplace/item/255560547337016/...,"2019 Honda Civic EX Great fuel efficient car, ...",0
4,Moncton,NB,2018,Honda,Civic,20495,149000,facebook.com/marketplace/item/263120729766969/...,"LOADED CIVIC EX! One year warranty at no cost,...",0
...,...,...,...,...,...,...,...,...,...,...
479,Burnaby,BC,2020,Honda,Civic,29980,56000,facebook.com/marketplace/item/736089191537875/...,!!! 2020 HONDA CIVIC Sport !!! !! HONDA CERTI...,0
480,Burnaby,BC,2015,Honda,Civic,17985,116000,facebook.com/marketplace/item/157358407374132/...,Jim Pattison Auto Group 2015 Honda Civic EX ...,0
481,Langley,BC,2017,Honda,Civic,25841,94000,facebook.com/marketplace/item/805842554363760/...,PLEASE CALL OR TEXT LANGLEY AUTO LOANS FOR ASS...,0
482,Vancouver,BC,2012,Honda,Civic,14995,102000,facebook.com/marketplace/item/661097089253293/...,2012 Honda Civic Hybrid Sedannazar_amulet BH20...,0


In [30]:
# Set the option to show the full URLs in the DataFrame
pd.set_option("display.max_colwidth", None)

# Filter the DataFrame to show only entries where 'Rebuilt_Predicted' is 1
rebuilt_predicted_df = vehicle_df_clean[vehicle_df_clean['Rebuilt_Predicted'] == 1]

# Display the filtered DataFrame with full URLs
print(rebuilt_predicted_df["URL"])

23     facebook.com/marketplace/item/3254056658230013/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
41     facebook.com/marketplace/item/3363965300581410/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
43      facebook.com/marketplace/item/824922348958886/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
64      facebook.com/marketplace/item/297497582786661/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
92     facebook.com/marketplace/item/1090437038599336/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
116    facebook.com/marketplace/item/1477491013024360/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
138     facebook.com/marketplace/item/200233046084339/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
156     facebook.com/marketplace/item/679620574029759/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD
225    facebook.

In [31]:
# Save vehicle_df_clean with labels to a CSV file
vehicle_df_clean.to_csv('vehicle_df_clean_labeled_2.csv', index=False)

In [32]:
# Save vehicle_df_clean with labels to a DB

# Define the SQLAlchemy base class
Base = declarative_base()

# Define the vehicle_listings table as a SQLAlchemy model
class VehicleListing(Base):
    __tablename__ = 'vehicle_listings'
    id = Column(Integer, primary_key=True)
    City = Column(String)
    Province = Column(String)
    Year = Column(Integer)
    Make = Column(String)
    Model = Column(String)
    Price = Column(Float)
    Mileage = Column(Integer)
    URL = Column(String)
    Rebuilt_Predicted = Column(Integer)

# Create the PostgreSQL database engine
engine = create_engine('postgresql://postgres:140494@localhost:5433/vehicle_clean_labeled_2')

# Create the tables in the database
Base.metadata.create_all(engine)

# Create a session to interact with the database
Session = sessionmaker(bind=engine)
session = Session()

# Insert data into the table
for _, row in vehicle_df_clean.iterrows():
    vehicle = VehicleListing(City=row['City'], Province=row['Province'], Year=row['Year'], Make=row['Make'],
                             Model=row['Model'], Price=row['Price'], Mileage=row['Mileage'], URL=row['URL'], Rebuilt_Predicted=row['Rebuilt_Predicted'])
    session.add(vehicle)

# Commit the changes
session.commit()

# Close the session
session.close()