# Dataset Preprocessing

In [1]:
import requests
from zipfile import ZipFile
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import torch

In [2]:
url = url = "https://static.nhtsa.gov/odi/ffdd/rcl/FLAT_RCL.zip"
download_path = "FLAT_RCL.zip"

In [3]:
print(f"Downloading from {url}...")
response = requests.get(url, stream=True)
if response.status_code == 200:
    with open(download_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            file.write(chunk)
    print(f"Downloaded to {download_path}.")
else:
    print(f"Failed to download file. HTTP Status Code: {response.status_code}")
    response.raise_for_status()

Downloading from https://static.nhtsa.gov/odi/ffdd/rcl/FLAT_RCL.zip...
Downloaded to FLAT_RCL.zip.


In [4]:
extract_path = "/kaggle/working/"

In [5]:
print(f"Extracting {download_path} to {extract_path}...")
with ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print(f"Extracted to {extract_path}.")

Extracting FLAT_RCL.zip to /kaggle/working/...
Extracted to /kaggle/working/.


In [6]:
# reading text file into csv for preprocessing
input_file = "/kaggle/working/FLAT_RCL.txt"
output_file = "/kaggle/working/preprocessed_data.csv"
columns = [f"Column_{i}" for i in range(27)]  # Placeholder column names
df = pd.read_csv(input_file, delimiter="\t", header=None, names=columns, on_bad_lines='skip', engine='python')

In [7]:
df.head()

Unnamed: 0,Column_0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_17,Column_18,Column_19,Column_20,Column_21,Column_22,Column_23,Column_24,Column_25,Column_26
0,1,02V288000,FORD,FOCUS,2000,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215021000000202,,,
1,2,02V288000,FORD,FOCUS,2001,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215022000000202,,,
2,3,02V236000,JAYCO,FT EAGLE 10 SG,2003,,EQUIPMENT:OTHER:LABELS,"JAYCO, INC.",20020730.0,20020813.0,...,,,"ON CERTAIN FOLDING TENT CAMPERS, THE FEDERAL C...","IF THE TIRES WERE INFLATED TO 80 PSI, THEY COU...",OWNERS WILL BE MAILED CORRECT LABELS FOR INSTA...,"ALSO, CUSTOMERS CAN CONTACT THE NATIONAL HIGHW...",000015210000106403000000349,,,
3,4,02V237000,HOLIDAY RAMBLER,ENDEAVOR,2000,,STRUCTURE,MONACO COACH CORP.,,,...,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000083965000000272,,,
4,5,02V237000,HOLIDAY RAMBLER,ENDEAVOR,1999,,STRUCTURE,MONACO COACH CORP.,,,...,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000080938000000272,,,


#### Visualising the columns to select which one is needed

In [8]:
i = 0
for column in columns:
    print(f"Column {i}: {df[column].iloc[1000]}")
    i+=1

Column 0: 1001
Column 1: 02V293000
Column 2: DODGE
Column 3: GRAND CARAVAN
Column 4: 1997
Column 5: B24
Column 6: AIR BAGS:FRONTAL
Column 7: DAIMLERCHRYSLER CORPORATION
Column 8: 19950101.0
Column 9: 19980228.0
Column 10: V
Column 11: 1500521.0
Column 12: 20021118.0
Column 13: ODI
Column 14: Chrysler (FCA US, LLC)
Column 15: 20021107
Column 16: 20021107
Column 17: nan
Column 18: nan
Column 19: ON CERTAIN MINI VANS, THE CLOCKSPRING ASSEMBLY MAY HAVE BEEN WOUND INCORRECTLY DURING THE VEHICLE ASSEMBLY PROCESS.
Column 21: DEALERS WILL REPLACE THE CLOCKSPRING ASSEMBLY ON ALL VEHICLES WITH 70,000 MILES OR LESS.  AN EXTENDED LIFETIME WARRANTY WILL ALSO BE PLACED ON THIS COMPONENT FOR ALL AFFECTED VEHICLES, REGARDLESS OF MILEAGE.  DAIMLERCHRYSLER WILL ALSO REIMBURSE OWNERS WHO HAVE PAID TO HAVE THE CLOCKSPRING REPLACED ON THEIR VEHICLES.  OWNER NOTIFICATION BEGAN NOVEMBER 18, 2002.    OWNERS WHO TAKE THEIR VEHICLES TO AN AUTHORIZED DEALER ON AN AGREED UPON SERVICE DATE AND DO NOT RECEIVE THE F

In [9]:
for column in columns:
    print(f"Number of unique values in {column} are {df[column].nunique()} and number of nulls are {df[column].isna().sum()}")

Number of unique values in Column_0 are 291330 and number of nulls are 0
Number of unique values in Column_1 are 28632 and number of nulls are 0
Number of unique values in Column_2 are 3355 and number of nulls are 0
Number of unique values in Column_3 are 18429 and number of nulls are 0
Number of unique values in Column_4 are 79 and number of nulls are 0
Number of unique values in Column_5 are 11308 and number of nulls are 122200
Number of unique values in Column_6 are 620 and number of nulls are 0
Number of unique values in Column_7 are 3818 and number of nulls are 5
Number of unique values in Column_8 are 6823 and number of nulls are 170741
Number of unique values in Column_9 are 6864 and number of nulls are 170499
Number of unique values in Column_10 are 6 and number of nulls are 0
Number of unique values in Column_11 are 9937 and number of nulls are 286
Number of unique values in Column_12 are 10560 and number of nulls are 34512
Number of unique values in Column_13 are 3 and number

NOTE: Selecting columns 2, 3, 4, 6, 19, 20, and 21 as most of other columns have either irrelevant information or lot of null values

In [10]:
# defining column names and select columns
df = df[['Column_2', 'Column_3', 'Column_4', 'Column_19', 'Column_20', 'Column_21']]
df.rename(columns={'Column_2': 'make',
                   'Column_3': 'model',
                   'Column_4': 'year',
                   'Column_19': 'defect',
                   'Column_20': 'consequence',
                   'Column_21': 'corrective'}, inplace=True)
df.head()

Unnamed: 0,make,model,year,defect,consequence,corrective
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...
1,FORD,FOCUS,2001,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...
2,JAYCO,FT EAGLE 10 SG,2003,"ON CERTAIN FOLDING TENT CAMPERS, THE FEDERAL C...","IF THE TIRES WERE INFLATED TO 80 PSI, THEY COU...",OWNERS WILL BE MAILED CORRECT LABELS FOR INSTA...
3,HOLIDAY RAMBLER,ENDEAVOR,2000,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...
4,HOLIDAY RAMBLER,ENDEAVOR,1999,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...


NOTE: defining combined text column which include combination of defect, consequence, and corrective for creating embeddings 

In [11]:
df['combined_text'] = df['defect'] + ' ' + df['consequence'] + ' ' + df['corrective']

# dropping the non required columns for space optimization
df.drop(['defect', 'consequence', 'corrective'], axis=1, inplace=True)

# saving the file
df.to_csv('combined_df.csv', index=False)

Since we need to select only Ford or Toyota values as subset

In [12]:
# finding the number of uniques makes in dataset
df['make'].unique()

array(['FORD', 'JAYCO', 'HOLIDAY RAMBLER', ..., 'SAFE FLEET',
       'AMERICAN FURUKAWA', 'IOSIX'], dtype=object)

In [13]:
# selecting ford and toyota rows only
filtered_df = df[(df['make'] == 'FORD') | (df['make'] == 'TOYOTA')]
filtered_df['make'].unique()

array(['FORD', 'TOYOTA'], dtype=object)

In [14]:
filtered_df.to_csv('filtered_df.csv', index=False)

We also need to check null values as its text data so needed to be dropped

In [15]:
# checking for null values
filtered_df.isnull().sum()

make                0
model               0
year                0
combined_text    1760
dtype: int64

In [16]:
# percentage of null values in dataset
print(f"Percentage of null values in data is: {(filtered_df['combined_text'].isnull().sum()/len(filtered_df))*100}")

Percentage of null values in data is: 10.271374379924133


In [17]:
# dropping the null values
filtered_df = filtered_df.dropna()
print(filtered_df.isnull().any())

# saving the dataset
filtered_df.to_csv('filtered_df.csv', index=False)

make             False
model            False
year             False
combined_text    False
dtype: bool


#### Estimating the size of input data
We need to approximately estimate the size of text we will be processing for selecting our embedding model

In [18]:
# loading spaCy
nlp = spacy.load("en_core_web_sm")

# reading the dataset
input_file = "/kaggle/working/filtered_df.csv"
df = pd.read_csv(input_file)

In [19]:
# creaiting function for return number of characters, words, sentences, and approximate tokens
def text_metrics_spacy(text):
    doc = nlp(text)
    char_count = len(text)
    word_count = len([token for token in doc if token.is_alpha])
    sentence_count = len(list(doc.sents))
    token_count = len(set([token.text.lower() for token in doc if token.is_alpha]))
    return char_count, word_count, sentence_count, token_count

In [20]:
tqdm.pandas()
df[['char_count', 'word_count', 'sentence_count', 'token_count']] = df['combined_text'].progress_apply(
    lambda x: pd.Series(text_metrics_spacy(x))
)

# Save the dataset
df.to_csv("/kaggle/working/Ford_Toyota_data_with_counts_spacy.csv", index=False)

100%|██████████| 15375/15375 [07:07<00:00, 35.98it/s]


In [21]:
df = pd.read_csv('/kaggle/working/Ford_Toyota_data_with_counts_spacy.csv')
df.head()

Unnamed: 0,make,model,year,combined_text,char_count,word_count,sentence_count,token_count
0,FORD,FOCUS,2000,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,770,115,7,81
1,FORD,FOCUS,2001,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,770,115,7,81
2,FORD,CROWN VICTORIA,2002,"ON CERTAIN NATURAL GAS MODEL VEHICLES, A T-FIT...",618,100,5,74
3,FORD,NAVIGATOR,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,825,124,10,83
4,FORD,EXPEDITION,2000,CERTAIN 4X2 SPORT UTILITY VEHICLES FAIL TO COM...,825,124,10,83


### Analyisng the dataset for choosing the embedding model

In [22]:
df.describe()

Unnamed: 0,year,char_count,word_count,sentence_count,token_count
count,15375.0,15375.0,15375.0,15375.0,15375.0
mean,2025.79187,819.422179,117.708813,7.624065,77.466732
std,346.763157,271.456147,37.45319,1.828028,18.834259
min,1965.0,115.0,15.0,3.0,13.0
25%,2005.0,641.0,92.0,7.0,65.0
50%,2012.0,798.0,115.0,7.0,77.0
75%,2020.0,975.0,138.0,8.0,87.0
max,9999.0,2517.0,342.0,19.0,187.0


NOTE: Since the max `token_count` doesn't exceed 384, we can easily use `all-mpnet-base-v2` for creating embeddings

In [23]:
# defining device for utilising GPUs
device = "cuda" if torch.cuda.is_available() else "cpu"

In [24]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [25]:
# loading embedding model
from sentence_transformers import util, SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Creating embedding vectors for combined_text

In [26]:
# creating embeddings column
df['embedding'] = None
batch_size = 64

# running in batches for utilsing GPUs, will take around 2 minutes
for start in range(0, len(df), batch_size):
    end = min(start + batch_size, len(df))
    batch_texts = df['combined_text'].iloc[start:end].tolist()
    batch_embeddings = embedding_model.encode(batch_texts, device=device, convert_to_tensor=True).cpu().tolist()
    df.loc[start:end - 1, 'embedding'] = pd.Series(batch_embeddings).values

print("Embeddings generated successfully.")

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings generated successfully.


In [27]:
df.to_csv('final_embedded_dataset.csv')

# Searching

NOTE: The reloading of dataset convert embedding column back to string, so we need to re convert it to mathematical form for calculatons.

In [28]:
# loading the dataset
df = pd.read_csv('/kaggle/working/final_embedded_dataset.csv')
# converting it back to array
df["embedding"] = df["embedding"].apply(lambda x: torch.tensor(eval(x), dtype=torch.float32))

#### Input of user

In [29]:
query_input = {
    "make": "ford",
    "model": "escape",
    "year": "2001",
    "issue": "stuck throttle risk"
}

### Searching by the model

In [30]:
# filtering dataset based on 'make', 'model', and 'year'
filtered_df = df[
    (df["make"].str.lower() == query_input["make"].lower()) &
    (df["model"].str.lower() == query_input["model"].lower()) &
    (df["year"] == int(query_input["year"]))
]

In [31]:
# similarity searching
if filtered_df.empty:
    print("No matching records found for the given make, model, and year.")
else:
    embeddings = torch.stack(filtered_df["embedding"].tolist()).to(device)

    # encoding the query issue
    query = query_input["issue"]
    print(f"Query: {query}")
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(device)

    # perform similarity search
    from time import perf_counter as timer
    start_time = timer()
    dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
    end_time = timer()

    print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    # top k results
    top_results = torch.topk(dot_scores, k=min(1, len(embeddings)))

    # initialising input_text for passing in LLM for summarization
    input_text = ""

    # printing the results
    print(f"Query: {query_input}")
    print("Results:")
    for score, idx in zip(top_results[0], top_results[1]):
        print(f"Score: {score:.4f}")
        print(f"Make: {filtered_df['make'].iloc[int(idx)]}")
        print(f"Model: {filtered_df['model'].iloc[int(idx)]}")
        print(f"Year: {filtered_df['year'].iloc[int(idx)]}")
        print(f"Description: {filtered_df['combined_text'].iloc[int(idx)]}")
        print()
        input_text += f"Score: {score:.4f}\n Make: {filtered_df['make'].iloc[int(idx)]}\n Year: {filtered_df['year'].iloc[int(idx)]}\n Description: {filtered_df['combined_text'].iloc[int(idx)]}"

Query: stuck throttle risk


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Time taken to get scores on 17 embeddings: 0.00740 seconds.
Query: {'make': 'ford', 'model': 'escape', 'year': '2001', 'issue': 'stuck throttle risk'}
Results:
Score: 0.5411
Make: FORD
Model: ESCAPE
Year: 2001
Description: Ford Motor Company is recalling certain model year 2001 through 2004 Escape vehicles equipped with 3.0L V6 engines and speed control manufactured from October 22, 1999, through January 23, 2004.  Inadequate clearance between the engine cover and the speed control cable connector could result in a stuck throttle when the accelerator pedal is fully or almost-fully depressed.  This risk exists regardless of whether or not speed control (cruise control) is used. A stuck throttle may result in very high vehicle speeds and make it difficult to stop or slow the vehicle, which could cause a crash, serious injury or death.  Ford will notify owners, and dealers will repair the vehicles by increasing the engine cover clearance, free of charge.  The safety recall began Au

# Summarizaton

In [32]:
import transformers

In [33]:
# loading the model
model_id = '/kaggle/input/llama-3.1/transformers/8b/2'
pipeline = transformers.pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [34]:
# checking input text
print(input_text)

Score: 0.5411
 Make: FORD
 Year: 2001
 Description: Ford Motor Company is recalling certain model year 2001 through 2004 Escape vehicles equipped with 3.0L V6 engines and speed control manufactured from October 22, 1999, through January 23, 2004.  Inadequate clearance between the engine cover and the speed control cable connector could result in a stuck throttle when the accelerator pedal is fully or almost-fully depressed.  This risk exists regardless of whether or not speed control (cruise control) is used. A stuck throttle may result in very high vehicle speeds and make it difficult to stop or slow the vehicle, which could cause a crash, serious injury or death.  Ford will notify owners, and dealers will repair the vehicles by increasing the engine cover clearance, free of charge.  The safety recall began August 3, 2012.  Remedy parts are expected to be available in mid-August 2012.  Until then dealers will disconnect the speed control cable as an interim remedy, if parts are not av

In [35]:
# defining modified prompt for LLM
summarization_prompt = f"Summarize the following automotive issues:\n\n{input_text}\n\nSummary:"

# getting result
result = pipeline(summarization_prompt, max_new_tokens=500)
summary = result[0]["generated_text"]

# printing retreived documents
print(f"Retrieved Documents: {input_text}\n\n")

# printing required part of the gnerated_text
print(summary.split("\n\n")[-1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved Documents: Score: 0.5411
 Make: FORD
 Year: 2001
 Description: Ford Motor Company is recalling certain model year 2001 through 2004 Escape vehicles equipped with 3.0L V6 engines and speed control manufactured from October 22, 1999, through January 23, 2004.  Inadequate clearance between the engine cover and the speed control cable connector could result in a stuck throttle when the accelerator pedal is fully or almost-fully depressed.  This risk exists regardless of whether or not speed control (cruise control) is used. A stuck throttle may result in very high vehicle speeds and make it difficult to stop or slow the vehicle, which could cause a crash, serious injury or death.  Ford will notify owners, and dealers will repair the vehicles by increasing the engine cover clearance, free of charge.  The safety recall began August 3, 2012.  Remedy parts are expected to be available in mid-August 2012.  Until then dealers will disconnect the speed control cable as an interim remedy