# Loading the data

In [1]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.carwale.com/tata-cars/nexon/user-reviews-p2/")

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
print(loader)

<langchain_community.document_loaders.web_base.WebBaseLoader object at 0x7b72506b2f10>


In [3]:
docs = loader.load()

In [18]:
docs

[Document(metadata={'source': 'https://www.carwale.com/tata-cars/nexon/user-reviews-p2/', 'title': 'Tata Nexon Reviews - CarWale', 'description': 'Tata Nexon Reviews - Read first-hand reviews from actual Tata Nexon owners. Find out what buyers of Tata Nexon have to say about the car.', 'language': 'en'}, page_content="\n\nTata Nexon Reviews - CarWale\n\n\n\n\n\n\n \n\n\n\n\n \n\n\n\n\n\nNEW CARSUSED CARSREVIEWS & NEWSADTata Nexon User ReviewsLooking for Tata Nexon? Here are the reviews and ratings by Nexon owners from across the country.4.6/5391 Ratings5 star75%4 star17%3 star3%2 star1%1 star3%VariantAll VersionsRs. 7,99,990Avg. Ex-ShowroomSelect Your VariantAll VersionsAll VersionsPure 1.2 Petrol 6MTPetrolManualRatings 4.6Smart Plus 1.2 Petrol 5MTPetrolManualRatings 4.6Creative 1.2 Petrol 6MTPetrolManualRatings 4.1Creative 1.2 Petrol 6AMTPetrolAutomaticRatings 4.4Smart Plus (S) 1.2 Petrol 5MTPetrolManualRatings 4.6Pure 1.5 Diesel 6MTDieselManualRatings 5.0Fearless Purple 1.2 Petrol 6M

# RAG

In [4]:
import re
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
import numpy as np
from langchain.vectorstores import Chroma
import os
import shutil
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Initialize the embedding model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device = "cpu")
class SentenceTransformerEmbeddings:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode(text).tolist()

# Initialize our custom embeddings
embeddings = SentenceTransformerEmbeddings(model)

In [6]:
# Function to clean and extract reviews
def extract_reviews(text):
    # Extract review blocks
    review_pattern = re.compile(r'(\d+ months ago \| .+?About the Reviewer)', re.DOTALL)
    reviews = review_pattern.findall(text)
    
    # Clean each review
    cleaned_reviews = []
    for review in reviews:
        # Remove HTML tags and extra whitespace
        clean_review = re.sub(r'<.*?>', '', review)
        clean_review = re.sub(r'\s+', ' ', clean_review).strip()
        cleaned_reviews.append(clean_review)
    
    return cleaned_reviews

In [7]:
x = docs[0]

In [8]:
embedding_data = model.encode(str(docs))

In [9]:
embedding_data

array([-4.97421212e-02,  4.56442200e-02,  2.07532048e-02,  4.62603718e-02,
       -3.17288050e-03,  2.84598377e-02,  1.63641591e-02,  4.51008528e-02,
       -2.05483045e-02, -8.04107711e-02,  7.79745728e-02, -7.45393485e-02,
       -1.92617550e-02,  1.67596363e-03, -2.07780022e-02,  4.10910845e-02,
        3.42805572e-02, -4.58252132e-02,  9.78834648e-03, -7.18596578e-02,
        2.60666925e-02,  3.82510088e-02,  6.50273561e-02, -4.50759940e-03,
        5.80998790e-03, -3.17513011e-02, -2.51366328e-02,  1.03435569e-01,
       -2.71665002e-03, -4.36415002e-02, -4.18471955e-02,  5.89236394e-02,
        5.53052640e-03,  6.42236928e-03, -5.02314372e-03, -6.32848665e-02,
       -7.76512325e-02,  1.48405088e-02,  4.05669548e-02, -1.58582162e-02,
        2.57344693e-02, -8.80463198e-02, -4.70317602e-02, -4.24393336e-04,
        7.13522881e-02, -1.32183656e-02, -7.16199726e-02,  1.12586524e-02,
        8.52428749e-02, -4.23713699e-02, -1.22171581e-01, -1.66677777e-02,
       -1.56489573e-02, -

In [10]:
# Remove old database files if any
persist_directory = os.path.join(os.getcwd(), 'docs', 'chroma')

os.makedirs(persist_directory, exist_ok=True)

shutil.rmtree(persist_directory, ignore_errors=True)

In [11]:
# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50
)
splits = text_splitter.split_documents(docs)

# Create the vector database
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [12]:
print(vectordb._collection.count())

33


In [13]:
def semantic_search(query: str, top_k: int = 5):
    # Perform the search
    docs_mmr = vectordb.max_marginal_relevance_search(query,k=top_k)
    
    # Format the results
    formatted_results = []
    for doc in docs_mmr:
        formatted_results.append({
            "content": doc.page_content,
            "metadata": doc.metadata,
        })
    
    return formatted_results

In [15]:
semantic_search("What do users say about the Tata Nexon's fuel economy?")[0]

{'content': 'NEW CARSUSED CARSREVIEWS & NEWSADTata Nexon User ReviewsLooking for Tata Nexon? Here are the reviews and ratings by Nexon owners from across the country.4.6/5391 Ratings5 star75%4 star17%3 star3%2 star1%1 star3%VariantAll VersionsRs. 7,99,990Avg. Ex-ShowroomSelect Your VariantAll VersionsAll VersionsPure 1.2 Petrol 6MTPetrolManualRatings 4.6Smart Plus 1.2 Petrol 5MTPetrolManualRatings 4.6Creative 1.2 Petrol 6MTPetrolManualRatings 4.1Creative 1.2 Petrol 6AMTPetrolAutomaticRatings 4.4Smart Plus',
 'metadata': {'description': 'Tata Nexon Reviews - Read first-hand reviews from actual Tata Nexon owners. Find out what buyers of Tata Nexon have to say about the car.',
  'language': 'en',
  'source': 'https://www.carwale.com/tata-cars/nexon/user-reviews-p2/',
  'title': 'Tata Nexon Reviews - CarWale'}}

# Loading the Model

In [6]:
import torch

torch.cuda.empty_cache()
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained("Majipa/cars_base",
                                             device_map="cuda",
                                             torch_dtype=torch.float16,
                                             quantization_config=quantization_config)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "temperature": 0., 
    "return_full_text": False,
} 

# output = pipe(messages, **generation_args) 
# print(output[0]['generated_text'])

In [8]:
def answer(context, question):
    messages = [ 
{"role": "system", "content": "You are a helpful Car Improvement anaylst assistant that works on the basis of provied Reviws."},     
        {"role": "user", "content":f"context: {context} question: {question}"}, 
        ] 
    output = pipe(messages, **generation_args) 
    print(output[0]['generated_text'])

In [9]:
question = "what is the car that we are talking about?"
answer(docs,question)



OutOfMemoryError: CUDA out of memory. Tried to allocate 110.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 37.06 MiB is free. Including non-PyTorch memory, this process has 3.77 GiB memory in use. Of the allocated memory 3.37 GiB is allocated by PyTorch, and 332.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Data gen

In [2]:
import pandas as pd

In [6]:
x = pd.read_csv("tata_punch_reviews.csv", encoding='latin-1', dtype=str)

In [7]:
x

Unnamed: 0,Reviewer,Rating,Title,Content
0,narendra singh,5,Fantastic Car In Budget,"I recently purchased the Tata Punch, and I cou..."
1,sunny,5,Best Quality Car,Tata Punch is highly regarded as a suitable ca...
2,debee prasad sahoo,4.5,Best Safety Car For Middleclass People,It's very comfortable and it holds 4 people's ...
3,kunwarsa production,5,Best Car,The Tata Punch stands out as the top mini SUV ...
4,aman ali,5,Best Car,"Had a wonderful experience with Tata Punch,?he..."
...,...,...,...,...
850,armaan negi,4.7,Best SUV Of Tata.,Nice car to drive. Nice performance. But it ha...
851,bishnu halder,5,Nice Car,It is a nice car with a better road presence?n...
852,soumyadip dutta,4.7,Great Experience Overall,Best micro SUV in the range of 6-8 lakhs with ...
853,ganesh pudduchery,5,"5 Star Safety Rated Heavy, Rigid, Spacious SUV...","I Bought a Tata punch adventure a week ago, Ea..."


In [17]:
x.head()

Unnamed: 0,Reviewer,Rating,Title,Content
0,narendra singh,5.0,Fantastic Car In Budget,"I recently purchased the Tata Punch, and I cou..."
1,sunny,5.0,Best Quality Car,Tata Punch is highly regarded as a suitable ca...
2,debee prasad sahoo,4.5,Best Safety Car For Middleclass People,It's very comfortable and it holds 4 people's ...
3,kunwarsa production,5.0,Best Car,The Tata Punch stands out as the top mini SUV ...
4,aman ali,5.0,Best Car,"Had a wonderful experience with Tata Punch,?he..."


In [18]:
# Assuming your DataFrame is named df
with open('output.txt', 'w', encoding='utf-8') as file:
    for index, row in x.iterrows():
        # Format each row as desired
        formatted_line = f"Reviewer: {row['Reviewer']}\n"
        formatted_line += f"Rating: {row['Rating']}\n"
        formatted_line += f"Title: {row['Title']}\n"
        formatted_line += f"Content: {row['Content']}\n"
        formatted_line += "-" * 50 + "\n"  # Separator between entries
        
        # Write the formatted line to the file
        file.write(formatted_line)

print("Data has been written to output.txt")

Data has been written to output.txt
