# Loading the data

In [77]:
# Read the whole review dump into memory; `content` is the raw text that the
# RAG cells later split into one document per review.
with open('output.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Show only a short preview: output.txt holds one record per review, so
# printing all of it floods the notebook output.
print(content[:1000])

Reviewer: narendra singh
Rating: 5
Title: Fantastic Car In Budget
Content: I recently purchased the Tata Punch, and I couldn't be happier with its performance. It offers excellent mileage and fits well within my budget as a mini SUV. Moreover, the service costs are minimal. Overall, the Tata Punch has exceeded my expectations. I've driven it both on highways and off-road, and it performs admirably in both scenarios. Its g...
--------------------------------------------------
Reviewer: sunny
Rating: 5
Title: Best Quality Car
Content: Tata Punch is highly regarded as a suitable car for middle-class families, offering a good balance of safety, quality, and affordability. It's praised for its practicality, fuel efficiency, and the 5-star safety rating it received. Users have noted its comfort for small families and its performance as a compact SUV. With variants available in both ...
--------------------------------------------------
Reviewer: debee prasad sahoo
Rating: 4.5
Title: Best Saf

# RAG

In [20]:
import re
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
import numpy as np
from langchain.vectorstores import Chroma
import os
import shutil
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from tqdm.autonotebook import tqdm, trange


In [21]:
# Load the MiniLM sentence-embedding checkpoint and pin it to the CPU.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device="cpu",
)
class SentenceTransformerEmbeddings:
    """Adapter that exposes a wrapped encoder through ``embed_documents`` and
    ``embed_query``.

    The wrapped object only needs an ``encode`` method whose return value
    supports ``.tolist()``.
    """

    def __init__(self, model):
        # Keep a reference to the encoder; both embed methods delegate to it.
        self.model = model

    def _encode_as_list(self, payload):
        """Run ``payload`` through the wrapped encoder and convert the result
        to plain Python lists via ``.tolist()``."""
        encoded = self.model.encode(payload)
        return encoded.tolist()

    def embed_documents(self, texts):
        """Encode a collection of texts and return the result as nested lists."""
        return self._encode_as_list(texts)

    def embed_query(self, text):
        """Encode a single query text and return the result as a list."""
        return self._encode_as_list(text)

# Wrap the SentenceTransformer in the adapter class; this object is what gets
# passed as `embedding=` when the Chroma vector store is built.
embeddings = SentenceTransformerEmbeddings(model)

In [28]:
# Alias for the raw review text read from output.txt. Despite the plural name
# this is a single string; it is split into per-review documents later.
docs = content

In [23]:
# Encode the whole corpus as ONE input: `docs` is a single string here, so this
# yields a single vector rather than one vector per review.
# NOTE(review): not used by the vector store built below; looks like a sanity
# check of the encoder — confirm before relying on it.
embedding_data = model.encode(docs)

In [24]:
embedding_data

array([-2.71429215e-02,  3.29203829e-02, -9.64924984e-04,  7.38558248e-02,
       -1.34932855e-02,  5.86739480e-02,  3.54242437e-02,  5.00740595e-02,
       -7.00832019e-03,  3.74677591e-02,  2.36249883e-02, -4.20896597e-02,
        4.32286672e-02,  1.72054898e-02,  2.58541424e-02, -2.11088005e-02,
        1.23030387e-01, -6.76872507e-02,  2.49624755e-02, -1.30014466e-02,
       -5.76321222e-02,  4.21486236e-02,  5.23755848e-02,  1.84893217e-02,
       -1.19141564e-02, -5.14650941e-02,  1.09351482e-02,  5.95539063e-02,
       -4.46004272e-02, -7.45194703e-02,  3.56048234e-02,  7.17349574e-02,
        8.34872201e-03, -6.98452676e-03, -7.44187012e-02, -8.56726691e-02,
       -3.58609669e-02,  2.54690237e-02, -5.30196726e-02, -4.76797298e-02,
       -1.78294089e-02, -3.08985859e-02,  2.68147737e-02, -4.52666841e-02,
        4.83851731e-02, -5.60252666e-02,  1.14959208e-02, -6.04068451e-02,
        6.51977537e-03, -4.02114019e-02,  7.01388530e-03, -3.54938805e-02,
        7.63224578e-03, -

In [25]:
# Start from a clean Chroma persistence directory under ./docs/chroma.
persist_directory = os.path.join(os.getcwd(), 'docs', 'chroma')

# Delete any previous database first, THEN recreate the directory. Doing it the
# other way round (create, then delete) leaves no directory behind and makes
# the makedirs call pointless.
shutil.rmtree(persist_directory, ignore_errors=True)
os.makedirs(persist_directory, exist_ok=True)

In [36]:
from langchain.docstore.document import Document

# Cut the text dump at the dashed separator line, trim each piece, and drop
# pieces that are empty after trimming (e.g. the one after the last separator).
split_data = [
    chunk
    for chunk in map(str.strip, docs.split("--------------------------------------------------"))
    if chunk
]

# One Document per review; only page_content is set, so metadata stays empty.
documents = [Document(page_content=text) for text in split_data]
documents

[Document(page_content="Reviewer: narendra singh\nRating: 5\nTitle: Fantastic Car In Budget\nContent: I recently purchased the Tata Punch, and I couldn't be happier with its performance. It offers excellent mileage and fits well within my budget as a mini SUV. Moreover, the service costs are minimal. Overall, the Tata Punch has exceeded my expectations. I've driven it both on highways and off-road, and it performs admirably in both scenarios. Its g..."),
 Document(page_content="Reviewer: sunny\nRating: 5\nTitle: Best Quality Car\nContent: Tata Punch is highly regarded as a suitable car for middle-class families, offering a good balance of safety, quality, and affordability. It's praised for its practicality, fuel efficiency, and the 5-star safety rating it received. Users have noted its comfort for small families and its performance as a compact SUV. With variants available in both ..."),
 Document(page_content="Reviewer: debee prasad sahoo\nRating: 4.5\nTitle: Best Safety Car For Midd

In [39]:
documents[2]

Document(page_content="Reviewer: debee prasad sahoo\nRating: 4.5\nTitle: Best Safety Car For Middleclass People\nContent: It's very comfortable and it holds 4 people's easily and it's design is looks good. It's a very good car for drive and best features in car")

In [40]:
# Create the vector database: embed every review Document with the
# SentenceTransformer adapter (`embeddings`) and store the result in a Chroma
# collection that uses `persist_directory` as its on-disk location.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [41]:
print(vectordb._collection.count())

855


In [80]:
def semantic_search(query: str, top_k: int = 10):
    """Retrieve reviews relevant to ``query`` from the global ``vectordb``.

    Uses the store's max-marginal-relevance search, asking for ``top_k``
    results.

    Args:
        query: Free-text question to search for.
        top_k: Number of results requested from the store (passed as ``k``).

    Returns:
        A list of dicts, one per returned document, each with the keys
        ``"content"`` (the document's ``page_content``) and ``"metadata"``
        (the document's ``metadata``).
    """
    hits = vectordb.max_marginal_relevance_search(query, k=top_k)
    return [
        {"content": hit.page_content, "metadata": hit.metadata}
        for hit in hits
    ]

In [81]:
semantic_search("What do users say about the Tata Punch's fuel economy?")[4]

{'content': 'Reviewer: vikram\nRating: 4.5\nTitle: Tata Punch CNG Delivers Eco Friendly Charges\nContent: The Tata Punch CNG is nobody short of a disclosure in the world of eco-friendly driving. This compact SUV seamlessly blends authority and sustainability, offering an unusual driving experience while being ready on the terrain. The CNG variant boasts a potent machine that does not compromise on interpretation, making megacity commutes a breath. What...',
 'metadata': {}}

# Loading the Model

In [78]:
import torch

# Release cached CUDA memory held by PyTorch before loading the LLM.
torch.cuda.empty_cache()
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
# NOTE(review): PeftModel / PeftConfig are not used in the visible cells.
from peft import PeftModel, PeftConfig

# Load the weights in 4-bit precision via bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# NOTE(review): this rebinds `model`, which earlier held the SentenceTransformer.
# `embeddings` keeps its own reference, so retrieval still works, but re-running
# the earlier `model.encode(docs)` cell after this one would hit the causal LM
# instead — consider a distinct name.
model = AutoModelForCausalLM.from_pretrained("Majipa/cars_base",
                                             device_map="cuda",
                                             torch_dtype=torch.float16,
                                             quantization_config=quantization_config)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [103]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# The tokenizer comes from the Phi-3 mini instruct checkpoint, while `model`
# was loaded from "Majipa/cars_base".
# NOTE(review): presumably the latter is a fine-tune of the former — confirm.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Text-generation pipeline used by `answer`.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments forwarded to every `pipe(...)` call.
# NOTE(review): min_new_tokens=500 sets a floor on the output length; the sample
# answer in this notebook is highly repetitive — check whether this floor
# contributes to that.
generation_args = dict(
    min_new_tokens=500,
    max_new_tokens=1500,
    temperature=0.5,
    return_full_text=False,
    do_sample=True,
)

In [104]:
def answer(question):
    """Answer ``question`` from retrieved reviews and print the generated text.

    Retrieves context with ``semantic_search``, builds a two-message chat
    (system + user) and runs it through the global ``pipe`` with
    ``generation_args``. The retrieved results are interpolated into the user
    message as-is, i.e. as the string form of the list of result dicts.

    Args:
        question: The user's question.

    Returns:
        None. The generated text of the first pipeline output is printed.
    """
    system_prompt = "You are a helpful Car Improvement anaylst assistant that works on the basis of provied Reviews and gives described information in around 500-1500 words."
    retrieved = semantic_search(question)
    user_prompt = f"context: {retrieved} question: {question}"

    chat = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt),
    ]

    generations = pipe(chat, **generation_args)
    first = generations[0]
    print(first['generated_text'])

In [106]:
question = "Can you make a list of things that can be improved in this particular Car?"
answer(question)

 There are some areas that can be improved such as interior and finishing To improve the interior and finishing look There are areas that need improvement in interior and finishing The interior and finishing could benefit from some attention Some areas of the car could be improved in terms of interior and finishing Some aspects related to the interior could be upgraded for better aesthetics The car could use more attention to improve the interior and finish The interior could use some enhancement in terms of finish The finishing and interior could be improved for a better visual appeal The look of the interior and finish could be enhanced for a better visual appeal Some areas of the car could use refinement in interior and finishing Some aspects of the interior and finish could use a bit of attention The interior and finishing could use some improvements Areas like the interior and finish could use some enhancement The look of the interior and finish could use a refresh for a nicer app

# Data generation (builds `output.txt`, which the "Loading the data" section reads)

In [2]:
import pandas as pd

In [47]:
# Load the scraped reviews; dtype=str reads every column as text.
# NOTE(review): the file is decoded as latin-1 and the preview shows stray "?"
# characters inside some reviews — confirm the file's real encoding.
x = pd.read_csv("tata_punch_reviews.csv", encoding='latin-1', dtype=str)

In [48]:
x

Unnamed: 0,Reviewer,Rating,Title,Content
0,narendra singh,5,Fantastic Car In Budget,"I recently purchased the Tata Punch, and I cou..."
1,sunny,5,Best Quality Car,Tata Punch is highly regarded as a suitable ca...
2,debee prasad sahoo,4.5,Best Safety Car For Middleclass People,It's very comfortable and it holds 4 people's ...
3,kunwarsa production,5,Best Car,The Tata Punch stands out as the top mini SUV ...
4,aman ali,5,Best Car,"Had a wonderful experience with Tata Punch,?he..."
...,...,...,...,...
850,armaan negi,4.7,Best SUV Of Tata.,Nice car to drive. Nice performance. But it ha...
851,bishnu halder,5,Nice Car,It is a nice car with a better road presence?n...
852,soumyadip dutta,4.7,Great Experience Overall,Best micro SUV in the range of 6-8 lakhs with ...
853,ganesh pudduchery,5,"5 Star Safety Rated Heavy, Rigid, Spacious SUV...","I Bought a Tata punch adventure a week ago, Ea..."


In [76]:
print(x['Content'][11])

The Tata Punch is a value-for-money machine packed with stunning design, looks, and performance and is spacious for a compact SUV, the infotainment system is also very well designed. Rides in the Tata Punch are fun in the cities and can be a little adventurous too. Overall if your budget is somewhere between 6-10L get this 5 starred safe and fun-lo...


In [18]:
def _format_review(row):
    """Render one row of the reviews frame as a labelled text record that ends
    with a separator line of 50 dashes."""
    separator = "-" * 50  # Separator between entries
    return (
        f"Reviewer: {row['Reviewer']}\n"
        f"Rating: {row['Rating']}\n"
        f"Title: {row['Title']}\n"
        f"Content: {row['Content']}\n"
        f"{separator}\n"
    )


# Dump every row of the reviews DataFrame `x` to output.txt, one record per row.
with open('output.txt', 'w', encoding='utf-8') as file:
    file.writelines(_format_review(row) for _, row in x.iterrows())

print("Data has been written to output.txt")

Data has been written to output.txt
