In [1]:
import os
import re

folder_path = 'C:/Users/Suvith Shetty/OneDrive/Documents/Hackathon/text'

def clean_text(text):
    text = re.sub(r'\s+', ' ', text) 
    text = re.sub(r'[^a-zA-Z0-9\s.,?!\'"₹\u0C80-\u0CFF]', '', text)  
    text = text.strip()
    return text

def load_and_clean_text_files(folder_path):
    cleaned_texts = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                cleaned_text = clean_text(text)
                cleaned_texts.append(cleaned_text)
    
    return cleaned_texts

cleaned_texts = load_and_clean_text_files(folder_path)
print(cleaned_texts[:1])  


["In dry farming, sandalwood farming can be grown in the lowest water.This is a commercial name Sandalwood butt is not our Sandalwood. It is a Sandal Album. There is a Sandalwood Sandalwood that is about sixteen people in the Sampled cricket. Witch Is The Ideal Chemical Combination of Sandalwood Andre In the world of Sandalwood is available anywhere in the world, it is available in our Karnataka.Our Kannada Nadu Kannada Nadu Sandal Nadu Thousands of years ago, a thousand years ago, Andre's knowledge was amazing for thousands of years ago.There is a great contribution to the growth of groundwater but it is a scrub plant. This is a parasitic plant which is a bloody sandalwood download and the parasitic plant is a parasite parasite. It is a parasite kinri.Water and their here are amazing physical soils in the world We have recently had hundreds of acres of sandalwood Bertini and now we have 5,000 acres of 5,000 Biggest Sandalwood Sandalwood Cluster Koppal Trickstow India Biggest Sandalwoo

In [2]:
def split_into_paragraphs(text):
    paragraphs = text.split('\n')
    paragraphs = [para.strip() for para in paragraphs if para.strip()]
    return paragraphs

all_paragraphs = []
for cleaned_text in cleaned_texts:
    paragraphs = split_into_paragraphs(cleaned_text)
    all_paragraphs.extend(paragraphs)

print(all_paragraphs[:5]) 


["In dry farming, sandalwood farming can be grown in the lowest water.This is a commercial name Sandalwood butt is not our Sandalwood. It is a Sandal Album. There is a Sandalwood Sandalwood that is about sixteen people in the Sampled cricket. Witch Is The Ideal Chemical Combination of Sandalwood Andre In the world of Sandalwood is available anywhere in the world, it is available in our Karnataka.Our Kannada Nadu Kannada Nadu Sandal Nadu Thousands of years ago, a thousand years ago, Andre's knowledge was amazing for thousands of years ago.There is a great contribution to the growth of groundwater but it is a scrub plant. This is a parasitic plant which is a bloody sandalwood download and the parasitic plant is a parasite parasite. It is a parasite kinri.Water and their here are amazing physical soils in the world We have recently had hundreds of acres of sandalwood Bertini and now we have 5,000 acres of 5,000 Biggest Sandalwood Sandalwood Cluster Koppal Trickstow India Biggest Sandalwoo

In [3]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

paragraph_embeddings = model.encode(all_paragraphs, convert_to_tensor=True)

question = "What is sandalwood cultivation?"

question_embedding = model.encode(question, convert_to_tensor=True)

cosine_scores = util.pytorch_cos_sim(question_embedding, paragraph_embeddings)
best_match_idx = cosine_scores.argmax()

print(f"Best matching paragraph:\n{all_paragraphs[best_match_idx]}")



Best matching paragraph:
Farmers who are born in the sandalwood hut You can't talk about crores of agriculture.It was also known as the largest sandalwood producer in the 1960s. The atmosphere is also a market for sandalwood by 2026.Agricultural Course Designed Anna. Encouragement.You can also learn to complete sandalwood cultivation through various models. This course is a certificate for the Freedom App Market Place.If so, you should also cultivate sandalwood and earn crores of rupees.


In [4]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="t5-small", tokenizer="t5-small")

context = all_paragraphs[best_match_idx]
answer = qa_pipeline(question=question, context=context)

print(f"Answer: {answer['answer']}")

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Answer: hut You can't talk about crores


In [5]:
def question_answer_pipeline(question, folder_path):
    cleaned_texts = load_and_clean_text_files(folder_path)
    
    all_paragraphs = []
    for cleaned_text in cleaned_texts:
        paragraphs = split_into_paragraphs(cleaned_text)
        all_paragraphs.extend(paragraphs)
    
    paragraph_embeddings = model.encode(all_paragraphs, convert_to_tensor=True)
    question_embedding = model.encode(question, convert_to_tensor=True)
    
    cosine_scores = util.pytorch_cos_sim(question_embedding, paragraph_embeddings)
    best_match_idx = cosine_scores.argmax()
    
    context = all_paragraphs[best_match_idx]
    answer = qa_pipeline(question=question, context=context)
    
    return answer['answer']

folder_path = 'C:/Users/Suvith Shetty/OneDrive/Documents/Hackathon/text'
question = "What is sandalwood cultivation?"
answer = question_answer_pipeline(question, folder_path)
print(f"Answer: {answer}")

Answer: hut You can't talk about crores


In [6]:
question = "When Water is not Enough?"
answer = question_answer_pipeline(question, folder_path)
print(f"Answer: {answer}")

Answer: Economics of Sandal, we have a Sandal Complex Telekondre


In [7]:
question = "Pesticides"
answer = question_answer_pipeline(question, folder_path)
print(f"Answer: {answer}")

Answer: every month every month. In an acre of sandalwood


In [8]:
model.save('sentence_transformer_model')

In [9]:
qa_pipeline.model.save_pretrained('qa_model')
qa_pipeline.tokenizer.save_pretrained('qa_model')

('qa_model\\tokenizer_config.json',
 'qa_model\\special_tokens_map.json',
 'qa_model\\tokenizer.json')

In [10]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

model = SentenceTransformer('sentence_transformer_model')

qa_model = AutoModelForQuestionAnswering.from_pretrained('qa_model')
tokenizer = AutoTokenizer.from_pretrained('qa_model')
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=tokenizer)


In [11]:
import pickle
import torch

with open('paragraph_embeddings.pkl', 'wb') as f:
    pickle.dump(paragraph_embeddings, f)

with open('paragraph_embeddings.pkl', 'rb') as f:
    paragraph_embeddings = pickle.load(f)

In [12]:
model.save('saved_models/sentence_transformer')
qa_pipeline.model.save_pretrained('saved_models/qa_model')
qa_pipeline.tokenizer.save_pretrained('saved_models/qa_model')


('saved_models/qa_model\\tokenizer_config.json',
 'saved_models/qa_model\\special_tokens_map.json',
 'saved_models/qa_model\\tokenizer.json')

In [13]:
model = SentenceTransformer('saved_models/sentence_transformer')
qa_model = AutoModelForQuestionAnswering.from_pretrained('saved_models/qa_model')
tokenizer = AutoTokenizer.from_pretrained('saved_models/qa_model')
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=tokenizer)