In [10]:
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize

# Load the HTML file
html_file_path = 'index.html'

with open(html_file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(content, 'lxml')

# Function to extract meaningful text including skills and proficiency levels
def extract_meaningful_text(soup):
    text = []

    # Extract content from specific tags while preserving order and context
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li', 'table', 'strong']):
        if tag.name == 'table':
            text.append(" ".join(tag.stripped_strings))  # Tables as combined text
        else:
            text.append(tag.get_text(separator=' ', strip=True))  # Normal text elements

    return "\n".join(text)

# Function to extract skills and proficiency levels
def extract_skills(soup):
    skills_data = []
    
    # Find all barWrappers that contain the skills information
    for bar_wrapper in soup.find_all(class_='barWrapper'):
        skill = bar_wrapper.find(class_='progressText').get_text(strip=True)
        proficiency = bar_wrapper.find_next('h3').get_text(strip=True)
        skills_data.append(f'{skill}: {proficiency}')
    
    return "\n".join(skills_data)

# Extract meaningful text
meaningful_text = extract_meaningful_text(soup)

# Extract skills and proficiency levels
skills_text = extract_skills(soup)

# Combine the meaningful text with the skills
combined_text = meaningful_text + "\n\nSkills:\n" + skills_text

# Print combined text before further processing
print("Combined Text Before Processing:\n", combined_text)

# Optional further processing with NLTK (if needed)
nltk.download('punkt')

# Sentence Tokenization
sentences = sent_tokenize(combined_text)

# Join the sentences back into a single string for use in the model
final_text = '\n'.join(sentences)

# Print the final processed text
print("Final Processed Text:\n", final_text)

# Optionally save the processed text
with open('processed_text_with_skills.txt', 'w', encoding='utf-8') as f:
    f.write(final_text)


Combined Text Before Processing:
 
education
skills
experience
projects
profiles
Sharath Kumar Reddy .
Deep Learning and AI Engineer
about me
Sharath Kumar Reddy is a highly skilled Data Scientist and AI researcher with a Master's degree in Data Science from the University of Houston, specializing in Artificial Intelligence and Machine Learning.
With a robust background in developing AI solutions and extensive experience in data analysis, Sharath has successfully deployed multiple AI solutions and conducted comprehensive EDA for various clients. Sharath's technical expertise spans a wide range of programming languages, tools, and machine learning techniques and Large Language Models. In addition to his professional accomplishments, he has led innovative projects and held leadership roles, showcasing his ability to drive impactful technological advancements.
Phone
+1 (346) 227-3943 / +91 957056365
Email
skrkapu@gmail.com



education
Aug 2023 - May 2025
Masters in Engineering Data Scien

[nltk_data] Downloading package punkt to /Users/sharath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def chunk_text(text, chunk_size=512):
    """Splits text into chunks of a specified size."""
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Load and preprocess your text
with open('processed_text_with_skills.txt', 'r') as file:
    text = file.read()

# Chunk the text
chunks = chunk_text(text)


In [6]:
chunks

["\neducation\nskills\nexperience\nprojects\nprofiles\nSharath Kumar Reddy .\nDeep Learning and AI Engineer\nabout me\nSharath Kumar Reddy is a highly skilled Data Scientist and AI researcher with a Master's degree in Data Science from the University of Houston, specializing in Artificial Intelligence and Machine Learning.\nWith a robust background in developing AI solutions and extensive experience in data analysis, Sharath has successfully deployed multiple AI solutions and conducted comprehensive EDA for various cli",
 "ents.\nSharath's technical expertise spans a wide range of programming languages, tools, and machine learning techniques and Large Language Models.\nIn addition to his professional accomplishments, he has led innovative projects and held leadership roles, showcasing his ability to drive impactful technological advancements.\nPhone\n+1 (346) 227-3943 / +91 957056365\nEmail\nskrkapu@gmail.com\n\n\n\neducation\nAug 2023 - May 2025\nMasters in Engineering Data Science\nC

In [7]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

chunk_embeddings = get_embeddings(chunks)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
chunk_embeddings

array([[-0.20125815,  0.16343354,  0.09499156, ..., -0.12452382,
        -0.07311866, -0.11838534],
       [-0.17397119,  0.19378352,  0.17293273, ..., -0.12043826,
        -0.03148735, -0.00307511],
       [-0.14250673,  0.20955324,  0.34830558, ..., -0.24934298,
        -0.1118788 ,  0.09516998],
       ...,
       [-0.32565552, -0.16048494,  0.35779005, ..., -0.19769624,
         0.05380083, -0.20772623],
       [-0.17981175, -0.0547062 ,  0.4001052 , ..., -0.14094925,
        -0.0481856 , -0.04774047],
       [-0.35822657,  0.02969653,  0.03372635, ..., -0.30330613,
        -0.14249526,  0.16670467]], dtype=float32)

In [9]:
import faiss

index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)


In [19]:
query = "Do you have experience in BERT models?"
query_embedding = get_embeddings([query])


In [20]:
D, I = index.search(query_embedding, k=1)
relevant_chunk = chunks[I[0][0]]


In [21]:
from transformers import pipeline

# Load a question-answering pipeline
qa_pipeline = pipeline("question-answering")

# Generate the answer from the relevant chunk
answer = qa_pipeline(question=query, context=relevant_chunk)

print(f"Answer: {answer['answer']}")


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Answer: m of 25 members organizers
