In [None]:
!pip install PyPDF2
!pip install openai
!pip install singlestoredb



In [None]:
import PyPDF2
import pandas as pd
import singlestoredb as db
from openai import OpenAI
import os
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that paths below
# that start with /content/drive/MyDrive/ can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Setup the api key
# Keep a key that is already present in the environment. Only when none is set,
# ask for it with getpass so the secret is neither echoed nor saved in the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass  # needed on this path only
    os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')
# No key is passed explicitly: the client is built after OPENAI_API_KEY is set above.
client = OpenAI()

In [None]:
#Function to connect to Singlestoredb
def connector():
    """Open and return a new SingleStoreDB connection.

    The values below are placeholders; replace them with the real host,
    port, credentials and database before running the notebook.
    """
    connection_settings = {
        'host': 'hostname',
        'port': '3333',
        'user': 'username',
        'password': 'password',
        'database': 'database_name',
    }
    return db.connect(**connection_settings)

In [None]:
# Function to generate embeddings using OpenAI's API
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; newlines are replaced by spaces first.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
#Function to extract text from PDFs. The text is not split into paragraphs because not all resumes follow the same format; processing each resume's text as a whole is more appropriate in this case.
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read in order and their text is concatenated with no separator.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open: the reader works on this handle.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)


def segment_text_into_paragraphs(text):
    """Split ``text`` into a list of paragraphs.

    A paragraph boundary is a blank line, i.e. two consecutive newline
    characters. Text without such a boundary comes back as a one-item list.
    """
    paragraph_break = '\n\n'
    return text.split(paragraph_break)

In [None]:
# Function to loop through folder and process PDF files
def process_resume_folder(folder_path):
    """Embed every PDF resume in ``folder_path`` and store it in the ``resumes`` table.

    For each ``.pdf`` file (extension matched case-insensitively, files taken
    in sorted order) the text is extracted, newlines are replaced by spaces,
    an embedding is requested and one row (filename, content, packed vector)
    is inserted. A file that fails at any of these steps is reported and
    skipped, so one bad resume no longer aborts the rest of the batch.

    Args:
        folder_path: Directory that contains the resume PDFs.

    Returns:
        The cursor's ``lastrowid`` after the most recent successful insert,
        or 0 when nothing was inserted or the database could not be reached.
        NOTE(review): the table created here has no auto-increment column, so
        this is presumably always 0 -- confirm before relying on it.

    Errors are printed rather than raised, matching the best-effort behaviour
    the notebook relies on.
    """
    last_row_id = 0

    try:
        conn = connector()
    except Exception as e:
        # Without a connection there is nothing to commit or close.
        print(e)
        return last_row_id

    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS resumes
                 (filename TEXT, content TEXT,vector BLOB)''')

        insert_sql = 'INSERT INTO resumes (filename, content, vector) values (%s, %s, JSON_ARRAY_PACK(%s))'
        # Sorted so that repeated runs handle the files in the same order.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.lower().endswith(".pdf"):
                continue
            print(filename)
            full_path = os.path.join(folder_path, filename)
            try:
                resume_text = extract_text_from_pdf(full_path).replace("\n", " ")
                if not resume_text.strip():
                    # Nothing to embed (e.g. a scanned, image-only PDF).
                    print(f"Skipping {filename}: no extractable text")
                    continue
                embedding = get_embedding(resume_text)
                # str() of a list of floats is the JSON array text JSON_ARRAY_PACK expects.
                cursor.execute(insert_sql, (filename, resume_text, str(embedding)))
                last_row_id = cursor.lastrowid or last_row_id
            except Exception as e:
                # Per-file failure: report it and carry on with the next resume.
                print(f"Skipping {filename}: {e}")

        conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Always release the connection, whatever happened above.
        try:
            conn.close()
        except Exception as e:
            print(e)

    return last_row_id


In [None]:
# Loading values to database
# Folder on the mounted Drive that holds the resume PDFs to index.
folder_path = '/content/drive/MyDrive/GenAI/Resumes/INFORMATION-TECHNOLOGY'
# Last expression of the cell, so the value returned by process_resume_folder
# is shown as the cell output.
process_resume_folder(folder_path)


11580408.pdf
11584809.pdf
10839851.pdf
10840430.pdf
10265057.pdf
10089434.pdf
10641230.pdf
10553553.pdf
10247517.pdf
11957080.pdf
22776912.pdf
21780877.pdf
20024870.pdf
20237244.pdf
19850482.pdf
21283365.pdf
20408458.pdf
20824105.pdf
20674668.pdf
20879311.pdf
18176523.pdf
19201175.pdf
18187364.pdf
18301617.pdf
18752129.pdf
20001721.pdf
19796840.pdf
18159866.pdf
17987433.pdf
17688766.pdf
17111768.pdf
17641670.pdf
16899268.pdf
16186411.pdf
18067556.pdf
17681064.pdf
16533554.pdf
15297298.pdf
15791766.pdf
15802627.pdf
15651486.pdf
14789139.pdf
13405733.pdf
13477922.pdf
13836471.pdf
15118506.pdf
13385306.pdf
12635195.pdf
12763627.pdf
12045067.pdf
12334140.pdf
37242217.pdf
35325329.pdf
36856210.pdf
39413067.pdf
39718499.pdf
40018190.pdf
37764298.pdf
38753827.pdf
31111279.pdf
30223363.pdf
31243710.pdf
33241454.pdf
32959732.pdf
36434348.pdf
33381211.pdf
29975124.pdf
29075857.pdf
28897981.pdf
28672970.pdf
27536013.pdf
28126340.pdf
27770859.pdf
28035460.pdf
29051656.pdf
28697203.pdf
27485716.pdf

0

In [None]:
def read_vectors(vector):
    """Return the five stored resumes whose vectors score highest against ``vector``.

    Args:
        vector: Query embedding; its ``str()`` form is sent to the database as
            the JSON array that ``json_array_pack`` unpacks.

    Returns:
        A DataFrame with columns ``Filename``, ``Content``, ``Score`` and
        ``Vector``, ordered by descending dot-product score. If the query
        fails the error is printed and an empty DataFrame with the same
        columns is returned, so callers always receive a DataFrame.
    """
    columns = ['Filename', 'Content', 'Score', 'Vector']
    sql = "SELECT filename, content, dot_product(json_array_pack(%s), vector) as score, JSON_ARRAY_UNPACK(vector) as vector FROM resumes order by score desc limit 5"
    mydb = None
    try:
        mydb = connector()
        mycursor = mydb.cursor()
        # The trailing comma matters: parameters must be a sequence, and
        # (str(vector)) without it is just a parenthesised string.
        mycursor.execute(sql, (str(vector),))
        result = mycursor.fetchall()
        # Convert the result to a DataFrame
        return pd.DataFrame(result, columns=columns)
    except Exception as e:
        print(e)
        return pd.DataFrame(columns=columns)
    finally:
        # Close the connection on success and on failure alike.
        if mydb is not None:
            try:
                mydb.close()
            except Exception as e:
                print(e)

In [None]:
# Read a free-text search query from the user.
query = input("Enter your search query: ")
# Embed the query with get_embedding's default model, the same call that was
# used for the resume texts when they were stored.
vector = get_embedding(query)
# read_vectors orders the stored resumes by dot-product score and keeps the top 5.
output=read_vectors(vector)
# Bare last expression so the notebook renders the result.
output

Enter your search query: Analyst with Python


Unnamed: 0,Filename,Content,Score,Vector
0,30223363.pdf,BUSINESS SYSTEMS ANALYST I Qualifications TECH...,0.318521,"[-0.0214784425, 0.0241680499, 0.0549192354, -0..."
1,10265057.pdf,WORKING RF SYSTEMS ENGINEER Qualifications Mic...,0.304522,"[-0.00601587072, 0.039931722, 0.0441632532, -0..."
2,52618188.pdf,INFORMATION TECHNOLOGY HELP DESK SPECIALIST Hi...,0.300312,"[-0.0387728512, -0.0117663993, 0.0615473166, 0..."
3,29051656.pdf,INFORMATION TECHNOLOGY SPECIALIST Summary An o...,0.292471,"[0.00492750853, 0.0214865319, 0.0986539125, 0...."
4,19796840.pdf,INFORMATION TECHNOLOGY AUDITOR Skills PeopleSo...,0.290831,"[-0.0310463943, 0.00547858141, 0.0737944841, 0..."


In [None]:
def drop_table(table_name):
    """Drop ``table_name`` if it exists, printing the outcome.

    Args:
        table_name: Plain table name, optionally qualified as
            ``database.table``. Only letters, digits, ``_`` and ``$`` are
            accepted, because an identifier cannot be sent as a query
            parameter and is interpolated into the statement text.

    Nothing is returned and nothing is raised; both success and failure are
    reported with ``print``.
    """
    import re  # local import: `re` is not among the notebook's imports

    identifier = r"[A-Za-z0-9_$]+"
    if not isinstance(table_name, str) or not re.fullmatch(
            rf"{identifier}(?:\.{identifier})?", table_name):
        # Refuse anything that could smuggle extra SQL into the statement.
        print(f"Error dropping table {table_name}: invalid table name")
        return

    mydb = None
    try:
        mydb = connector()
        mycursor = mydb.cursor()
        sql = f"DROP TABLE IF EXISTS {table_name}"
        mycursor.execute(sql)
        mydb.commit()
        print(f"Table {table_name} dropped successfully.")
    except Exception as e:
        print(f"Error dropping table {table_name}: {e}")
    finally:
        # Release the connection even when the statement failed.
        if mydb is not None:
            try:
                mydb.close()
            except Exception as e:
                print(f"Error closing connection: {e}")

# Example usage
#drop_table('resumes')

Table resumes dropped successfully.
