In [None]:
!pip install pdfplumber spacy python-docx pandas openpyxl
!python -m spacy download en_core_web_sm


In [6]:
import pdfplumber
import spacy
import pandas as pd
import os
from datetime import datetime
import docx
import re

# Load the spacy model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
dataset_path = '/content/Dataset.xlsx'
df = pd.read_excel(dataset_path)

# Print the column names to understand the structure
print(df.columns)

# Split the dataframe into companies and universities
top_companies = df[df['Type'] == 'Company']
top_universities = df[df['Type'] == 'University']

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + " "
    return text

# Function to extract companies and universities from text
def extract_entities(text):
    doc = nlp(text)
    companies = []
    universities = []

    # Debugging: Print extracted text
    print(f"Extracted Text: {text[:2000]}")

    # Extract entities using spacy
    for ent in doc.ents:
        if ent.label_ == "ORG":
            for _, row in top_companies.iterrows():
                if re.search(re.escape(row['Company Name']), ent.text, re.IGNORECASE):
                    companies.append((row['Company Name'], row['Fortune 500']))
            for _, row in top_universities.iterrows():
                if re.search(re.escape(row['University Name']), ent.text, re.IGNORECASE):
                    universities.append((row['University Name'], row['University Ranking']))

    # Regex matching as a fallback
    for _, row in top_companies.iterrows():
        if re.search(re.escape(row['Company Name']), text, re.IGNORECASE):
            if (row['Company Name'], row['Fortune 500']) not in companies:
                companies.append((row['Company Name'], row['Fortune 500']))

    for _, row in top_universities.iterrows():
        if re.search(re.escape(row['University Name']), text, re.IGNORECASE):
            if (row['University Name'], row['University Ranking']) not in universities:
                universities.append((row['University Name'], row['University Ranking']))

    return companies, universities

# Function to extract years of experience
def extract_years_of_experience(text):
    doc = nlp(text)
    current_year = datetime.now().year
    years = []

    for ent in doc.ents:
        if ent.label_ == "DATE":
            try:
                year = int(ent.text.strip())
                if 1950 <= year <= current_year:
                    years.append(year)
            except ValueError:
                continue

    if len(years) >= 2:
        total_experience = max(years) - min(years)
    else:
        total_experience = 0
    return total_experience

# Function to rank CV based on extracted information
def rank_cv(companies, universities, experience):
    company_score = sum([int(company[1]) for company in companies if pd.notnull(company[1])])
    university_score = sum([int(university[1]) for university in universities if pd.notnull(university[1])])
    experience_score = experience
    return company_score + university_score + experience_score

# Function to process a single CV
def process_cv(file_path):
    if file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    companies, universities = extract_entities(text)
    experience = extract_years_of_experience(text)
    rank = rank_cv(companies, universities, experience)

    # Print extracted details for debugging
    print(f"Processed CV: {file_path}")
    print(f"Extracted Companies: {companies}")
    print(f"Extracted Universities: {universities}")
    print(f"Extracted Experience: {experience} years")
    print(f"Rank: {rank}\n")

    return {
        "companies": companies,
        "universities": universities,
        "experience": experience,
        "rank": rank
    }

# Function to process all CVs in a directory
def process_all_cvs(cv_directory):
    cv_rankings = []
    for cv_file in os.listdir(cv_directory):
        if cv_file.endswith(".pdf") or cv_file.endswith(".docx"):
            cv_data = process_cv(os.path.join(cv_directory, cv_file))
            cv_rankings.append((cv_file, cv_data))

    # Sort CVs by rank
    cv_rankings.sort(key=lambda x: x[1]["rank"], reverse=True)
    return cv_rankings


cv_directory = "/content/CVs"
ranked_cvs = process_all_cvs(cv_directory)

for cv in ranked_cvs:
    print(f"CV: {cv[0]}, Rank: {cv[1]['rank']}, Experience: {cv[1]['experience']} years")


Index(['Company Name', 'Company Website', 'Company Linkedin Page',
       'Company Size', 'Type', 'Country Origion', 'Region of Origin',
       'Company Culture', 'Industry', 'Fortune 500', 'G2K',
       'Startup/Multinational', 'University Name', 'University  Type',
       'University  Size', 'University   Ranking'],
      dtype='object')
Extracted Text: AYESHA AMEEN
Software Engineer
CONTACT SUMMARY
Phone Seeking a software engineer position in a dynamic and innovative company
+91 1234567890 where I can utilize my technical skills and knowledge to contribute to the growth
of the organization. Professional highlights include:
Location
• 5+ years of experience in software development and programming
Navi Mumbai, India
• Strong expertise in Java, Python, and SQL
Email
• Knowledge of software development methodologies such as
anisa.patel@email.com Agile and Scrum
• Familiarity with web technologies such as HTML, CSS, and JavaScript
LinkedIn
linkedin.com/in/anisa.patel/ • Good understandi