In [2]:
!pip install spacy pandas
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import spacy

# Load English NLP model
nlp = spacy.load("en_core_web_sm")

# Load your CSV resume file
df = pd.read_csv("/content/MUKILAN_KT.csv")

# Combine all text columns into a single string (assuming resume is spread across columns)
resume_text = ' '.join(df.astype(str).values.flatten())

# Process the resume text
doc = nlp(resume_text)

# Extract named entities for Education, Organizations (work), and Skills (noun chunks)
education_keywords = ["B.Sc", "B.E", "B.Tech", "M.Sc", "M.Tech", "MBA", "Bachelor", "Master", "PhD"]
education = [ent.text for ent in doc.ents if ent.label_ == "EDUCATION" or any(kw in ent.text for kw in education_keywords)]

experience = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "DATE", "GPE"]]

skills = [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]

# Clean and deduplicate
education = list(set(education))
experience = list(set(experience))
skills = list(set(skills))

# Display extracted information
print("📘 Education:", education)
print("💼 Experience:", experience)
print("🛠️ Skills:", skills[:10])  # Just showing top 10 skills


📘 Education: ['Bachelor of Engineering']
💼 Experience: ['2019', 'GitHub', 'nan nan', 'Programming & Tools: Python', 'Tableau', 'ANN', 'Emotion Recognition Using', 'nan nan Power BI', 'AI', 'CERTIFICATIONS', 'Random Forest', 'AI/ML', 'NumPy', 'Coimbatore', '2020', 'Data Science and Analytics', 'SVM', '2014', 'Scikit-Learn', 'nan \uf0b7  Statistics & Data Analytics: Hypothesis Testing, Normal Distribution', 'National Amateur Nitro Buggy', 'Conducted', 'Post Graduate Program', 'Cambridge Business English Certificate', 'Strong', 'Sri Krishna College of Technology', 'NLP Fundamentals', 'Keras', 'nan Car Sales Analytics Using', 'Python, Power BI', 'Hadoop', 'nan TECHNICAL SKILLS  ', 'nan nan Post Graduate in Data Science and Analytics with Advanced ML  nan nan', 'TensorFlow', 'PCA', 'Primary Healthcare nan Centers', 'Machine', '2016', 'Data Visualization: Power BI', 'nan Indian Healthcare Analysis  nan', 'Naïve Bayes', 'LinkedIn', 'Bachelor of Engineering', 'Data Visualization', 'Learning, D

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Sample Resume (already parsed or raw text)

resume_text = """
Experienced data analyst with skills in Python, SQL, Power BI, and Machine Learning.
Worked on logistics, supply chain optimization, and dashboard reporting.
Education: B.Tech in Information Technology.
"""


# Sample Job Descriptions (You can replace this with your job CSV)

jobs = {
    'Job Title': [
        'Data Analyst',
        'Data Scientist',
        'Business Analyst',
        'ML Engineer'
    ],
    'Job Description': [
        'Looking for a data analyst skilled in SQL, Power BI, and Python for reporting dashboards.',
        'Build ML models in Python. Experience with TensorFlow, Scikit-learn required.',
        'Develop dashboards using Power BI. Strong in SQL and data modeling.',
        'Manage logistics operations. Experience with supply chain and optimization tools.'
    ]
}

job_df = pd.DataFrame(jobs)


# TF-IDF Vectorization

# Add the resume as one more "job" for comparison
texts = job_df['Job Description'].tolist() + [resume_text]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(texts)

# Compute cosine similarity between resume and all jobs
cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

# Add similarity score to dataframe
job_df['Matching Score'] = cosine_sim[0]

# Sort by best match
recommended_jobs = job_df.sort_values(by='Matching Score', ascending=False)


# Output Recommendations

print("🎯 Top Job Matches:")
print(recommended_jobs[['Job Title', 'Matching Score']])


🎯 Top Job Matches:
          Job Title  Matching Score
0      Data Analyst        0.354365
3       ML Engineer        0.259075
2  Business Analyst        0.175859
1    Data Scientist        0.039218


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# -----------------------------
# Sample Labeled Job Dataset
# -----------------------------
# You can replace this with a larger labeled dataset
data = {
    'Job Description': [
        'Analyze sales data using SQL and Power BI.',
        'Build machine learning models using Python and TensorFlow.',
        'Develop dashboards and visualizations using Power BI and Excel.',
        'Manage logistics and optimize supply chain operations.',
        'Perform statistical analysis and build predictive models.',
        'Design deep learning models for image recognition tasks.'
    ],
    'Category': [
        'Data Analyst',
        'ML Engineer',
        'BI Developer',
        'Logistics Manager',
        'Data Scientist',
        'ML Engineer'
    ]
}

df = pd.DataFrame(data)


# Text Vectorization

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Job Description'])


# Train/Test Split

y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Train Classifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)





In [18]:
from sklearn.metrics import classification_report

# Assume y_test and y_pred from a classifier
# For example:
y_test = ['Data Analyst', 'ML Engineer', 'BI Developer']
y_pred = ['Data Analyst', 'Data Analyst', 'BI Developer']

print("📊 Classification Metrics:")
print(classification_report(y_test, y_pred))


📊 Classification Metrics:
              precision    recall  f1-score   support

BI Developer       1.00      1.00      1.00         1
Data Analyst       0.50      1.00      0.67         1
 ML Engineer       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
