In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix
from sklearn.metrics import accuracy_score , confusion_matrix, classification_report

In [52]:
df = pd.read_csv('jobs.csv')

In [53]:
df.head()

Unnamed: 0.1,Unnamed: 0,Job Salary,Job Experience Required,Key Skills,Role Category,Functional Area,Industry,Job Title
0,0,Not Disclosed by Recruiter,5 - 10 yrs,Media Planning| Digital Media,Advertising,"Marketing , Advertising , MR , PR , Media Plan...","Advertising, PR, MR, Event Management",Media Planning Executive/Manager
1,1,Not Disclosed by Recruiter,2 - 5 yrs,pre sales| closing| software knowledge| clien...,Retail Sales,"Sales , Retail , Business Development","IT-Software, Software Services",Sales Executive/Officer
2,2,Not Disclosed by Recruiter,0 - 1 yrs,Computer science| Fabrication| Quality check|...,R&D,"Engineering Design , R&D","Recruitment, Staffing",R&D Executive
3,3,"2,00,000 - 4,00,000 PA.",0 - 5 yrs,Technical Support,Admin/Maintenance/Security/Datawarehousing,"IT Software - Application Programming , Mainte...","IT-Software, Software Services",Technical Support Engineer
4,4,Not Disclosed by Recruiter,2 - 5 yrs,manual testing| test engineering| test cases|...,Programming & Design,IT Software - QA & Testing,"IT-Software, Software Services",Testing Engineer


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27010 entries, 0 to 27009
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               27010 non-null  int64 
 1   Job Salary               27010 non-null  object
 2   Job Experience Required  27010 non-null  object
 3   Key Skills               27010 non-null  object
 4   Role Category            27010 non-null  object
 5   Functional Area          27010 non-null  object
 6   Industry                 27010 non-null  object
 7   Job Title                27010 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


In [55]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,27010.0
mean,14973.19726
std,8661.925267
min,0.0
25%,7474.25
50%,14913.5
75%,22476.5
max,29999.0


In [56]:
df.isnull().sum()

Unnamed: 0                 0
Job Salary                 0
Job Experience Required    0
Key Skills                 0
Role Category              0
Functional Area            0
Industry                   0
Job Title                  0
dtype: int64

In [57]:
df["education"] = df["Functional Area"].apply(assign_education)

In [58]:
df = df.rename(columns={
    "Functional Area": "domain",
    "Key Skills": "skills",
    "Job Experience Required": "experience",
    "Job Title": "job_role"
})


In [59]:
df["education"] = df["domain"].apply(assign_education)


In [60]:
def assign_education(domain):
    domain = str(domain).lower()

    if "software" in domain or "it" in domain or "programming" in domain:
        return "B.Tech / BCA / MCA"
    elif "data" in domain or "ai" in domain or "ml" in domain:
        return "B.Tech / M.Tech / MSc"
    elif "marketing" in domain or "sales" in domain:
        return "MBA / BBA"
    elif "support" in domain:
        return "Any Graduate"
    else:
        return "Any Graduate"

df["education"] = df["domain"].apply(assign_education)


In [61]:
print(df.columns)


Index(['Unnamed: 0', 'Job Salary', 'experience', 'skills', 'Role Category',
       'domain', 'Industry', 'job_role', 'education'],
      dtype='object')


In [62]:
final_df = df[[
    "skills",
    "education",
    "experience",
    "domain",
    "job_role"
]]


In [63]:
final_df.to_csv("cleaned_data.csv", index=False)


In [64]:
final_df.head()

Unnamed: 0,skills,education,experience,domain,job_role
0,Media Planning| Digital Media,MBA / BBA,5 - 10 yrs,"Marketing , Advertising , MR , PR , Media Plan...",Media Planning Executive/Manager
1,pre sales| closing| software knowledge| clien...,B.Tech / M.Tech / MSc,2 - 5 yrs,"Sales , Retail , Business Development",Sales Executive/Officer
2,Computer science| Fabrication| Quality check|...,Any Graduate,0 - 1 yrs,"Engineering Design , R&D",R&D Executive
3,Technical Support,B.Tech / BCA / MCA,0 - 5 yrs,"IT Software - Application Programming , Mainte...",Technical Support Engineer
4,manual testing| test engineering| test cases|...,B.Tech / BCA / MCA,2 - 5 yrs,IT Software - QA & Testing,Testing Engineer


In [65]:
final_df.isnull().sum()

skills        0
education     0
experience    0
domain        0
job_role      0
dtype: int64

In [66]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [67]:
for col in ["skills", "education", "domain"]:
    final_df[col] = final_df[col].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[col] = final_df[col].apply(clean_text)


In [68]:
final_df["combined_text"] = (
    final_df["skills"] + " " +
    final_df["education"] + " " +
    final_df["domain"]
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["combined_text"] = (


In [69]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(final_df["combined_text"])


In [70]:
similarity_matrix = cosine_similarity(tfidf_matrix)


In [71]:
def recommend_jobs(user_skills, user_education, user_experience, user_domain, top_n=5):
    
    user_text = clean_text(
        user_skills + " " +
        user_education + " " +
        user_domain
    )
    
    user_vector = tfidf.transform([user_text])
    
    similarities = cosine_similarity(user_vector, tfidf_matrix)[0]
    
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    return final_df.iloc[top_indices][["job_role", "domain"]]


In [72]:
recommend_jobs(
    user_skills="python machine learning pandas",
    user_education="btech computer science",
    user_experience="2 years",
    user_domain="data science"
)


Unnamed: 0,job_role,domain
10553,Software Developer,it software application programming maintenance
5337,Software Developer,it software application programming maintenance
22132,Business Analyst,it software other
21116,Software Developer,it software application programming maintenance
4872,Associate/Senior Associate -(Technical),ites bpo kpo lpo customer service operations
