In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

In [4]:
# Load the raw job postings from the CSV in the working directory.
df = pd.read_csv("job_descriptions.csv")

In [5]:
# List the column names of the raw frame to see which fields are available.
print(df.columns)

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')


In [6]:
# Work on the first 1000 rows only; the full frame stays available as `df`.
data = df.iloc[:1000]

In [7]:
# (rows, columns) of the full raw frame.
print(df.shape)

(1615940, 23)


In [8]:
# (rows, columns) of the 1000-row working subset.
print(data.shape)

(1000, 23)


In [9]:
# Count missing values per column in the working subset.
data.isnull().sum()

Job Id              0
Experience          0
Qualifications      0
Salary Range        0
location            0
Country             0
latitude            0
longitude           0
Work Type           0
Company Size        0
Job Posting Date    0
Preference          0
Contact Person      0
Contact             0
Job Title           0
Role                0
Job Portal          0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
Company Profile     2
dtype: int64

In [10]:
# Discard every row whose 'Company Profile' value is missing.
data = data.dropna(axis='index', how='any', subset=['Company Profile'])


In [11]:
# Shape after dropping the rows with a missing 'Company Profile'.
print(data.shape)

(998, 23)


In [12]:
# Re-check per-column missing-value counts after the drop.
data.isnull().sum()

Job Id              0
Experience          0
Qualifications      0
Salary Range        0
location            0
Country             0
latitude            0
longitude           0
Work Type           0
Company Size        0
Job Posting Date    0
Preference          0
Contact Person      0
Contact             0
Job Title           0
Role                0
Job Portal          0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
Company Profile     0
dtype: int64

In [13]:
# Columns kept for the rest of the notebook.
features = ['Job Title', 'Salary Range', 'location', 'Work Type', 'Company']


In [14]:
# Keep only the selected feature columns. Take an explicit copy so that the
# column assignments in later cells write to an independent frame instead of
# a slice of the earlier one (which pandas flags with SettingWithCopyWarning).
data = data[features].copy()

In [15]:
# Confirm that only the selected feature columns remain.
print(data.columns)

Index(['Job Title', 'Salary Range', 'location', 'Work Type', 'Company'], dtype='object')


In [16]:
# Peek at the raw salary strings before converting them.
print(data["Salary Range"].head(10))

0     $59K-$99K
1    $56K-$116K
2    $61K-$104K
3     $65K-$91K
4     $64K-$87K
5     $59K-$93K
6    $63K-$103K
7    $65K-$102K
8    $65K-$102K
9     $60K-$80K
Name: Salary Range, dtype: object


In [17]:
import re

def usd_k_range_to_india_inr(salary):
    """Map a USD salary-range string such as ``"$59K-$99K"`` to a banded INR amount.

    The first two numbers found in the string are averaged, and that midpoint
    is looked up in a fixed table of LPA (lakhs per annum) bands. The result is
    returned in rupees (LPA value times 100000).

    Parameters
    ----------
    salary : str
        Salary range text containing at least two numbers. Decimal values
        (e.g. ``"$59.5K"``) are read as a single number.

    Returns
    -------
    int or None
        The banded salary in rupees, or ``None`` when ``salary`` is not a
        string or contains fewer than two numbers.
    """
    if not isinstance(salary, str):
        return None

    # Allow an optional fractional part so "59.5" is one number, not "59" and "5".
    nums = re.findall(r'\d+(?:\.\d+)?', salary)

    if len(nums) < 2:
        return None

    avg_k = (float(nums[0]) + float(nums[1])) / 2

    # (inclusive upper bound of the midpoint, LPA value); checked in ascending order.
    bands = (
        (50, 5),
        (65, 7),
        (80, 10),
        (95, 14),
        (110, 18),
    )

    lpa = 22  # band used when the midpoint is above every upper bound
    for upper_k, band_lpa in bands:
        if avg_k <= upper_k:
            lpa = band_lpa
            break

    return int(lpa * 100000)


In [18]:
# Convert every salary-range string to its banded INR value, element by element.
data['salary_inr'] = data['Salary Range'].map(usd_k_range_to_india_inr)


In [19]:
# Show the original range next to the converted value to spot-check the mapping.
data[['Salary Range', 'salary_inr']].head(10)


Unnamed: 0,Salary Range,salary_inr
0,$59K-$99K,1000000
1,$56K-$116K,1400000
2,$61K-$104K,1400000
3,$65K-$91K,1000000
4,$64K-$87K,1000000
5,$59K-$93K,1000000
6,$63K-$103K,1400000
7,$65K-$102K,1400000
8,$65K-$102K,1400000
9,$60K-$80K,1000000


In [20]:
# The raw range text is no longer needed once 'salary_inr' exists.
data = data.drop('Salary Range', axis='columns')


In [21]:
# Confirm 'Salary Range' was replaced by 'salary_inr'.
print(data.columns)

Index(['Job Title', 'location', 'Work Type', 'Company', 'salary_inr'], dtype='object')


In [22]:
# Inspect the distinct work-type labels before normalizing them.
print(data["Work Type"].unique())

['Intern' 'Temporary' 'Full-Time' 'Contract' 'Part-Time']


In [23]:
# Normalize work-type labels: lower-case first, then trim surrounding whitespace.
data['Work Type'] = data['Work Type'].str.lower().str.strip()


In [24]:
# Canonical label for each normalized work type; only 'intern' actually changes.
work_type_map = {
    'intern': 'internship',
    'full-time': 'full-time',
    'part-time': 'part-time',
    'contract': 'contract',
    'temporary': 'temporary',
}

# Swap each label for its canonical form; values not in the map are left as they are.
data['Work Type'] = data['Work Type'].replace(to_replace=work_type_map)


In [25]:
# Frequency of each canonical work-type label.
data['Work Type'].value_counts()


Work Type
part-time     219
temporary     199
contract      198
full-time     193
internship    189
Name: count, dtype: int64

In [26]:
# Clean text: trim whitespace and lower-case every text column.
text_cols = ['Job Title', 'location', 'Work Type', 'Company']
for col in text_cols:
    data[col] = data[col].str.strip().str.lower()

# Save preprocessed data. Create the output directory first so the write does
# not fail when 'dataset/' does not exist yet (e.g. on a fresh checkout).
output_path = Path('dataset/preprocessed_jobs.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF representation of the cleaned job titles, ignoring English
# stop words. `tfidf` keeps the fitted vectorizer, `tfidf_matrix` the result.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Job Title'])


In [28]:
# Distinct work-type labels after normalization and mapping.
print(data["Work Type"].unique())

['internship' 'temporary' 'full-time' 'contract' 'part-time']


In [30]:
# Distinct INR salary values present in the working subset after conversion.
print(data["salary_inr"].unique())

[1000000 1400000 1800000]
