# Source of the Dataset

- https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [2]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('job_descriptions.csv', engine="python", on_bad_lines='warn')

# Drop unnecessary columns
df = df.drop(columns=['Job Id', 'Job Posting Date', 'latitude', 'longitude', 'Contact', 'Contact Person', 'Company Profile'])

# Extracting lower and upper bounds from the 'Salary Range' column
df[['Lower Salary', 'Upper Salary']] = df['Salary Range'].str.extract(r'\$(\d+)(?:K)?-\s*\$(\d+)(?:K)?')
df['Lower Salary'] = pd.to_numeric(df['Lower Salary']) * 1000
df['Upper Salary'] = pd.to_numeric(df['Upper Salary']) * 1000
df = df.drop(columns=['Salary Range'])

# Extracting experience
df[['Lower Experience', 'Upper Experience']] = df['Experience'].str.extract(r'(\d+)\s*to\s*(\d+)\s*Years')
df['Lower Experience'] = pd.to_numeric(df['Lower Experience'])
df['Upper Experience'] = pd.to_numeric(df['Upper Experience'])
df = df.drop(columns=['Experience'])


print(df.shape)
df.head()

(1615940, 18)


Unnamed: 0,Qualifications,location,Country,Work Type,Company Size,Preference,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Lower Salary,Upper Salary,Lower Experience,Upper Experience
0,M.Tech,Douglas,Isle of Man,Intern,26801,Female,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,59000,99000,5,15
1,BCA,Ashgabat,Turkmenistan,Intern,100340,Female,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,56000,116000,2,12
2,PhD,Macao,"Macao SAR, China",Temporary,84525,Male,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,61000,104000,0,12
3,PhD,Porto-Novo,Benin,Full-Time,129896,Female,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,65000,91000,4,11
4,MBA,Santiago,Chile,Intern,53944,Female,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,64000,87000,1,12


In [3]:
df.isnull().sum()

Qualifications      0
location            0
Country             0
Work Type           0
Company Size        0
Preference          0
Job Title           0
Role                0
Job Portal          0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
Lower Salary        0
Upper Salary        0
Lower Experience    0
Upper Experience    0
dtype: int64

In [4]:
df.dropna(axis = 1, inplace = True)
df.shape

(1615940, 18)

In [5]:
df.head()

Unnamed: 0,Qualifications,location,Country,Work Type,Company Size,Preference,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Lower Salary,Upper Salary,Lower Experience,Upper Experience
0,M.Tech,Douglas,Isle of Man,Intern,26801,Female,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,59000,99000,5,15
1,BCA,Ashgabat,Turkmenistan,Intern,100340,Female,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,56000,116000,2,12
2,PhD,Macao,"Macao SAR, China",Temporary,84525,Male,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,61000,104000,0,12
3,PhD,Porto-Novo,Benin,Full-Time,129896,Female,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,65000,91000,4,11
4,MBA,Santiago,Chile,Intern,53944,Female,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,64000,87000,1,12


In [6]:
# Separate Independent (X) and Dependent(y) features
# Define your target variable
target_column = 'Lower Salary'
X = df.drop(columns=[target_column])
y = df[target_column]

In [7]:
X.head()

Unnamed: 0,Qualifications,location,Country,Work Type,Company Size,Preference,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Upper Salary,Lower Experience,Upper Experience
0,M.Tech,Douglas,Isle of Man,Intern,26801,Female,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,99000,5,15
1,BCA,Ashgabat,Turkmenistan,Intern,100340,Female,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,116000,2,12
2,PhD,Macao,"Macao SAR, China",Temporary,84525,Male,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,104000,0,12
3,PhD,Porto-Novo,Benin,Full-Time,129896,Female,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,91000,4,11
4,MBA,Santiago,Chile,Intern,53944,Female,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,87000,1,12


In [13]:
# check if the dataset is balanced or imbalanced
print(y)
# y.value_counts()

0          59000
1          56000
2          61000
3          65000
4          64000
           ...  
1615935    64000
1615936    62000
1615937    60000
1615938    65000
1615939    56000
Name: Lower Salary, Length: 1615940, dtype: int64


In [8]:
# 357/(357+212)

0.6274165202108963

# HoldOut Validation Approach - Train-test split

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy ", accuracy_score(y_test, y_pred))

ValueError: could not convert string to float: 'BCA'

In [10]:
y_test.value_counts()

B    108
M     63
Name: diagnosis, dtype: int64

In [11]:
108/(108+63)

0.631578947368421

# Stratified Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0, stratify = y)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy ", accuracy_score(y_test, y_pred))
print(y_train.value_counts())

Accuracy  0.9122807017543859
B    250
M    148
Name: diagnosis, dtype: int64


In [13]:
250/(250+148)

0.628140703517588

# K Fold Cross Validation

In [14]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
kfold_validation = KFold(n_splits = 10, shuffle = True, random_state = 42)
clf = DecisionTreeClassifier()

results = cross_val_score(clf, X, y, cv = kfold_validation)
print(results)
print ("Results = ", np.mean(results), "+/-", np.std(results))



[0.9122807  0.92982456 0.94736842 0.9122807  0.9122807  0.89473684
 0.96491228 0.98245614 0.94736842 0.92857143]

Results =  0.9332080200501253 +/- 0.025803888599007355


# Stratified K Fold Cross Validation

In [15]:
# For imbalance dataset
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold(n_splits=5)
clf = DecisionTreeClassifier()
results = cross_val_score(clf, X, y, cv = skfold)
print(results)
print()
print ("Results = ", np.mean(results), "+/-", np.std(results))


[0.9122807  0.90350877 0.92105263 0.92105263 0.90265487]

Results =  0.9121099208197485 +/- 0.008041779873646897


# Leave One Out Cross Validation (LOOCV)

In [16]:
# not recommended for large dataset
from sklearn.model_selection import LeaveOneOut
clf = DecisionTreeClassifier()
leave = LeaveOneOut()
results = cross_val_score(clf, X, y, cv = leave)
print(results)
print()
print ("Results = ", np.mean(results), "+/-", np.std(results))
print ("Length of Results = ", len(results))

[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.

# Shuffle Split/ Repeated Random train-test splits

In [17]:
from sklearn.model_selection import ShuffleSplit
ssplit = ShuffleSplit(n_splits = 10, test_size = 0.30)
clf = DecisionTreeClassifier()
results = cross_val_score(clf, X, y, cv = ssplit)
print(results)
print()
print ("Results = ", np.mean(results), "+/-", np.std(results))

[0.94152047 0.92982456 0.94152047 0.9005848  0.94736842 0.94736842
 0.92982456 0.93567251 0.92397661 0.92982456]

Results =  0.9327485380116958 +/- 0.01314164038259897


# Implementing Shuffle Split without importing Shuffle Split

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

rs = [10, 42, 25, 64, 39]
train_acc = []
test_acc = []
for seed in rs:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = seed)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    test_acc.append(accuracy_score(y_test, y_pred))
    train_acc.append(accuracy_score(y_train, clf.predict(X_train)))
print('Train Accuracy: ', np.mean(train_acc), '+/-', np.std(train_acc))
print('Test Accuracy: ', np.mean(test_acc), '+/-', np.std(test_acc))

Train Accuracy:  1.0 +/- 0.0
Test Accuracy:  0.9228070175438596 +/- 0.019360170008475824
