In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw job postings and immediately discard the columns this notebook
# treats as unnecessary (coordinates and contact details).
# on_bad_lines='warn' reports malformed rows as warnings instead of raising.
df = (
    pd.read_csv('job_descriptions.csv', engine="python", on_bad_lines='warn')
    .drop(columns=['latitude', 'longitude', 'Contact', 'Contact Person'])
)







In [8]:
# Split 'Salary Range' strings of the form "$<n>K-$<m>K" into numeric bounds.
# The "K" suffix is optional in the pattern, so it is captured separately for
# each amount and only amounts that actually carry it are scaled to thousands.
# (Multiplying unconditionally would inflate suffix-less amounts 1000-fold.)
salary_parts = df['Salary Range'].str.extract(r'\$(\d+)(K)?-\s*\$(\d+)(K)?')
for salary_column, amount_group, suffix_group in (
    ('Lower Salary', 0, 1),
    ('Upper Salary', 2, 3),
):
    # 1000 where the "K" suffix was matched, 1 otherwise.
    multiplier = salary_parts[suffix_group].notna().map({True: 1000, False: 1})
    # Rows that do not match the pattern yield NaN here.
    df[salary_column] = pd.to_numeric(salary_parts[amount_group]) * multiplier
df = df.drop(columns=['Salary Range'])

# Split 'Experience' strings of the form "<n> to <m> Years" into numeric bounds.
experience_parts = df['Experience'].str.extract(r'(\d+)\s*to\s*(\d+)\s*Years')
df['Lower Experience'] = pd.to_numeric(experience_parts[0])
df['Upper Experience'] = pd.to_numeric(experience_parts[1])
df = df.drop(columns=['Experience'])



In [10]:
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [11]:
# Show the first rows followed by the (rows, columns) shape, one per line.
print(df.head(), df.shape, sep='\n')

             Job Id Qualifications    location           Country  Work Type  \
0  1089843540111562         M.Tech     Douglas       Isle of Man     Intern   
1   398454096642776            BCA    Ashgabat      Turkmenistan     Intern   
2   481640072963533            PhD       Macao  Macao SAR, China  Temporary   
3   688192671473044            PhD  Porto-Novo             Benin  Full-Time   
4   117057806156508            MBA    Santiago             Chile     Intern   

   Company Size Job Posting Date Preference                     Job Title  \
0         26801       2022-04-24     Female  Digital Marketing Specialist   
1        100340       2022-12-19     Female                 Web Developer   
2         84525       2022-09-14       Male            Operations Manager   
3        129896       2023-02-25     Female              Network Engineer   
4         53944       2022-10-11     Female                 Event Manager   

                        Role  ...  \
0       Social Media Mana

In [13]:
# Label-encode every object-dtype column in place.
# Each column gets its own fitted encoder, stored in `label_encoders` keyed by
# column name, so the integer codes can later be mapped back to the original
# values (or applied to new data). Refitting a single shared encoder would
# keep only the mapping of the last column processed.
label_encoder = LabelEncoder()
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    # `label_encoder` is rebound on each pass, so after the loop it still
    # refers to the encoder fitted on the last column.
    label_encoder = label_encoders[column] = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])



In [14]:
# Column to predict; every other remaining column is used as a feature.
target_column = 'Lower Salary'

# NOTE(review): 'Upper Salary' was parsed from the same 'Salary Range' field
# as the target and is still among the features — confirm this is intended.
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 30% of the rows; stratifying on y keeps the distribution of salary
# values the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0, stratify = y)

# Seed the tree as well as the split: an unseeded DecisionTreeClassifier can
# build a different tree on every run, so the reported accuracy would not be
# reproducible.
clf = DecisionTreeClassifier(random_state = 0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy ", accuracy_score(y_test, y_pred))

# Per-value row counts of the training target, to check class balance.
print(y_train.value_counts())

# Feature scaling
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

Accuracy  0.09064472129138819
Lower Salary
65000    102967
55000    102835
64000    102665
59000    102661
62000    102584
61000    102569
58000    102412
60000    102319
63000    102213
56000    102203
57000    101895
Name: count, dtype: int64


In [17]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor


# Random forest regressor with 100 trees, seeded for repeatability.
clf = RandomForestRegressor(random_state=42, n_estimators=100)
# clf = DecisionTreeClassifier()

# Ten shuffled folds with a fixed seed, so fold membership is repeatable.
kfold_validation = KFold(n_splits=10, random_state=42, shuffle=True)

# One score per fold (the estimator's default scorer), then mean and spread.
results = cross_val_score(clf, X, y, cv=kfold_validation)
print(results)
print("Results = ", results.mean(), "+/-", results.std())

In [None]:
# Disabled experiment: initialize and train the Gradient Boosting Regressor (this whole cell is commented out)
# model = GradientBoostingRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)

# # Calculate Mean Squared Error and R-squared for evaluation
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Print the results
# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")

In [16]:
# Stratified 5-fold cross-validation of a decision tree classifier.
# The imports are repeated here so this cell does not rely on another cell
# having been executed first.
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Stratification keeps the proportion of each target value the same in every
# fold, which matters most when the classes are imbalanced.
skfold = StratifiedKFold(n_splits=5)

# Seed the tree so the per-fold scores are reproducible between runs.
clf = DecisionTreeClassifier(random_state=0)
results = cross_val_score(clf, X, y, cv = skfold)
print(results)
print()
print ("Results = ", np.mean(results), "+/-", np.std(results))

[0.09027517 0.09171575 0.0903779  0.09137451 0.09100195]

Results =  0.09094905679686818 +/- 0.0005571292121757966
