In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Columns that are not used anywhere in the rest of this notebook.
unused_columns = ['latitude', 'longitude', 'Contact', 'Contact Person']

# Read the postings with the tolerant Python parser: malformed rows are
# reported as warnings and skipped instead of aborting the load. The unused
# columns are discarded straight away.
df = pd.read_csv(
    'job_descriptions.csv',
    engine="python",
    on_bad_lines='warn',
).drop(columns=unused_columns)







In [None]:
# Split 'Salary Range' into numeric lower/upper bounds.
# The pattern captures only the digits of each bound (the "K" suffix is
# matched but not captured), and every value is then scaled by 1000.
# Rows whose text does not match the pattern end up as NaN.
salary_bounds = df['Salary Range'].str.extract(r'\$(\d+)(?:K)?-\s*\$(\d+)(?:K)?')
for group_index, salary_column in enumerate(['Lower Salary', 'Upper Salary']):
    df[salary_column] = pd.to_numeric(salary_bounds[group_index]) * 1000
df = df.drop(columns=['Salary Range'])

# Split 'Experience' ("<low> to <high> Years") into numeric bounds in the
# same way; no rescaling is applied here.
experience_bounds = df['Experience'].str.extract(r'(\d+)\s*to\s*(\d+)\s*Years')
for group_index, experience_column in enumerate(['Lower Experience', 'Upper Experience']):
    df[experience_column] = pd.to_numeric(experience_bounds[group_index])
df = df.drop(columns=['Experience'])



In [None]:
# Missing-value policy: discard every row that has at least one missing
# field. In notebook order this also removes rows whose salary or experience
# text did not match the extraction patterns, since those were left as NaN.
df = df.dropna(axis=0, how='any')

In [2]:
# Quick sanity check: first five rows followed by the (rows, columns) shape.
# NOTE(review): the saved output of this cell shows integer-coded categorical
# columns, so it appears to have been executed after the label-encoding cell;
# re-run the notebook top to bottom to refresh it.
print(df.head(), df.shape, sep="\n")

             Job Id  Qualifications  location  Country  Work Type  \
0  1089843540111562               6        59       92          2   
1   398454096642776               4        11      198          2   
2   481640072963533               9       102      114          4   
3   688192671473044               9       152       20          1   
4   117057806156508               7       172       39          2   

   Company Size  Job Posting Date  Preference  Job Title  Role  ...  \
0         26801               221           1         31   312  ...   
1        100340               460           1        145   147  ...   
2         84525               364           2         90   273  ...   
3        129896               528           1         83   375  ...   
4         53944               391           1         38    60  ...   

   Job Description  Benefits  skills  Responsibilities  Company  \
0              333         4     307               218      425   
1              221      

In [None]:
# Label Encoding for categorical variables
#
# Each object-dtype column is replaced by integer codes. A separate encoder is
# fitted per column and stored in `label_encoders` (keyed by column name), so
# the codes can later be mapped back to the original categories with
# `label_encoders[column].inverse_transform(...)`. Re-fitting one shared
# encoder would keep only the mapping of the last column processed.
label_encoders = {}
label_encoder = LabelEncoder()  # stays bound even when there is nothing to encode
for column in df.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder



In [None]:
# Define your target variable
target_column = 'Lower Salary'

# 'Upper Salary' is parsed from the same 'Salary Range' string as the target.
# It would not be known for a posting whose salary is being predicted, so
# keeping it as a feature leaks target information into the model.
leakage_columns = [name for name in ['Upper Salary'] if name in df.columns]

# Split the data into features and target
# NOTE(review): 'Job Id' looks like a row identifier rather than a predictive
# feature; confirm whether it should be dropped as well.
X = df.drop(columns=[target_column] + leakage_columns)
y = df[target_column]

# Stratified train-test split (80/20, seeded for repeatability).
# `stratify=y` treats every distinct target value as a class, so each value
# must occur at least twice or the split raises a ValueError.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling: statistics are learned from the training split only and
# then applied to the test split, so no test information reaches the scaler.
# Both results are NumPy arrays, so column names are no longer attached.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a 100-tree gradient-boosting regressor (seeded for repeatability) on the
# training split; `fit` returns the estimator itself, so it can be chained.
model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")