In [2]:
import pandas as pd
import re
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import wandb


In [3]:
data_path = "SoftwareEngineer_cleaned.csv"
df = pd.read_csv(data_path)
print(df.shape)
df.head()

(34012, 12)


Unnamed: 0,title,level,company,yearsofexperience,totalyearlycompensation,gender,Doctorate_Degree,Masters_Degree,Bachelors_Degree,tag,state,city
0,Software Engineer,SE 2,eBay,5.0,100000,,0,0,0,,CA,San Francisco
1,Software Engineer,60,Microsoft,5.0,157000,,0,0,0,,CA,Mountain View
2,Software Engineer,63,Microsoft,8.5,208000,,0,0,0,,WA,Seattle
3,Software Engineer,62,Microsoft,4.0,156000,,0,0,0,,WA,Seattle
4,Software Engineer,59,Microsoft,3.0,120000,,0,0,0,,WA,Redmond


In [4]:
headers1 = ["company", "yearsofexperience", "Doctorate_Degree", "Masters_Degree", "Bachelors_Degree", "state", "city", "tag", "gender", 'level', "totalyearlycompensation"]
cat = ["company", "state", "city", "tag", "gender", 'level']
df_clean = df[headers1]
df_clean

Unnamed: 0,company,yearsofexperience,Doctorate_Degree,Masters_Degree,Bachelors_Degree,state,city,tag,gender,level,totalyearlycompensation
0,eBay,5.0,0,0,0,CA,San Francisco,,,SE 2,100000
1,Microsoft,5.0,0,0,0,CA,Mountain View,,,60,157000
2,Microsoft,8.5,0,0,0,WA,Seattle,,,63,208000
3,Microsoft,4.0,0,0,0,WA,Seattle,,,62,156000
4,Microsoft,3.0,0,0,0,WA,Redmond,,,59,120000
...,...,...,...,...,...,...,...,...,...,...,...
34007,Google,10.0,0,0,0,WA,Seattle,Distributed Systems (Back-End),,T4,327000
34008,Microsoft,2.0,0,0,0,WA,Redmond,Full Stack,,62,237000
34009,MSFT,14.0,0,0,0,WA,Seattle,Full Stack,,63,220000
34010,Salesforce,8.0,0,0,0,CA,San Francisco,iOS,,Lead MTS,280000


In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_clean[cat]), index=df_clean.index)

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(cat)
encode_df.head()

Unnamed: 0,company_10x Genomics,company_23andMe,company_2U,company_3M,company_3m,company_7-Eleven,company_7-eleven,company_8x8,company_ABB,company_ADP,...,level_unknown,level_vice president,level_x2,level_x4,level_x5,level_x6,level_x7,level_z4,level_||,level_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df_clean = df_clean.merge(encode_df,left_index=True, right_index=True)
df_clean = df_clean.drop(cat,1)
df_clean# Merge one-hot encoded features and drop the originals
df_clean

  df_clean = df_clean.drop(cat,1)


Unnamed: 0,yearsofexperience,Doctorate_Degree,Masters_Degree,Bachelors_Degree,totalyearlycompensation,company_10x Genomics,company_23andMe,company_2U,company_3M,company_3m,...,level_unknown,level_vice president,level_x2,level_x4,level_x5,level_x6,level_x7,level_z4,level_||,level_nan
0,5.0,0,0,0,100000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,0,0,0,157000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.5,0,0,0,208000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0,0,0,156000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0,0,0,120000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34007,10.0,0,0,0,327000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34008,2.0,0,0,0,237000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34009,14.0,0,0,0,220000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34010,8.0,0,0,0,280000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Split our preprocessed data into our features and target arrays
y = df_clean.totalyearlycompensation
X = df_clean.drop(columns=["totalyearlycompensation"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, train_size=0.8)

In [8]:
X_train = torch.from_numpy(X_train.values).float()
y_train = torch.from_numpy(y_train.values).float()
X_test = torch.from_numpy(X_test.values).float()
y_test = torch.from_numpy(y_test.values).float()

In [9]:
wandb.init(project="XGb-regression")
xgb = XGBRegressor(n_estimators=200, max_depth=9, eta=0.175, subsample=0.6, colsample_bytree=0.85)

for i in range(0, 200, 10):
    xgb.set_params(n_estimators=i)
    xgb.fit(X_train, y_train)

    # Make predictions using the trained model on the training and test data
    y_train_pred = xgb.predict(X_train)
    y_test_pred = xgb.predict(X_test)

    # Calculate the R2 and MAPE on the training and test data
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
    # Log the results to WandB
    wandb.log({"Training R2": r2_train, "Test R2": r2_test,
               "Training MAPE": mape_train, "Test MAPE": mape_test})

    # Print the results
    print("Iteration: ", i)
    print("Training R2: ", r2_train)
    print("Test R2: ", r2_test)
    print("Training MAPE: ", mape_train)
    print("Test MAPE: ", mape_test)

[34m[1mwandb[0m: Currently logged in as: [33mpedi4tor[0m ([33maicyber[0m). Use [1m`wandb login --relogin`[0m to force relogin


Iteration:  0
Training R2:  -3.9147683354527327
Test R2:  -3.8877100353686496
Training MAPE:  0.99999726
Test MAPE:  0.99999726
Iteration:  10
Training R2:  0.5296156907381142
Test R2:  0.4981051151544408
Training MAPE:  0.20869137
Test MAPE:  0.21544084
Iteration:  20
Training R2:  0.6953733106723645
Test R2:  0.6468884434288422
Training MAPE:  0.20430614
Test MAPE:  0.21458064
Iteration:  30
Training R2:  0.7341292610173598
Test R2:  0.675282447863947
Training MAPE:  0.19884
Test MAPE:  0.2106414
Iteration:  40
Training R2:  0.7543727902943247
Test R2:  0.6911283069660442
Training MAPE:  0.19272944
Test MAPE:  0.20518553
Iteration:  50
Training R2:  0.7704155604360973
Test R2:  0.7018786788203113
Training MAPE:  0.1868625
Test MAPE:  0.2000483
Iteration:  60
Training R2:  0.7844768037517951
Test R2:  0.7113961628721064
Training MAPE:  0.18175963
Test MAPE:  0.19522275
Iteration:  70
Training R2:  0.7962041696395992
Test R2:  0.7203678971950969
Training MAPE:  0.17686298
Test MAPE:  0

KeyboardInterrupt: 