In [47]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [48]:
#READ THE DATA FILE
df1 = pd.read_csv("../Datasets/survey_results_public.csv")

In [49]:
#ADD COLUMNS HERE
import pandas as pd
df1 = pd.read_csv("../Datasets/survey_results_public.csv")
column_name = "Employment"
st = set()
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        st.add(ele)
for ele in st:
    df1[ele] = 0
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        df1.loc[i,ele] = 1

In [None]:
#CHOOSE COLUMNS TO TRAIN ON

train_columns = [
#     "Age",
 'AISelect',
 'OrgSize',
 'DevType',
 'YearsCodeCategory',
 'Industry',
 'ExperienceCategory', 
 'YearsCodeProCategory',
    "RemoteWork",
 'Currency',
 'Country',
    "EdLevel"
]

for ele in st:
    train_columns.append(ele)

In [50]:
#DROP RECORDS WITH SALARY NONE
df1 = df1[df1["ConvertedCompYearly"].notnull()]
#FILL NA WITH 0
df1.fillna(0, inplace=True)

In [51]:
#CATEGORISE COLUMNS  INTO MAJORITY VALUES AND 'OTHER'
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map


country_map = shorten_categories(df1.Country.value_counts(), 400)
df1['Country'] = df1['Country'].map(country_map)
currency_map = shorten_categories(df1.Currency.value_counts(), 400)
df1['Currency'] = df1['Currency'].map(currency_map)

In [52]:
#CATEGORISE THE WORK EXPERIENCE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1['ExperienceCategory'] = pd.cut(df1['WorkExp'], bins=bins, labels=labels)

In [53]:
#CATEGORISE LESS THAN 1 YEAR AS 0 AND MORE THAN 50 AS 51 FOR YEARS OF CODE
df1['YearsCode'] = df1['YearsCode'].replace("Less than 1 year", 0)
df1['YearsCode'] = df1['YearsCode'].replace("More than 50 years", 51)

df1['YearsCodePro'] = df1['YearsCodePro'].replace("Less than 1 year", 0)
df1['YearsCodePro'] = df1['YearsCodePro'].replace("More than 50 years", 51)

In [54]:
#CATEGORISE YEARS OF CODE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1["YearsCode"] = df1["YearsCode"].astype(int)
df1["YearsCodePro"] = df1["YearsCodePro"].astype(int)
df1['YearsCodeCategory'] = pd.cut(df1['YearsCode'], bins=bins, labels=labels)
df1['YearsCodeProCategory'] = pd.cut(df1['YearsCodePro'], bins=bins, labels=labels)

In [55]:
#FILTER THE DATAFRAME
df1 = df1[df1["ConvertedCompYearly"] <= 250000]
df1 = df1[df1["ConvertedCompYearly"] >= 100]
df1 = df1[df1['Country'] != 'Other']
df1 = df1[df1["Currency"] != "Other"]

In [56]:
#MAKE COLUMNS STRING TYPE
for x in df1.columns:
    df1[x] = df1[x].astype(str)

In [57]:
len(df1)

37524

In [59]:
#LABEL ENCODE THE COLUMNS

df_LE = df1
for i in train_columns:
    if i == "ConvertedCompYearly":
        continue
    le = LabelEncoder()
    df_LE[i] = le.fit_transform(df_LE[i])

X = df_LE[train_columns]

Y = df_LE["ConvertedCompYearly"]

In [60]:
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [61]:
#INITIALISE MODELS
classifiers = {}

# Linear Regression
classifiers["LinearRegression"] = LinearRegression()

# Decision Tree
classifiers["DecisionTree"] = DecisionTreeRegressor()

# Random Forest
classifiers["RandomForest"] = RandomForestRegressor()

# XGBoost
classifiers["XGBoost"] = XGBRegressor()

# # LightGBM
# classifiers["LightGBM"] = LGBMRegressor()

# # CatBoost
# classifiers["CatBoost"] = CatBoostRegressor(silent=True)

In [62]:
#CHECK IMPORTANCE
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, Y)
feature_importances = model.feature_importances_

map_ = {}

for i, a in enumerate(X):
    map_[a] = feature_importances[i]
    
sorted(map_.items(), key = lambda x: x[1])

[('Student, part-time', 0.0),
 ('Not employed, and not looking for work', 0.0),
 ('Student, full-time', 0.0),
 ('Not employed, but looking for work', 0.0),
 ('I prefer not to say', 0.0002527982921425521),
 ('Retired', 0.0006639597253675796),
 ('Employed, part-time', 0.00466642001322175),
 ('Employed, full-time', 0.009485332904982033),
 ('Independent contractor, freelancer, or self-employed',
  0.014389194313680333),
 ('RemoteWork', 0.028814848729277188),
 ('ExperienceCategory', 0.03259935873628199),
 ('AISelect', 0.03680907764414173),
 ('EdLevel', 0.0502087461176938),
 ('Industry', 0.050614633781622204),
 ('Currency', 0.058437800427732285),
 ('OrgSize', 0.07202430679353979),
 ('YearsCodeProCategory', 0.07776433958387495),
 ('YearsCodeCategory', 0.08553318417193767),
 ('DevType', 0.08641588787596331),
 ('Country', 0.3913201108885409)]

In [63]:
# Create an empty DataFrame to store the results
columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']
df_models = pd.DataFrame(columns=columns)

# Loop through your regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)  # Calculate RMSE
    r2 = r2_score(y_test, predictions)

    row = {'Model': key,
           'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
           'MAE': mae,
           'MSE': mse,
           'RMSE': rmse,
           'R2': r2
           }

    df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)

  df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)


              Model  Run Time (minutes)           MAE           MSE  \
3           XGBoost                0.00  24249.255035  1.140693e+09   
2      RandomForest                0.21  26320.208980  1.338425e+09   
0  LinearRegression                0.00  37228.174634  2.226287e+09   
1      DecisionTree                0.00  36291.107603  2.534464e+09   

           RMSE        R2  
3  33774.146514  0.634366  
2  36584.485147  0.570986  
0  47183.544996  0.286394  
1  50343.461965  0.187612  


  if is_sparse(data):
