In [93]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [148]:
#READ THE DATA FILE
df1 = pd.read_csv("Datasets/survey_results_public.csv")

In [None]:
#ADD COLUMNS HERE

In [149]:
 #CHOOSE COLUMNS HEREE
columns = [
    "EdLevel",
    "RemoteWork",
    "YearsCode",
    "YearsCodePro",
    "DevType",
    "OrgSize",
    "PurchaseInfluence",
    "TechList",
    "Country",
    "Currency",
    "AISelect",
    "AISent",
    "WorkExp",
    "Industry",
     "ConvertedCompYearly",
     "Employment"
]

In [150]:
df1 = df1[columns]

In [151]:
#DROP RECORDS WITH SALARY NONE
df1 = df1[df1["ConvertedCompYearly"].notnull()]
#FILL NA WITH 0
df1.fillna(0, inplace=True)

In [152]:
#CATEGORISE COLUMNS  INTO MAJORITY VALUES AND 'OTHER'
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map


country_map = shorten_categories(df1.Country.value_counts(), 400)
df1['Country'] = df1['Country'].map(country_map)
currency_map = shorten_categories(df1.Currency.value_counts(), 400)
df1['Currency'] = df1['Currency'].map(currency_map)

In [153]:
#CATEGORISE THE WORK EXPERIENCE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1['ExperienceCategory'] = pd.cut(df1['WorkExp'], bins=bins, labels=labels)

In [154]:
#CATEGORISE LESS THAN 1 YEAR AS 0 AND MORE THAN 50 AS 51 FOR YEARS OF CODE
df1['YearsCode'] = df1['YearsCode'].replace("Less than 1 year", 0)
df1['YearsCode'] = df1['YearsCode'].replace("More than 50 years", 51)

df1['YearsCodePro'] = df1['YearsCodePro'].replace("Less than 1 year", 0)
df1['YearsCodePro'] = df1['YearsCodePro'].replace("More than 50 years", 51)

In [155]:
#CATEGORISE YEARS OF CODE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1["YearsCode"] = df1["YearsCode"].astype(int)
df1["YearsCodePro"] = df1["YearsCodePro"].astype(int)
df1['YearsCodeCategory'] = pd.cut(df1['YearsCode'], bins=bins, labels=labels)
df1['YearsCodeProCategory'] = pd.cut(df1['YearsCodePro'], bins=bins, labels=labels)

In [None]:
#FILTER THE DATAFRAME
df1 = df1[df1["ConvertedCompYearly"] <= 250000]
df1 = df1[df1["ConvertedCompYearly"] >= 100]
df1 = df1[df1['Country'] != 'Other']
df1 = df1[df1["Currency"] != "Other"]

In [156]:
#MAKE COLUMNS STRING TYPE
for x in columns:
    df1[x] = df1[x].astype(str)

In [126]:
len(df1)

48019

In [157]:
#LABEL ENCODE THE COLUMNS
df_LE = df1
for i in columns:
    if i == "ConvertedCompYearly":
        continue
    le = LabelEncoder()
    df_LE[i] = le.fit_transform(df_LE[i])
    
X = df_LE[[
#     "Age",
 'AISelect',
 'OrgSize',
 'DevType',
 'YearsCodeCategory',
 'Industry',
 'ExperienceCategory', 
 'YearsCodeProCategory',
    "RemoteWork",
 'Currency',
 'Country',
    "EdLevel",
    "Employment"
]]

Y = df_LE["ConvertedCompYearly"]

In [158]:
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  result = np.asarray(self, dtype=dtype)
  result = np.asarray(self, dtype=dtype)
  result = np.asarray(self, dtype=dtype)
  result = np.asarray(self, dtype=dtype)
  result = np.asarray(self, dtype=dtype)


In [159]:
#INITIALISE MODELS
classifiers = {}

# Linear Regression
classifiers["LinearRegression"] = LinearRegression()

# Decision Tree
classifiers["DecisionTree"] = DecisionTreeRegressor()

# Random Forest
classifiers["RandomForest"] = RandomForestRegressor()

# XGBoost
classifiers["XGBoost"] = XGBRegressor()

# # LightGBM
# classifiers["LightGBM"] = LGBMRegressor()

# # CatBoost
# classifiers["CatBoost"] = CatBoostRegressor(silent=True)

In [160]:
# Create an empty DataFrame to store the results
columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']
df_models = pd.DataFrame(columns=columns)

# Loop through your regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)  # Calculate RMSE
    r2 = r2_score(y_test, predictions)

    row = {'Model': key,
           'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
           'MAE': mae,
           'MSE': mse,
           'RMSE': rmse,
           'R2': r2
           }

    df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)

              Model  Run Time (minutes)           MAE           MSE  \
0  LinearRegression                0.00  57205.388140  1.015713e+10   
2      RandomForest                0.14  49007.680878  2.114488e+10   
1      DecisionTree                0.00  59618.419746  1.103938e+11   
3           XGBoost                0.01  53424.661693  1.145923e+11   

            RMSE        R2  
0  100782.605007  0.040434  
2  145412.781439 -0.997601  
1  332255.616603 -9.429136  
3  338514.787000 -9.825774  
