In [1]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
# MODELLING: DATA SPLITTING, METRICS, PREPROCESSING AND ESTIMATORS
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Survey fields retained for the salary model. The eleven feature fields come
# first; the final entry, "ConvertedCompYearly", is the regression target.
_feature_fields = [
    "EdLevel", "Country", "AIBen", "WorkExp",
    "PurchaseInfluence", "YearsCode", "YearsCodePro",
    "Industry", "DevType", "OrgSize", "RemoteWork",
]
columns = [*_feature_fields, "ConvertedCompYearly"]

In [3]:
def shorten_categories(categories, cutoff):
    """Build a relabelling map that folds rare categories into 'Other'.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label (e.g. the result of
        ``Series.value_counts()``).
    cutoff : number
        Minimum count a label needs in order to keep its own name.

    Returns
    -------
    dict
        Maps every label in ``categories.index`` to itself when its count is
        at least ``cutoff``, and to the string 'Other' otherwise.
    """
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in categories.items()
    }

In [4]:
# Load the raw survey export.
df1 = pd.read_csv("Datasets/survey_results_public.csv")

# Keep respondents who reported a salary, restrict to the modelling fields
# listed in `columns`, then drop every row that still has a missing value.
df1 = df1.loc[df1["ConvertedCompYearly"].notnull(), columns].dropna()

# Countries with fewer than 400 remaining respondents are relabelled 'Other'.
country_map = shorten_categories(df1["Country"].value_counts(), 400)
df1["Country"] = df1["Country"].map(country_map)

# Disabled: the same bucketing for Currency (Currency is not among `columns`).
# currency_map = shorten_categories(df1.Currency.value_counts(), 400)
# df1["Currency"] = df1["Currency"].map(currency_map)

In [79]:
# Largest reported yearly compensation before any outlier filtering.
# Series.max() is vectorised and skips NaN, whereas the builtin max() walks the
# Series element by element in Python.
df1["ConvertedCompYearly"].max()

74351432.0

In [68]:
# Keep salaries within [100, 250000] (both bounds inclusive) and drop
# respondents whose country was folded into 'Other'.
_salary = df1["ConvertedCompYearly"]
_keep_rows = (_salary <= 250000) & (_salary >= 100) & (df1['Country'] != 'Other')
df1 = df1[_keep_rows]

In [8]:
# Persist the current state of df1 to disk; to_csv's defaults are used, so the
# DataFrame index is written as the first column.
df1.to_csv(path_or_buf="Filtered Dataset.csv")

In [5]:
# Label-encode every feature column into integer codes.
# Work on an independent copy: a plain `df_LE = df1` only aliases the frame, so
# the encoding would silently overwrite df1 as well (and re-running the cell
# would re-encode already-encoded values).
df_LE = df1.copy()

# Fitted encoder per column, kept so codes can be mapped back to the original
# labels (inverse_transform) or applied to new data later.
label_encoders = {}
for i in columns:
    if i == "ConvertedCompYearly":
        # The regression target stays numeric and is never encoded.
        continue
    le = LabelEncoder()
    df_LE[i] = le.fit_transform(df_LE[i])
    label_encoders[i] = le
# NOTE(review): WorkExp / YearsCode / YearsCodePro look numeric or ordinal by
# name; label encoding assigns codes by sorted label order, which may not match
# their natural order — confirm this is intended.

In [6]:
# Model inputs: the eleven encoded survey fields, in a fixed column order.
_feature_names = [
    "EdLevel", "Country", "AIBen", "WorkExp",
    "PurchaseInfluence", "YearsCode", "YearsCodePro",
    "Industry", "DevType", "OrgSize", "RemoteWork",
]
X = df_LE.loc[:, _feature_names]

# Regression target: the ConvertedCompYearly column.
Y = df_LE.loc[:, "ConvertedCompYearly"]

In [7]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
_split = train_test_split(X, Y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = _split

In [83]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [10]:
# Feed the raw (unscaled) splits to the models: this rebinds the *_scaled names
# to X_train / X_test, so any StandardScaler output assigned to them earlier is
# discarded from this point on.
X_train_scaled, X_test_scaled = X_train, X_test

In [11]:
# Regressors to benchmark, keyed by the label shown in the results table.
# (The dict keeps the name `classifiers`, although every entry is a regressor.)
# The scikit-learn tree models get a fixed random_state — the same seed as the
# train/test split — so repeated runs produce the same metrics.
classifiers = {
    # Linear Regression
    "LinearRegression": LinearRegression(),
    # Decision Tree
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    # Random Forest
    "RandomForest": RandomForestRegressor(random_state=42),
    # XGBoost
    "XGBoost": XGBRegressor(),
    # LightGBM
    "LightGBM": LGBMRegressor(),
    # CatBoost (disabled)
    # "CatBoost": CatBoostRegressor(silent=True),
}

In [None]:
# Column order of the results table. It has its own name so the module-level
# `columns` list of survey fields (used by the loading and encoding cells) is
# not overwritten when this cell runs.
metric_columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']

# One record per model; the DataFrame is built once after the loop instead of
# being grown with pd.concat on every iteration.
model_rows = []

# Fit, predict and score every regressor in `classifiers`.
for key, clf in classifiers.items():
    # Time only the fit + predict work, not the metric computation.
    start_time = time.time()
    clf.fit(X_train_scaled, y_train)
    predictions = clf.predict(X_test_scaled)
    run_minutes = round((time.time() - start_time) / 60, 2)

    # Regression metrics on the held-out split.
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    # RMSE is derived from MSE directly; the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    model_rows.append({'Model': key,
                       'Run Time (minutes)': run_minutes,
                       'MAE': mae,
                       'MSE': mse,
                       'RMSE': rmse,
                       'R2': r2
                       })

# Assemble the table and rank the models by R-squared, best first.
df_models = pd.DataFrame(model_rows, columns=metric_columns)
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)


In [18]:
# Frequency of each distinct value in the target column, most common first.
_target_values = df_LE["ConvertedCompYearly"]
_target_values.value_counts()

ConvertedCompYearly
2161    212
1871    167
3501    163
2416    155
2635    152
       ... 
672       1
3726      1
1401      1
3698      1
858       1
Name: count, Length: 3755, dtype: int64

In [86]:
# Largest target value in the encoded frame. Series.max() is vectorised and
# skips NaN, whereas the builtin max() walks the Series element by element.
# NOTE(review): the output recorded for this cell (74351432.0) is above the
# 250000 cap applied in the salary-filter cell, so it appears to come from a
# session in which that filter had not run — re-run top to bottom to confirm.
df_LE["ConvertedCompYearly"].max()

74351432.0