In [30]:
#IMPORT MODULES
import pandas as pd
import numpy as np
import time
# MODEL SELECTION, METRICS, PREPROCESSING AND REGRESSION MODELS
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [31]:
# Load the dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Updated Dataset.csv")

In [32]:
def shorten_categories(categories, cutoff):
    """Build a mapping that folds rare categories into 'Other'.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label (e.g. the result of ``value_counts()``).
    cutoff : number
        Minimum count a category needs in order to keep its own label.

    Returns
    -------
    dict
        ``{label: label}`` for categories whose count is >= ``cutoff`` and
        ``{label: 'Other'}`` for all remaining categories.
    """
    labels = categories.index
    counts = categories.values
    # Positional lookups keep each label paired with its own count.
    return {
        labels[pos]: (labels[pos] if counts[pos] >= cutoff else 'Other')
        for pos in range(len(categories))
    }

In [33]:
# Fold countries with fewer than 400 rows into a single 'Other' bucket,
# then display the resulting distribution.
country_map = shorten_categories(df['Country'].value_counts(), cutoff=400)
df['Country'] = df['Country'].map(country_map)
df['Country'].value_counts()

Country
Other    9626
177      8005
59       2682
175      2305
72       1524
30       1422
56       1074
131       918
23        915
115       837
8         768
154       764
79        712
159       632
160       431
Name: count, dtype: int64

In [34]:
# Keep rows whose salary lies in [10000, 250000] (both ends inclusive) and
# whose country was not folded into the 'Other' bucket.
df = df[
    df["Salary"].between(10000, 250000)
    & (df['Country'] != 'Other')
]

In [35]:
# Columns used as model inputs.
X = df.loc[:, [
    "Age", "RemoteWork", "EdLevel", "YearsCode",
    "Country", "WorkExp", "Industry",
]]

In [36]:
# Regression target.
Y = df.loc[:, 'Salary']

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [38]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
# NOTE(review): Country / RemoteWork look like integer category codes in the
# value_counts output; standardising them treats the codes as ordered numbers —
# confirm this is intended (one-hot encoding may be more appropriate).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Registry of regression models to benchmark, keyed by display name.
# The variable is called `classifiers` for historical reasons; every entry is a
# regressor. The name is kept because the evaluation cell iterates over it.
classifiers = {}

# Linear Regression (deterministic, no seed needed)
classifiers["LinearRegression"] = LinearRegression()

# Decision Tree — seeded so tie-breaking between equally good splits is
# repeatable and the reported metrics do not drift between runs.
classifiers["DecisionTree"] = DecisionTreeRegressor(random_state=42)

# Random Forest — seeded so bootstrap sampling and feature sub-sampling are
# repeatable between runs.
classifiers["RandomForest"] = RandomForestRegressor(random_state=42)

# Optional gradient-boosting models; uncomment to include them in the benchmark.
# # XGBoost
# classifiers["XGBoost"] = XGBRegressor()

# # LightGBM
# classifiers["LightGBM"] = LGBMRegressor()

# # CatBoost
# classifiers["CatBoost"] = CatBoostRegressor(silent=True)

In [40]:
# Benchmark every registered model on the held-out split and tabulate the
# regression metrics, best R2 first.
columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']

# Collect one record per model and build the DataFrame once at the end;
# concatenating onto an empty frame inside the loop is quadratic and emits a
# pandas FutureWarning about empty-entry concatenation.
rows = []

# Loop through the regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN MODEL ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    # MAKE PREDICTIONS USING CURRENT MODEL
    predictions = clf.predict(X_test_scaled)
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    # RMSE is derived from MSE directly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    rows.append({'Model': key,
                 # Elapsed time covers fit, predict and metric computation.
                 'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
                 'MAE': mae,
                 'MSE': mse,
                 'RMSE': rmse,
                 'R2': r2
                 })

# Passing `columns` fixes the column order and keeps the frame well-formed
# (so the sort below still works) even if no models are registered.
df_models = pd.DataFrame(rows, columns=columns)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)

              Model  Run Time (minutes)           MAE           MSE  \
2      RandomForest                0.04  31306.268119  1.802707e+09   
0  LinearRegression                0.00  41958.498763  2.719233e+09   
1      DecisionTree                0.00  39786.995706  2.983569e+09   

           RMSE        R2  
2  42458.302305  0.435426  
0  52146.264453  0.148387  
1  54622.055706  0.065602  


Country
Other    9626
177      8005
59       2682
175      2305
72       1524
30       1422
56       1074
131       918
23        915
115       837
8         768
154       764
79        712
159       632
160       431
Name: count, dtype: int64

In [12]:
# Distribution of the RemoteWork codes.
df.loc[:, 'RemoteWork'].value_counts()

RemoteWork
2    14199
0    13863
1     4527
3       26
Name: count, dtype: int64