In [128]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Assignment 2 - Regression


### Load Data

In [129]:
# Load Data
df = pd.read_csv("data/Euro_Salary.csv")

# Create target variable of total compensation
df["Bonus"] = pd.to_numeric(df["Yearly bonus + stocks in EUR"], downcast="float", errors="coerce")
df["Bonus"].fillna(0, inplace=True)
df["target"] = df["Yearly brutto salary (without bonus and stocks) in EUR"] + df["Bonus"]
df.drop(columns={"Timestamp","Yearly brutto salary (without bonus and stocks) in EUR", "Yearly bonus + stocks in EUR", "Bonus"}, inplace=True)

# Remove rows with more than 7 NaN values
NaN_threshold = 7
df = df[df.isnull().sum(axis=1) <= NaN_threshold]

### Data Cleaning/Prep

In [130]:
# Finding the mode for "Gender" and "Company size" and using that to fill NaN values in those columns
gender_mode = df['Gender'].mode()[0] 
df['Gender'].fillna(gender_mode, inplace=True)

company_mode = df['Company size'].mode()[0] 
df['Company size'].fillna(company_mode, inplace=True)

# Replacing low frequency values with "Other"
def replace_low_freq(d, col, threshold=10, replacement='other'):
    value_counts = d[col].value_counts() # Specific column 
    to_remove = value_counts[value_counts <= threshold].index
    tmp = d[col].replace(to_replace=to_remove, value=replacement)
    return tmp

# Using low frequency function on each categorical column, then filling NaN values with "Other"
df["Seniority level"] = replace_low_freq(df, "Seniority level", 5, "Other")
df["Seniority level"].fillna("Other", inplace=True)

df["City"] = replace_low_freq(df, "City", 10, "Other")
df["City"].fillna("Other", inplace=True)

df["Your main technology / programming language"] = replace_low_freq(df, "Your main technology / programming language", 10, "Other")
df["Your main technology / programming language"].fillna("Other", inplace=True)

df["Other technologies/programming languages you use often"] = replace_low_freq(df, "Other technologies/programming languages you use often", 10, "Other")
df["Other technologies/programming languages you use often"].fillna("Other", inplace=True)

df["Position "] = replace_low_freq(df, "Position ", 10, "Other")
df["Position "].fillna("Other", inplace=True)

df["Company type"] = replace_low_freq(df, "Company type", 10, "Other")
df["Company type"].fillna("Other", inplace=True)

df["Main language at work"] = replace_low_freq(df, "Main language at work", 10, "Other")
df["Main language at work"].fillna("Other", inplace=True)

df["Employment status"] = replace_low_freq(df, "Employment status", 3, "Other")
df["Employment status"].fillna("Other", inplace=True)

df["Contract duration"] = replace_low_freq(df, "Contract duration", 3, "Other")
df["Contract duration"].fillna("Other", inplace=True)

In [131]:
# Using an imputer to take the mean of the numerical columns for NaN values
mean_columns = ["Age", "Total years of experience", "Years of experience in Germany", "Number of vacation days"]
imputer = SimpleImputer(strategy='mean')

# Replacing non-numerical symbols in data
replacement = {",":".", "<":" "}
df['Total years of experience'] = df['Total years of experience'].replace(replacement, regex=True)
df['Years of experience in Germany'] = df['Years of experience in Germany'].replace(replacement, regex=True)

# Converting column values to numerical to remove words
df['Total years of experience'] = pd.to_numeric(df['Total years of experience'], errors='coerce')
df['Years of experience in Germany'] = pd.to_numeric(df['Years of experience in Germany'], errors='coerce')
df['Number of vacation days'] = pd.to_numeric(df['Number of vacation days'], errors='coerce')

# Calculate and apply mean of column
df[mean_columns] = imputer.fit_transform(df[mean_columns])

In [132]:
# Ordinal encoding for "Seniority level"
seniority_order = {'Other': 0, 'Junior': 1, 'Middle': 2, 'Senior': 3, 'Lead': 4, 'Head': 5}
df['Seniority level'] = df['Seniority level'].map(seniority_order)

### Feature Selection

In [133]:
# Feature selection using Variance Threshold
df_vf = pd.get_dummies(df, drop_first=True)
y_vf = df_vf["target"]
X_vf = df_vf.drop(columns={"target"})

var_th = VarianceThreshold(.2)

post_vt = var_th.fit_transform(X_vf)
print(post_vt.shape)

mask = var_th.get_support()
new_features = X_vf.columns[mask]
print(new_features)

(1247, 9)
Index(['Age', 'Total years of experience', 'Years of experience in Germany',
       'Seniority level', 'Number of vacation days',
       'Position _Software Engineer',
       'Your main technology / programming language_Other',
       'Company size_101-1000', 'Company type_Product'],
      dtype='object')


In [134]:
# New data frame after results of variance threshold feature selection
df_fs = df.copy()
df_fs.drop(columns={"Gender", "Other technologies/programming languages you use often", "Employment status", "Main language at work", "Contract duration"}, inplace=True)

### Model Testing

In [141]:
# Set up data for model testing
df1 = pd.get_dummies(df, drop_first=True)

y = np.array(df1["target"]).reshape(-1,1)
X = np.array(df1.drop(columns={"target"}))

xTrain,xTest,yTrain,yTest = train_test_split(X,y,test_size=.3)

In [142]:
# Set up data for model testing after feature selection
df_new = pd.get_dummies(df_fs, drop_first=True)

y_fs = np.array(df_new["target"]).reshape(-1,1)
X_fs = np.array(df_new.drop(columns={"target"}))

xTrain_fs,xTest_fs,yTrain_fs,yTest_fs = train_test_split(X_fs,y_fs,test_size=.3)

In [143]:
#Regression Tree Model

scalar = MinMaxScaler()
regression_tree = DecisionTreeRegressor()
pipe = Pipeline(steps=[('scalar', scalar), ("regression_tree", regression_tree)])

tree_para = {'regression_tree__min_samples_leaf':[4,5,6,7,8],
            'regression_tree__min_samples_split':[4,5,6,7,8],
            'regression_tree__max_depth':[2,3,4,5,6,7],
            'regression_tree__criterion':["friedman_mse", "poisson", "squared_error", "absolute_error"]}

model = GridSearchCV(pipe, param_grid=tree_para, cv=5, n_jobs=-1)
model.fit(xTrain, yTrain)
model_tr = model.best_estimator_


print(model_tr)

print("Train Score with no feature selection:", model.score(xTrain, yTrain))
model_preds = model.predict(xTest)
print("RMSE with no feature selection:", mean_squared_error(yTest,model_preds,squared=False))
print("R2 with no feature selection:", np.mean(cross_val_score(model_tr, xTrain, yTrain.ravel(), cv=5)))



Pipeline(steps=[('scalar', MinMaxScaler()),
                ('regression_tree',
                 DecisionTreeRegressor(criterion='absolute_error', max_depth=3,
                                       min_samples_leaf=7,
                                       min_samples_split=4))])
Train Score with no feature selection: -0.0011589153013702447
RMSE with no feature selection: 38155.93185325165
R2 with no feature selection: 0.09263588866262706


In [144]:
#Regression Tree Model from data after feature selection

scalar_fs = MinMaxScaler()
regression_tree_fs = DecisionTreeRegressor()
pipe_fs = Pipeline(steps=[('scalar', scalar_fs), ("regression_tree", regression_tree_fs)])

tree_para_fs = {'regression_tree__min_samples_leaf':[4,5,6,7,8],
            'regression_tree__min_samples_split':[4,5,6,7,8],
            'regression_tree__max_depth':[2,3,4,5,6,7],
            'regression_tree__criterion':["friedman_mse", "poisson", "squared_error", "absolute_error"]}

model_fs = GridSearchCV(pipe_fs, param_grid=tree_para_fs, cv=5, n_jobs=-1)
model_fs.fit(xTrain_fs, yTrain_fs)
model_tr_fs= model_fs.best_estimator_


print(model_tr)

print("Train Score after feature selection:", model_fs.score(xTrain_fs, yTrain_fs))
model_preds_fs = model_fs.predict(xTest_fs)
print("RMSE after feature selection:", mean_squared_error(yTest_fs,model_preds_fs,squared=False))
print("R2 after feature selection:", np.mean(cross_val_score(model_tr_fs, xTrain_fs, yTrain_fs.ravel(), cv=5)))


Pipeline(steps=[('scalar', MinMaxScaler()),
                ('regression_tree',
                 DecisionTreeRegressor(criterion='absolute_error', max_depth=3,
                                       min_samples_leaf=7,
                                       min_samples_split=4))])
Train Score after feature selection: 0.05729711774014179
RMSE after feature selection: 284015832.4165602
R2 after feature selection: 0.05522113574581004


# Answers and Explanations

### Results
- For my modelling I used a Regression Tree model with a MinMaxScalar, after using a OneHotEncoder to encode the categorical values through get_dummies. 
- The results of the model were pretty low for accuracy when using the intial data without any feature selection. For my intitial test R2 was 0.092 and RMSE was 38155.93. After using feature selection the results were an R2 of 0.055 and RMSE of 284015832.41. While the accuracy decreased slightly there was an increase in the errors, indicatng that the extra column may have been useful to the accuracy of the model, however the accuracy is still very low. Overall the issue of low accuracy and high errors may result from the way the data was imputed or cleaned initially, or perhaps a different and more complex model should be used.

### Feature Selection Activities
- For feature selection I used two methods: Inspection and Variance Threshold. 
- Through an initial inspection of the data and how it relates to the real situation, it makes sense that columns such as "Gender", "Contract duration", "Main language at work", and "Other technologies/programming languages you use often" would not really affect someones salary or bonuses in a job environment. The important columns for the model would definitely be relating to position, years of experience, and seniority level as these often affect someone's salary.
- By using variance threshold, the columns that had low variance and therefore could be removed were "Gender", "Other technologies/programming languages you use often", "Employment status", "Main language at work", and "Contract duration".
- Combining the results of both these feature selections lead to my new dataframe used for the second set of modelling.

### Hyperparameter Changes
- To reduce overfitting on the model I adjusted a few hyperparameters with the regression tree model, using a grid search to find the best combination. The parameters I used were min samples leaf and split, max depth, and criterion. The results found that having a lower depth and samples, as well as using absolute error would produce the best results for the model.