# Salary Prediction Code

## Imports

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import pickle

# Visualization
import matplotlib.pyplot as plt

# Scikit-Learn Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Scikit-Learn Utilities
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Scikit-Learn Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Functions

In [2]:
# Shortens numbers for printing
def format_number(num):
    """Return *num* as a short human-readable string.

    Magnitudes of one million or more are rendered with one decimal and an
    ``M`` suffix, magnitudes of one thousand or more with one decimal and a
    ``K`` suffix, and anything smaller is returned as ``str(num)``.

    Negative values are abbreviated by magnitude and keep their sign
    (``-2500`` -> ``"-2.5K"``).

    Args:
        num: A real number (built-in ``int``/``float`` or a NumPy scalar).

    Returns:
        The formatted string.
    """
    magnitude = abs(num)
    sign = "-" if num < 0 else ""

    if magnitude >= 1_000_000:
        return f"{sign}{magnitude / 1_000_000:.1f}M"
    if magnitude >= 1_000:
        thousands = f"{magnitude / 1_000:.1f}"
        # Rounding to one decimal can push values just under a million up to
        # "1000.0"; report those as 1.0M instead of the odd-looking "1000.0K".
        if thousands == "1000.0":
            return f"{sign}1.0M"
        return f"{sign}{thousands}K"
    return str(num)

# Helps us get rid of countries that have too little data
def cut_cat(category_data, threshold):
    """Map each category to itself or to ``'Other'`` depending on its count.

    Args:
        category_data: A pandas Series whose index holds the category labels
            and whose values hold the per-category counts (for example the
            result of ``value_counts()``).
        threshold: Minimum count a category needs to keep its own label.

    Returns:
        A dict with one entry per index label: the label itself when its
        count is at least ``threshold``, otherwise ``'Other'``.
    """
    labels = category_data.index
    mapping = {}
    for position, count in enumerate(category_data.values):
        label = labels[position]
        # Categories with insufficient data are pooled under "Other".
        mapping[label] = label if count >= threshold else 'Other'
    return mapping

# Simplifies the education-level strings
def short_ed(x):
    """Collapse a verbose education answer into one of four short labels.

    Args:
        x: The education-level answer as a string.

    Returns:
        ``'Bachelor’s degree'``, ``'Master’s degree'``, ``'Post grad'`` or,
        when none of the known fragments occurs in *x*,
        ``'Less than a Bachelors'``.
    """
    # Rules are checked in order; the first one with a matching fragment wins.
    rules = (
        (('Bachelor’s degree',), 'Bachelor’s degree'),
        (('Master’s degree',), 'Master’s degree'),
        (('Professional degree', 'Other doctoral'), 'Post grad'),
    )
    for fragments, label in rules:
        if any(fragment in x for fragment in fragments):
            return label
    return 'Less than a Bachelors'

## Reading Dataset

In [None]:
# Load the raw survey export; the relative path is resolved against the
# current working directory.
df = pd.read_csv("survey_results_public.csv")
df.head()  # preview the first rows

We want to keep only the columns that will help us train our model: the independent variables plus the salary we want to predict.

In [None]:
# Restrict the frame to the columns used downstream and give the
# compensation column the shorter name "Salary".
keep = ["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedCompYearly"]
df = df[keep].rename(columns={"ConvertedCompYearly": "Salary"})
df.head()

## Data cleaning

We will delete the rows where no salary was provided.

In [None]:
# Discard every row whose Salary value is missing.
df = df.dropna(subset=['Salary'])
df.head()

In [None]:
# Count the remaining missing values per column.
df.isnull().sum()

We still need to get rid of some rows that are missing values.

Another approach is to fill in those missing values with the mean of the values.

In [None]:
# Drop every row that still has a missing value in any column, then
# re-count the missing values per column to check the result.
df = df.dropna()
df.isnull().sum()

We are just going to work with those **full-time** employees.

In [None]:
# Case-insensitive substring match on "full-time"; rows with a missing
# Employment value count as non-matching (na=False).
full_time_mask = df["Employment"].str.contains("full-time", case=False, na=False)

# Keep the matching rows and drop the column, which is not needed for the prediction.
df = df[full_time_mask].drop(columns="Employment")
df.info()

For some countries we have only one data point, which is too little to learn anything from.

In [None]:
# Number of rows per country.
df['Country'].value_counts()

In [10]:
# Build a country -> label mapping: countries with at least 400 rows keep
# their own name, every other country is mapped to 'Other' (see cut_cat).
country_map = cut_cat(df.Country.value_counts(), 400)

In [None]:
# Replace each country by its mapped label, then re-count to check the result.
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

In [None]:
# Box plot of the Salary column, one box per Country value.
fig, ax = plt.subplots(1,1, figsize=(12, 7))
df.boxplot('Salary', 'Country', ax=ax)
plt.suptitle('Salary (US$) v Country')
plt.title('')  # presumably clears an automatic per-axes title -- TODO confirm
plt.ylabel('Salary')
plt.xticks(rotation=90)  # draw the x tick labels vertically
plt.show()

In [14]:
# Keep salaries inside the inclusive 10,000-250,000 band and discard the
# catch-all 'Other' country bucket, using a single combined row mask.
df = df[df["Salary"].between(10000, 250000) & (df['Country'] != 'Other')]

In [None]:
# Same box plot as before (Salary per Country), redrawn on the current,
# filtered contents of df.
fig, ax = plt.subplots(1,1, figsize=(12, 7))
df.boxplot('Salary', 'Country', ax=ax)
plt.suptitle('Salary (US$) v Country')
plt.title('')  # presumably clears an automatic per-axes title -- TODO confirm
plt.ylabel('Salary')
plt.xticks(rotation=90)  # draw the x tick labels vertically
plt.show()

In [None]:
# Inspect the distinct values of the column to spot non-numeric entries.
df["YearsCodePro"].unique()

To work with numbers, we will convert the string entries in this column to numeric values.

```python
if "string x":
    return X
else:
    return X
```

In [17]:
# Convert the answers to floats. Two answers are text rather than digits:
# 'Less than 1 year' becomes 0.5 and 'More than 50 years' becomes 50.0
# (without the second rule that answer reaches float() and raises ValueError).
# NOTE(review): 'More than 50 years' is taken from the published survey
# answer options -- confirm against df["YearsCodePro"].unique().
df['YearsCodePro'] = df['YearsCodePro'].apply(
    lambda x: 0.5 if x == 'Less than 1 year'
    else 50.0 if x == 'More than 50 years'
    else float(x)
)


We do the same thing with "EdLevel" column.

In [None]:
# Inspect the distinct education-level answers before simplifying them.
df["EdLevel"].unique()

In [19]:
# Collapse every education answer into one of the short labels returned by short_ed.
df['EdLevel'] = df['EdLevel'].apply(short_ed)

In [None]:
# Check the distinct education labels after simplification.
df["EdLevel"].unique()

#### STREAMLIT HELP

In [None]:
# Collect the distinct education labels into a tuple and print one per line.
ed_tuple = tuple(df["EdLevel"].unique())

for ed in ed_tuple:
    print(ed)

In [None]:
# Collect the distinct country labels into a tuple and print one per line.
country_tuple = tuple(df["Country"].unique())

for country in country_tuple:
    print(country)

## Transform data

Notice that we are dealing with both strings and numbers. The models only work with numeric input, so we need to transform the string columns into numbers.

In [None]:
# Encode the education labels as numbers. The fitted encoder is kept in
# le_ed so the same mapping can be applied to new inputs later.
le_ed = LabelEncoder()
df['EdLevel'] = le_ed.fit_transform(df['EdLevel'])
df["EdLevel"].unique()  # show the resulting codes

In [None]:
# Encode the country labels as numbers. The fitted encoder is kept in
# le_country so the same mapping can be applied to new inputs later.
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
df["Country"].unique()  # show the resulting codes

## Model Training

We need to split the data into the features and the target.

In [24]:
# Separate the prediction target from the model inputs.
y = df["Salary"]                # target
X = df.drop(columns="Salary")   # features: every remaining column

### Regression problem

We are not predicting a defined category (cat/dog); we are predicting a number, so this is a regression problem, not a classification problem.

In [None]:
# Fit a linear regression on all rows of X against the salary values.
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)

In [26]:
# Predict on the same rows the model was fitted on.
y_pred = linear_reg.predict(X)

Notice that our salary range is from 10K to 250K.

In [None]:
# Root-mean-squared error between the true salaries and the predictions.
# NOTE: y_pred was produced on the rows the model was fitted on, so this is
# a training error, not an estimate of performance on unseen data.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"We are getting around -> ${format_number(error)}")

### Decision Tree

In [None]:
# Fit a decision-tree regressor on all rows; random_state is fixed so
# repeated runs build the same tree.
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.values)

In [29]:
# Predict on the same rows the tree was fitted on.
y_pred = dec_tree_reg.predict(X)

In [None]:
# Root-mean-squared error of the decision tree's predictions.
# NOTE: measured on the rows the model was fitted on (training error).
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"We are getting around -> ${format_number(error)}")

### Random Forest

In [None]:
# Fit a random-forest regressor on all rows; random_state is fixed for
# reproducible results.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y.values)

In [32]:
# Predict on the same rows the forest was fitted on.
y_pred = random_forest_reg.predict(X)

In [None]:
# Root-mean-squared error of the random forest's predictions.
# NOTE: measured on the rows the model was fitted on (training error).
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"We are getting around -> ${format_number(error)}")

**GRIDSEARCH**

We can use GridSearchCV to find the best parameters for our models

In [None]:
# Candidate values for the tree's max_depth hyper-parameter.
max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

# Try every candidate depth on a fresh decision tree, scoring each one by
# negative mean squared error.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X, y.values)

After doing so, we are going to test it again.

In [None]:
# Take the best estimator found by the grid search.
regressor = gs.best_estimator_

# Fit it on all rows and report its RMSE.
# NOTE: the error is measured on the rows the model was fitted on.
regressor.fit(X, y.values)
y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"We are getting around -> ${format_number(error)}")

In [None]:
# Display the feature matrix.
X

Imagine we are getting a new row with the following characteristics (me).

In [None]:
# One hand-written sample with the fields in this order:
# country, education level, years of professional coding.
# NOTE: this rebinds X, so the training feature matrix is no longer
# reachable under that name.
X = np.array([["Spain", 'Bachelor’s degree', 1 ]])
X

We apply the label encoders for country & education level.

In [None]:
# Replace the two text fields with the codes from the fitted encoders
# (column 0: country, column 1: education level).
X[:, 0] = le_country.transform(X[:,0])
X[:, 1] = le_ed.transform(X[:,1])
X = X.astype(float)  # cast the whole array to float
X

In [None]:
# Predict the salary for the encoded sample and print it in short form.
y_pred = regressor.predict(X)
print(f"The predicted salary is ${format_number(y_pred[0])}")

## Exporting our model

Saving model

In [40]:
# Bundle the model together with both label encoders and write the bundle
# to disk as a single pickle file.
data = {"model": regressor, "le_country": le_country, "le_education": le_ed}
with open('salary_pred_model.pkl', 'wb') as file:
    pickle.dump(data, file)

Open model

In [41]:
# Read the bundle back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open('salary_pred_model.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the bundle; the two encoder names are rebound to the loaded objects.
regressor_loaded = data["model"]
le_country = data["le_country"]
le_ed = data["le_education"]

Let's run the prediction with our loaded model. We should get the same value.

In [None]:
# Predict again with the model restored from disk and compare the result
# with the prediction made by the in-memory model.
y_pred2 = regressor_loaded.predict(X)
# np.array_equal compares the arrays as a whole and returns a single bool.
# A bare `y_pred == y_pred2` is element-wise, and using it in an `if` raises
# ValueError as soon as more than one row is predicted.
if np.array_equal(y_pred, y_pred2):
    print("We are getting the same value.")
    print(f"${format_number(y_pred[0])}") # 38.8K
else:
    print("Different value this time.")
    print(f"The new predicted salary is ${format_number(y_pred2[0])}")