# Build a Random Forest Regression Model

Import libraries

In [21]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error


## Read the data

Read in the .csv file.  This .csv file has been reduced to only the necessary columns.  
  
Look at the data shape to determine the number of rows and columns.

In [25]:
#  import pandas as pd

# Location of the reduced Kaggle survey results file.
# NOTE(review): the recorded run of this cell ended with "HTTP Error 404: Not Found" --
# confirm the file still exists at this URL before running the rest of the notebook.
SURVEY_CSV_URL = 'https://raw.githubusercontent.com/mikecolbert/salary_prediction_model/refs/heads/main/2022_kaggle_survey_results_public.csv'

data = pd.read_csv(SURVEY_CSV_URL)

print(data.shape)  # rows, columns

data.head()  # look at the first few rows

HTTPError: HTTP Error 404: Not Found

<br>

## Clean the data

How many cells are NULL in each column?

In [30]:
# number of missing (NULL) cells in each column
data.isna().sum()

NameError: name 'data' is not defined

What proportion of the rows contain one or more missing (NULL) values?

In [33]:
# proportion of rows that have at least one missing value
total_rows = data.shape[0]
complete_rows = data.dropna().shape[0]  # rows with a value in every column
(total_rows - complete_rows) / total_rows
# in the author's run, 24% of the rows had missing values

NameError: name 'data' is not defined

We could impute the missing values with the mean of the column, but we are not focused on tuning, so we will just drop the rows with missing values.

In [None]:
# keep only the rows that have a value in every column (no NULLs)
data = data.dropna()

How has the data shape changed from the original?  
rows, columns  

In [None]:
# Shape after the rows with NULLs were dropped; compare with the shape printed when the file was read.
data.shape  # rows, columns

Look at the first few rows of the data frame.

In [None]:
# Preview the top rows of the cleaned data frame.
data.head()

<br>

## Prep the variables to use in our model

Split the data into x and y data frames separating independent and dependent variables for prediction.

In [None]:
# Columns used as model inputs (independent variables) and as the prediction target.
feature_columns = ['age','gender','country','highest_deg','code_experience','current_title', 'company_size']
target_columns = ['annual_comp']

# Take explicit copies: later cells add new columns to x and overwrite
# annual_comp in y, and those writes must never reach `data` or trigger
# pandas chained-assignment warnings about it.
x = data.loc[:, feature_columns].copy()
y = data.loc[:, target_columns].copy()

Review the first few rows of each data frame.

In [None]:
# Target frame: holds only the annual_comp column.
y.head()

In [None]:
# Feature frame: the seven selected predictor columns, not yet encoded.
x.head()

In the x data frame, map the categorical variables to numerical values for use in the regression model.

In [None]:
# Label-encode the categorical features so the regression model receives numbers.
# from sklearn.preprocessing import LabelEncoder

# Columns that need an integer-encoded counterpart
columns_to_encode = [
    "age", "gender", "country", "highest_deg",
    "code_experience", "current_title", "company_size",
]

# Every column gets its own, freshly created LabelEncoder. The integer codes
# are stored next to the original values in a new "<column>_enc" column.
for column in columns_to_encode:
    le = LabelEncoder()
    x[f"{column}_enc"] = le.fit_transform(x[column])

You should now see both the categorical and encoded columns (_enc) in the data frame.

In [None]:
# Each original column should now have a matching <column>_enc column next to it.
x.head()

Print a map of the categorical values and the corresponding encoded values.  Copy the returned data and paste it somewhere you can refer back to it frequently.

In [None]:
# Print, for every categorical column, which encoded integer stands for which category.
# from sklearn.preprocessing import LabelEncoder

columns_to_encode = [
    "age", "gender", "country", "highest_deg",
    "code_experience", "current_title", "company_size",
]

# A single encoder is re-fitted for each column in turn
le = LabelEncoder()

for column in columns_to_encode:
    le.fit(x[column])

    # Pair each entry of classes_ with its position: the resulting dictionary
    # maps encoded value -> original category label (e.g. 0 -> first class).
    mapping = dict(enumerate(le.classes_))

    # Display the mapping
    print(column, "mapping:", mapping)

Review the data frame again. We must drop the non-encoded columns.

In [None]:
# x still holds both the original categorical columns and their _enc counterparts here.
x.head()

Drop the categorical columns. Rename the encoded columns.

In [None]:

# Replace every categorical column with its encoded counterpart.
categorical_columns = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]

# drop the categorical columns
x.drop(columns=categorical_columns, inplace=True)

# rename the _enc columns to the original column names
x.rename(
    columns={f"{name}_enc": name for name in categorical_columns},
    inplace=True,
)


Review the x data frame again to ensure dropping and renaming happened correctly.

In [None]:
# Expect only the seven original column names, now holding the encoded integers.
x.head()

Review the y data frame. Notice the y data is also categorical.

In [None]:
# annual_comp has not been converted yet; it still holds categorical salary ranges.
y.head()

What are the unique categorical values in the annual_comp column?

In [None]:
# distinct salary-range labels present in the target column
y["annual_comp"].unique()

<br>

### <font color="crimson"> This step is super-sketchy. </font> 

I did this so the model would return a specific predicted salary value rather than a categorical salary range.

I create a random integer between the low and high values in the categorical salary range.

In [None]:
# import pandas as pd
# import random

# function to turn a categorical salary range into one integer salary
def calculate_midpoint(salary_range):
    """Convert a salary-range label into a single integer salary.

    Despite its name (kept so existing callers keep working), this function
    does not return the midpoint of the range: it draws a random integer
    between the low and high bounds, both inclusive.

    Args:
        salary_range: Either a "low-high" label, optionally decorated with
            "$" and thousands separators, or an open-ended ">bound" label
            such as ">$1,000,000".

    Returns:
        int: The stated bound for an open-ended ">" label (used as a
        placeholder because no upper limit is known); otherwise a random
        integer N with low <= N <= high.

    Raises:
        ValueError: If the label does not have one of the two shapes above.
    """
    # Strip decoration so only digits and the '-' / '>' markers remain.
    cleaned = salary_range.strip().replace(",", "").replace("$", "")

    if cleaned.startswith(">"):
        # Open-ended top bucket: there is no upper bound to sample from.
        return int(cleaned[1:])

    low, high = cleaned.split("-")
    # randint already returns an int, so no rounding is required.
    return random.randint(int(low), int(high))

# Apply the function to the 'annual_comp' column, replacing every salary-range
# label with an integer. The values are drawn with random.randint, so they
# differ from run to run unless the random generator is seeded.
y["annual_comp"] = y["annual_comp"].apply(calculate_midpoint)

# Display the updated DataFrame
print(y)

Review the data frame. Have the categorical ranges been replaced by random values?

In [None]:
# annual_comp should now hold integers instead of salary-range labels.
y.head()

<br>

## Build the machine learning model

Split the x and y data frames into training and testing data for the model.

In [None]:
# create a train test split
# from sklearn.model_selection import train_test_split
# test_size=0.2 holds out 20% of the rows for evaluation;
# random_state=0 fixes the shuffle so the same split is produced on every run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
# (rows, columns) of the held-out test features
x_test.shape

Train a random forest regression model.

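I'm not a machine learning model person. I'm not entirely sure what n_estimators and max_depth are doing in this part of the code. (For reference: `n_estimators` is the number of decision trees in the forest, and `max_depth` limits how many levels deep each tree may grow.)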

In [None]:
# train a random forest regressor model
# from sklearn.ensemble import RandomForestRegressor
# n_estimators=70 builds the forest from 70 trees; max_depth=7 caps the depth of each tree.
# NOTE(review): no random_state is passed, so repeated fits may produce slightly
# different models -- consider setting one if reproducibility matters.
model = RandomForestRegressor(n_estimators=70, max_depth=7)
# y_train is a single-column data frame; .values.ravel() flattens it to the 1-D target fit() expects
model.fit(x_train, y_train.values.ravel())  # if y_train is a dataframe
# predictions for the held-out rows, used for evaluation in the next section
y_predict = model.predict(x_test)

<br>

## Evaluate the model

Get the mean absolute error and r2 score for the model. 

If you are a machine learning person, continue to tune the model to get the best possible scores.

In [None]:
# from sklearn.metrics import mean_absolute_error, r2_score
# MAE: average absolute difference between actual and predicted salary (lower is better).
print("MAE : ", mean_absolute_error(y_test,y_predict))
# r2: share of the variance in the actual salaries explained by the model (1.0 is a perfect fit).
print("r2 score : ", r2_score(y_test,y_predict))

<br>

## Export the model

When you are happy with your model performance, dump it out to a file.

In [None]:
# save the model
# import joblib
# Writes the fitted model to salary_predict_model.ml, relative to the current working directory.
joblib.dump(model,'salary_predict_model.ml')

<br>

## Test the exported model

To test your model, load the model file you created in the step above.

In [None]:
# load the model we just created
# import joblib
# NOTE(review): joblib files are pickle-based -- only load model files you created or trust.
model = joblib.load('salary_predict_model.ml')

Then run it, passing in the independent variables (the features). These are the mapped numerical labels of the categorical values. You can get the numeric values by looking at the mapping you copied and pasted out to a file a few steps earlier.

To predict a salary, you must pass in the values in the correct order (age, gender, country, highest_deg, code_experience, current_title, company_size).

7: '50-54'  
0: 'Man'  
55: 'United States of America'  
3: 'Master’s degree'  
4: '5-10 years'  
13: 'Teacher / professor'  
2: '1000-9,999 employees'  

The returned value will be different for everyone, because the salary values are randomly generated.

In [None]:
# One row of encoded feature values, in training-column order:
# age, gender, country, highest_deg, code_experience, current_title, company_size
model.predict([[7,0,55,3,4,13,2]]) # predict the salary for a new data point

# generates a warning because we're passing just a plain list, not a pandas dataframe with 
# the same feature names the model was trained on


In [None]:
# to get rid of the warning, we can generate a dataframe with the correct column names

# One-row frame: each key is a training column name, each value the encoded
# category for the person we want a prediction for.
input_data = pd.DataFrame(
    {
        "age": [7],
        "gender": [0],
        "country": [55],
        "highest_deg": [3],
        "code_experience": [4],
        "current_title": [13],
        "company_size": [2],
    }
)


In [None]:
# Show the single-row frame that will be passed to the model.
print(input_data)

In [None]:
# predict() returns one value per input row; input_data has a single row, so element 0 is the prediction
prediction = model.predict(input_data)
# format as dollars with thousands separators and two decimal places
print(f"predicted salary: ${prediction[0]:,.2f}")