In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Select Data for Modelling

In [4]:
# Path to the cleaned salary dataset (relative to the notebook's working directory).
clean_data = "Clean_Salary_Data.csv"
# Load the dataset into a DataFrame used by every modelling cell below.
model_data = pd.read_csv(clean_data)
# Show the available columns. NOTE(review): the 'Unnamed: 0' column in the output
# looks like a saved index column -- confirm whether it should be dropped.
model_data.columns

Index(['Unnamed: 0', 'Age', 'Gender', 'Education Level', 'Job Title',
       'Years of Experience', 'Salary', 'Age Group'],
      dtype='object')

In [5]:
### Prediction target (response variable): the 'Salary' column
y = model_data["Salary"]

In [6]:
### Predictor variables ("features") used by the first set of models
features = ["Age", "Years of Experience"]
X = model_data.loc[:, features]


## Classification or Regression
### Since salary is a continuous variable, predicting it with a classification model such as KNN is not the best choice. Instead, for predicting continuous variables like salary, we will consider regression models such as Random Forest Regressor, Decision Tree and Linear Regression. 

## Decision Tree Model

##### Since a model's practical value comes from making predictions on new data, we measure performance on data that wasn't used to build the model. The most straightforward way to do this is to exclude some data from the model-building process, and then use it to test the model's accuracy on data it hasn't seen before. This data is called validation data.

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and testing/validation data, for both features and target.
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seed the tree too: scikit-learn permutes the features at every split, so an
# unseeded tree can break ties differently and report different metrics per run.
salary_model = DecisionTreeRegressor(random_state=0)
salary_model.fit(train_X, train_y)

# Get predicted salaries on validation data and report the error metrics.
val_prediction = salary_model.predict(val_X)
# Mean absolute error
print(mean_absolute_error(val_y, val_prediction))
# Root mean squared error; computed with np.sqrt because the `squared=False`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
print(np.sqrt(mean_squared_error(val_y, val_prediction)))
# R^2 (coefficient of determination) on the validation data
print(salary_model.score(val_X, val_y))

18834.51563587788
25638.358627236033
0.7624243769300277


#### The mean absolute error on the in-sample data (before splitting) was about 15,000 dollars, with an R² score of about 0.83. Out-of-sample, the MAE is about 18,800 dollars and the R² score is about 0.76. (`score` on a regressor returns R², not classification accuracy.)

## Tuning max_leaf_nodes to balance overfitting and underfitting

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to `max_leaf_nodes` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on (train_X, train_y) and
    its predictions for val_X are compared against val_y with mean absolute error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    fitted_tree = tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, fitted_tree.predict(val_X))

In [None]:
# Sweep several tree sizes and report the validation MAE for each one.
candidate_leaf_counts = [8, 80, 800, 8000]
for max_leaf_nodes in candidate_leaf_counts:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the MAE to an integer for display
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 8  		 Mean Absolute Error:  19792
Max leaf nodes: 80  		 Mean Absolute Error:  18538
Max leaf nodes: 800  		 Mean Absolute Error:  18849
Max leaf nodes: 8000  		 Mean Absolute Error:  18849


### Using a Random Forest Model because of the Decision Tree's shortcomings

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Fit a random forest on the same train/validation split used for the decision tree.
# random_state is fixed so the ensemble is reproducible between runs.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)

forest_pred = forest_model.predict(val_X)
# Mean absolute error on the validation data
print(mean_absolute_error(val_y, forest_pred))
# Root mean squared error; computed with np.sqrt because the `squared=False`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
print(np.sqrt(mean_squared_error(val_y, forest_pred)))


18452.857811067504
24802.707967354236


In [None]:
# For a regressor, .score() returns R^2 (coefficient of determination), not accuracy.
print(forest_model.score(val_X, val_y))

0.7776589561141818


#### We can observe that the MAE using the Decision Tree was about 18,800, and the MAE using the Random Forest was about 18,450: a difference of close to 400. The Random Forest's R² score is about 0.78 (this is R², not classification accuracy).

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ordinary least squares on the same training split; fit() returns the estimator itself.
reg_model = LinearRegression().fit(train_X, train_y)

# Validation predictions and their mean absolute error
reg_pred = reg_model.predict(val_X)
reg_mae = mean_absolute_error(val_y, reg_pred)
print(reg_mae)

23036.127101861824


In [None]:
# R^2 (coefficient of determination) of the linear model on the validation data
print(reg_model.score(val_X, val_y))
# Mean absolute error
print(mean_absolute_error(val_y, reg_pred))
# Root mean squared error; computed with np.sqrt because the `squared=False`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
print(np.sqrt(mean_squared_error(val_y, reg_pred)))

0.7098435025632872
23036.127101861824
28333.839515751166


###### R² score ≈ 0.71 (this is R², not classification accuracy)

### Cross-Validation for Linear Regression

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# 6-fold cross-validation with shuffling; random_state fixes the fold assignment.
kf = KFold(n_splits=6, shuffle=True, random_state=42)
# Use a separate, unfitted estimator so the already-fitted `reg_model` from the
# previous section is not overwritten (cross_val_score fits clones internally).
cv_reg_model = LinearRegression()
cv_results = cross_val_score(cv_reg_model, X, y, cv=kf)
# The reported score is R squared (the default scorer for regressors).
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
# 2.5% / 97.5% empirical quantiles of the fold scores. With only 6 folds this is
# a rough spread of the scores, not a formal 95% confidence interval.
print(np.quantile(cv_results, [0.025, 0.975]))

[0.60547886 0.70841765 0.65889353 0.6584414  0.71095099 0.66926213]
0.6685740942612282 0.03549924044166575
[0.61209918 0.71063432]


### Regularized Regression to handle overfitting

In [None]:
# Ridge is a linear model, so it lives in sklearn.linear_model
# (importing it from sklearn.model_selection raises ImportError).
from sklearn.linear_model import Ridge

### Quick supervised-learning pipeline generated by GPT - to be read and reviewed 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Assumes the DataFrame 'model_data' holds the feature columns below and the target 'Salary'.
# NOTE: this cell rebinds X, y, train_X, val_X, train_y and val_y, so earlier cells
# re-run afterwards will see this wider feature set and this 70/30 split.

# Separate features and target variable
X = model_data[["Age", "Gender", "Years of Experience", "Education Level", "Job Title"]]
y = model_data["Salary"]

# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.3, random_state=42)

# Categorical columns are one-hot encoded
categorical_cols = ["Gender", "Education Level", "Job Title"]

# Numerical columns are passed through unchanged
numerical_cols = ["Age", "Years of Experience"]

# Categories seen only in the validation data are encoded as all zeros instead of raising
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Define the models
models = [
    ("Random Forest", RandomForestRegressor(random_state=1)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=1)),
    ("Linear Regression", LinearRegression()),
    ("Support Vector Regression", SVR(kernel="linear")),
    # Add more models as needed
]

# Train and evaluate each model
for model_name, model in models:
    # Preprocessing and model are chained so the encoder is fitted on training data only
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    model_pipeline.fit(train_X, train_y)

    # Evaluate with the root mean squared error. It is computed with np.sqrt because
    # the `squared=False` argument of mean_squared_error is deprecated/removed in
    # recent scikit-learn, and it is labelled as RMSE (not MSE) in the output.
    rmse = np.sqrt(mean_squared_error(val_y, model_pipeline.predict(val_X)))
    print(f"{model_name} - Root Mean Squared Error: {rmse}")


Random Forest - Mean Squared Error: 16707.24190263947
Gradient Boosting - Mean Squared Error: 18145.477537377123
Linear Regression - Mean Squared Error: 21422.44886478472
Support Vector Regression - Mean Squared Error: 32482.074817681434
