In [19]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

# Load data
# The survey CSV lives at a machine-specific location, so the path can be
# overridden through the SURVEY_CSV_PATH environment variable. The original
# absolute path stays as the default so the existing setup keeps working.
DATA_PATH = os.environ.get(
    "SURVEY_CSV_PATH",
    r"C:\Users\user\OneDrive\Desktop\Project1_SalaryPrediction2020\survey_results_public.csv",
)
df = pd.read_csv(DATA_PATH)


In [20]:

# Select relevant columns
# Keep only the modelling columns, rename the compensation column to "Salary",
# drop incomplete rows, and restrict the data to full-time employees.
df = (
    df[["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedComp"]]
    .rename(columns={"ConvertedComp": "Salary"})
    # dropna() removes rows with a missing value in any column, Salary included.
    .dropna()
    .loc[lambda frame: frame["Employment"] == "Employed full-time"]
    # Employment is constant after the filter above, so it carries no signal.
    .drop(columns="Employment")
)


In [21]:

# Simplify country categories
def shorten_categories(categories, cutoff):
    """Build a mapping that collapses rare categories into "Other".

    Args:
        categories: Mapping-like object exposing ``.items()`` that yields
            ``(category, count)`` pairs (e.g. the result of ``value_counts()``).
        cutoff: Minimum count a category needs in order to keep its own name.

    Returns:
        dict mapping every category to itself when its count is at least
        ``cutoff``, and to the string "Other" otherwise.
    """
    mapping = {}
    for name, freq in categories.items():
        mapping[name] = name if freq >= cutoff else "Other"
    return mapping

# Countries with fewer than 400 responses are relabelled "Other" via the map.
country_map = shorten_categories(df["Country"].value_counts(), 400)
df = df.assign(Country=df["Country"].map(country_map))
# Drop the pooled "Other" rows and keep salaries within [10000, 250000]
# (Series.between is inclusive on both ends by default).
df = df[(df["Country"] != "Other") & df["Salary"].between(10000, 250000)]


In [22]:

# Clean YearsCodePro
def clean_experience(x):
    """Convert a YearsCodePro survey answer into a number.

    The two textual answers are mapped to fixed values:
    "Less than 1 year" becomes 0.5 and "More than 50 years" becomes 50.
    Anything else is passed to ``pd.to_numeric`` with ``errors="coerce"``,
    so values that cannot be parsed come back as NaN.
    """
    if x == "Less than 1 year":
        return 0.5
    if x == "More than 50 years":
        return 50
    return pd.to_numeric(x, errors="coerce")

# Convert the experience answers to numbers, then drop rows left with a
# missing value in any column (including answers that could not be parsed).
df = df.assign(YearsCodePro=df["YearsCodePro"].apply(clean_experience)).dropna()


In [23]:

# Clean EdLevel
def clean_education(x):
    """Collapse a raw EdLevel answer into one of four coarse labels.

    The answer is matched by substring against the rules below; the first
    rule that matches wins. Answers matching no rule are labelled
    "Less than a Bachelors".
    """
    # (substrings to look for, label to return) -- evaluated in order.
    rules = (
        (("Bachelor’s degree",), "Bachelor’s degree"),
        (("Master’s degree",), "Master’s degree"),
        (("Professional degree", "Other doctoral"), "Post grad"),
    )
    for needles, label in rules:
        if any(needle in x for needle in needles):
            return label
    return "Less than a Bachelors"

# Replace each raw education answer with its coarse label.
df = df.assign(EdLevel=df["EdLevel"].apply(clean_education))

In [24]:

# Encode categorical columns
# One encoder per column; both are kept in module scope because they are
# pickled with the model and reused to encode inputs at prediction time.
le_country = LabelEncoder()
le_education = LabelEncoder()
# fit_transform learns the label set and returns the integer code for each row.
df = df.assign(
    Country=le_country.fit_transform(df["Country"]),
    EdLevel=le_education.fit_transform(df["EdLevel"]),
)


In [25]:

# Prepare data
# The target is the salary column; every remaining column is a feature.
y = df["Salary"]
X = df.drop(columns="Salary")


In [26]:

# Train models
# Baseline: linear regression on the encoded features.
linear_reg = LinearRegression()
linear_reg.fit(X, y)
# NOTE: predictions are made on the same X the model was fit on, so this is a
# training-set RMSE rather than a held-out estimate.
linear_train_pred = linear_reg.predict(X)
linear_rmse = np.sqrt(mean_squared_error(y, linear_train_pred))
print("Linear Regression RMSE:", linear_rmse)

Linear Regression RMSE: 39274.75368318509


In [27]:

# Decision tree with a fixed random_state so repeated runs give the same tree.
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y)
# NOTE: evaluated on the training data itself (same X used for fit).
tree_train_pred = dec_tree_reg.predict(X)
tree_rmse = np.sqrt(mean_squared_error(y, tree_train_pred))
print("Decision Tree RMSE:", tree_rmse)


Decision Tree RMSE: 29414.938206831015


In [28]:

# Random forest with a fixed random_state for reproducible results.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y)
# NOTE: evaluated on the training data itself (same X used for fit).
forest_train_pred = random_forest_reg.predict(X)
forest_rmse = np.sqrt(mean_squared_error(y, forest_train_pred))
print("Random Forest RMSE:", forest_rmse)


Random Forest RMSE: 29487.30868635557


In [29]:

# GridSearchCV for Decision Tree
# Search over tree depth only; None lets the tree grow without a depth limit.
params = dict(max_depth=[None, 2, 4, 6, 8, 10, 12])
gs = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=params,
    scoring="neg_mean_squared_error",
)
gs.fit(X, y)
# The best estimator found by the search is the model that gets saved below.
regressor = gs.best_estimator_
# NOTE: this RMSE is computed on the same X, y the search was fit on; it is not
# the cross-validated score of the search.
tuned_train_pred = regressor.predict(X)
tuned_rmse = np.sqrt(mean_squared_error(y, tuned_train_pred))
print("Tuned Decision Tree RMSE:", tuned_rmse)


Tuned Decision Tree RMSE: 30428.508362980854


In [30]:

# Save model and encoders
# Bundle the tuned model with both label encoders so that prediction code can
# encode raw inputs the same way the training data was encoded.
model_data = dict(
    model=regressor,
    le_country=le_country,
    le_education=le_education,
)
with open("saved_steps.pkl", "wb") as out_file:
    pickle.dump(model_data, out_file)


In [31]:

# Load model and predict example
# Security: pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted source (here, the file written by the cell above).
with open("saved_steps.pkl", "rb") as in_file:
    data = pickle.load(in_file)

# Unpack the bundle; le_country / le_education are rebound to the loaded
# encoders, which preprocess_input reads from module scope.
regressor_loaded, le_country, le_education = (
    data[key] for key in ("model", "le_country", "le_education")
)


In [32]:

# Predict sample
def preprocess_input(country, education, years_exp):
    """Turn one raw (country, education, experience) record into model input.

    Uses the module-level ``le_country`` and ``le_education`` encoders to
    convert the two categorical values into their integer codes.

    Args:
        country: Country label to pass to ``le_country.transform``.
        education: Education label to pass to ``le_education.transform``.
        years_exp: Years of professional coding experience.

    Returns:
        A one-row DataFrame with columns "Country", "EdLevel", "YearsCodePro",
        all cast to float.
    """
    features = pd.DataFrame(
        {
            "Country": [country],
            "EdLevel": [education],
            "YearsCodePro": [years_exp],
        }
    )
    features["Country"] = le_country.transform(features["Country"])
    features["EdLevel"] = le_education.transform(features["EdLevel"])
    return features.astype(float)

# Example: predict the salary for one hand-written record.
sample = preprocess_input("United States", "Master’s degree", 15)
prediction = regressor_loaded.predict(sample)
# predict returns one value per input row; show the single result as currency.
print("Predicted Salary:", f"${prediction[0]:,.2f}")

Predicted Salary: $139,427.26
