# Explore the data

https://insights.stackoverflow.com/survey


## Download the data


In [None]:
import urllib.request
import zipfile

# Source: https://insights.stackoverflow.com/survey
url = 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip'

# urlretrieve returns the path of a temporary local copy (not an open file handle).
filehandle, _ = urllib.request.urlretrieve(url)

# Context managers guarantee the archive and its member are closed even if
# reading fails part-way through.
with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
    # Keep a copy on disk at data/survey_results_public.csv, the path the
    # loading cell reads, so the notebook also works from a fresh kernel.
    zip_file_object.extract('survey_results_public.csv', path='data')
    with zip_file_object.open('survey_results_public.csv') as file:
        content = file.read()

# Remove the temporary download left behind by urlretrieve.
urllib.request.urlcleanup()

# Peek at the first bytes of the CSV.
content[0:200]

## Load the CSV into pandas

In [None]:
import pandas as pd

# Show every column, and render floats with five fixed decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.5f' % value

# Read the public survey results from the local data directory.
survey_data = pd.read_csv('data/survey_results_public.csv')

# Preview the last three rows.
survey_data.tail(3)

## Clean the data

In [134]:
# Keep only respondents who reported a yearly compensation.
survey_data = survey_data.dropna(subset=['ConvertedCompYearly'])

# Discard extreme outliers: anything above 400000 per year.
is_outlier = survey_data['ConvertedCompYearly'] > 400000
outlier_rows = survey_data[is_outlier].index
survey_data = survey_data.drop(outlier_rows)

# Sanity-check the remaining distribution.
survey_data[['ConvertedCompYearly']].describe()

Unnamed: 0,ConvertedCompYearly
count,34673.0
mean,82283.28449
std,65903.17731
min,1.0
25%,35904.0
50%,65820.0
75%,111360.0
max,400000.0


## Clean more columns

In [None]:
numeric_features = ['YearsCode', 'YearsCodePro']

# The survey reports the two ends of the experience range as text. Without a
# numeric stand-in, to_numeric(errors='coerce') turns them into NaN and the
# most junior and most senior respondents are silently dropped.
# NOTE(review): 0.5 and 51 are approximations for the open-ended buckets;
# confirm the exact answer strings against the survey schema.
experience_text_to_number = {
    'Less than 1 year': 0.5,
    'More than 50 years': 51,
}

for col_name in numeric_features:
    survey_data[col_name] = pd.to_numeric(
        survey_data[col_name].replace(experience_text_to_number),
        errors='coerce',
    )

# Drop rows where either feature is missing or could not be parsed.
survey_data = survey_data.dropna(subset=numeric_features)

survey_data[numeric_features].describe()

## Map a column to numbers

In [None]:
# Numeric stand-in for each education level (presumably approximate years of
# schooling). Any EdLevel value missing from this table maps to NaN and the
# row is dropped below.
education_level_to_number = {
    'Professional degree (JD, MD, etc.)': 24,
    'Other doctoral degree (Ph.D., Ed.D., etc.)': 22,
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 18,
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 16,
    'Associate degree (A.A., A.S., etc.)': 14,
    'Some college/university study without earning a degree': 13,
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 12,
    'Primary/elementary school': 6,
    'Something else': 0,
}
survey_data['EdLevelNumeric'] = survey_data['EdLevel'].map(education_level_to_number)
survey_data = survey_data.dropna(subset = ["EdLevelNumeric"])

# Register the new column only once, so re-running this cell does not add a
# duplicate entry (which would duplicate the plots in later cells).
if 'EdLevelNumeric' not in numeric_features:
    numeric_features.append('EdLevelNumeric')

survey_data[["EdLevelNumeric"]].describe()

## Visualize the label column

In [None]:
import matplotlib.pyplot as plt

# Histogram of the label (yearly compensation) with its mean and median marked.
label = survey_data['ConvertedCompYearly']
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(label, bins=100)
ax.set_ylabel('Frequency')

# Magenta = mean, cyan = median.
dashed = dict(linestyle='dashed', linewidth=2)
ax.axvline(label.mean(), color='magenta', **dashed)
ax.axvline(label.median(), color='cyan', **dashed)

## Visualize the feature columns

In [None]:
# One histogram per numeric feature, with mean (magenta) and median (cyan) marked.
for col_name in numeric_features:
    fig, ax = plt.subplots(figsize=(6, 4))  # TODO: subplot
    feature = survey_data[col_name]
    feature.hist(bins=100, ax=ax)
    markers = ((feature.mean(), 'magenta'), (feature.median(), 'cyan'))
    for marker_value, marker_color in markers:
        ax.axvline(marker_value, color=marker_color, linestyle='dashed', linewidth=2)
    ax.set_title(col_name)
# TODO: move the shared matplotlib configuration code into a cell above and call it from here

## Categorical features

In [None]:
# Bar chart of how often each value occurs, one figure per categorical feature.
categorical_features = ['Age', 'Gender', 'Trans', 'EdLevel']

for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    # Sort by category label so the bars appear in a stable order.
    counts = survey_data[col].value_counts().sort_index()
    counts.plot.bar(ax=ax, color='steelblue')
    ax.set(title=col + ' counts', xlabel=col, ylabel="Frequency")

## Measure correlations

In [None]:
# Scatter each numeric feature against yearly compensation and show the
# correlation coefficient in the plot title.
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    feature = survey_data[col]
    label = survey_data['ConvertedCompYearly']
    correlation = feature.corr(label)
    ax.scatter(x=feature, y=label)
    ax.set_xlabel(col)
    ax.set_ylabel('Yearly Comp')
    ax.set_title('comp vs ' + col + '- correlation: ' + str(correlation))


# Build a model

## Separate test and train data

In [None]:
# Features: experience and education columns; label: yearly compensation.
# .values returns the underlying arrays for use with scikit-learn.
feature_columns = ['YearsCode', 'YearsCodePro', 'EdLevelNumeric']
X = survey_data[feature_columns].values
y = survey_data['ConvertedCompYearly'].values
print('Features:', X[:10], '\nLabels:', y[:10], sep='\n')

In [None]:
from sklearn.model_selection import train_test_split
# TODO: scikit learn just got a pandas integration, so its easy to pass dfs back and forth

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.30, random_state=0)
X_train, X_test, y_train, y_test = split

row_counts = (X_train.shape[0], X_test.shape[0])
print ('Training Set: %d rows\nTest Set: %d rows' % row_counts)

## Train the model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then show the estimator.
model = LinearRegression()
model = model.fit(X_train, y_train)
print(model)

## Evaluate model on test data

In [None]:
import numpy as np

# Turn off scientific notation so the two rows are easy to compare by eye.
np.set_printoptions(suppress=True)

# Predict on the held-out rows and show the first ten next to the true values.
predictions = model.predict(X_test)
first_predicted = np.round(predictions)[:10]
first_actual = y_test[:10]
print('Predicted labels: ', first_predicted)
print('Actual labels   : ', first_actual)

## Visualize the predictions

In [None]:
# Predicted vs. actual compensation for the test split.
plt.scatter(y_test, predictions)
plt.title('Yearly Comp Predictions')
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')

# Overlay a degree-1 least-squares fit of the predictions against the actuals.
trend = np.poly1d(np.polyfit(y_test, predictions, 1))
plt.plot(y_test, trend(y_test), color='magenta')

## Calculate evaluation metrics

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error, its square root, and the R2 score on the test split.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("MSE:", mse)

print("RMSE:", rmse)

print("R2:", r2) # TODO: be able to explain better. residuals squared. higher is better.

# Experiment with more models

* **Linear algorithms**: Not just the Linear Regression algorithm we used above (which is technically an Ordinary Least Squares algorithm), but other variants such as Lasso and Ridge.
* **Tree-based algorithms**: Algorithms that build a decision tree to reach a prediction.
* **Ensemble algorithms**: Algorithms that combine the outputs of multiple base algorithms to improve generalizability.

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

## Generalize the evaluation process

In [None]:
def evaluate_model(estimator=None, features=None, labels=None):
    """Print regression metrics and plot predicted vs. actual values.

    Args:
        estimator: Fitted regressor exposing ``predict``. Defaults to the
            notebook-level ``model``.
        features: Feature matrix to predict on. Defaults to the
            notebook-level ``X_test``.
        labels: True target values for ``features``. Defaults to the
            notebook-level ``y_test``.

    Returns:
        None. MSE, RMSE and R2 are printed, and a scatter plot with a fitted
        trend line is drawn on the current matplotlib figure.

    Calling it with no arguments keeps the original behaviour of evaluating
    the current global ``model`` on the global test split.
    """
    # Resolve the fallbacks at call time, so the most recently trained model
    # and the most recent split are the ones evaluated.
    if estimator is None:
        estimator = model
    if features is None:
        features = X_test
    if labels is None:
        labels = y_test

    # Evaluate the model using the test data
    predictions = estimator.predict(features)
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)
    print("MSE:", mse, " RMSE:", rmse, " R2:", r2, )

    # Plot predicted vs actual
    plt.scatter(labels, predictions)
    plt.xlabel('Actual Labels')
    plt.ylabel('Predicted Labels')
    plt.title('Yearly Comp Predictions')
    # Overlay the regression line (degree-1 fit of predictions against actuals)
    z = np.polyfit(labels, predictions, 1)
    p = np.poly1d(z)
    plt.plot(labels, p(labels), color='magenta')

## Lasso (linear regression)

Lasso works well when only a few features predict the label.

https://scikit-learn.org/stable/modules/linear_model.html#lasso

In [None]:
from sklearn.linear_model import Lasso

# Train a Lasso regressor with default settings on the training split.
model = Lasso()
model = model.fit(X_train, y_train)

evaluate_model()

## Decision tree

Decision trees can be used for both regression and classification problems.

https://scikit-learn.org/stable/modules/tree.html#

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

# Seed the tree so repeated runs build the same model, matching the
# random_state=0 already used for the train/test split.
model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# Visualize the model tree, labelling the splits with the columns X was built
# from instead of the default feature_0, feature_1, ...
tree = export_text(model, feature_names=['YearsCode', 'YearsCodePro', 'EdLevelNumeric'])
print(tree)
# TODO: graphviz trees: https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html

## Decision tree (evaluation)

In [None]:
# Score the decision tree fitted in the previous cell on the held-out test split.
evaluate_model()

## Random forest (ensemble)

Applies an averaging function to multiple Decision Tree models to produce a better overall model.

https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples, random tie-breaking), so
# seed it; otherwise the metrics change on every run.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

evaluate_model() # TODO: Add to a pandas dataframe to show our progress so far

## Gradient tree boosting

https://scikit-learn.org/stable/modules/ensemble.html#gradient-tree-boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed the booster so the run is repeatable, consistent with the seeded
# train/test split.
model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

evaluate_model()

## Improve model

* Tune hyperparameters
* Preprocess data

https://learn.microsoft.com/en-us/training/modules/train-evaluate-regression-models/6-improve-models
    

## Tune hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Use a Gradient Boosting algorithm (seeded so the search is repeatable)
alg = GradientBoostingRegressor(random_state=0)

# Try these hyperparameter values
params = {
 'learning_rate': [0.1, 0.5, 1.0],
 'n_estimators' : [50, 100, 150]
 }

# Find the best hyperparameter combination to optimize the R2 metric,
# scoring each combination with 3-fold cross-validation on the training set
score = make_scorer(r2_score)
gridsearch = GridSearchCV(alg, params, scoring=score, cv=3, return_train_score=True)
gridsearch.fit(X_train, y_train)
print("Best parameter combination:", gridsearch.best_params_, "\n")

# Get the best model. With the default refit=True, GridSearchCV retrains the
# winning combination on the whole training set, so best_estimator_ is a
# fitted model that is ready to evaluate.
model = gridsearch.best_estimator_
print(model, "\n")

## Evaluate tuned model

In [None]:
# Score the best estimator chosen by the grid search on the held-out test split.
evaluate_model()

## Pre-processing pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.base import clone

# Separate features and labels (now including the raw categorical columns)
X, y = survey_data[['YearsCode','YearsCodePro', 'EdLevel', 'MainBranch', 'Country']].values, survey_data['ConvertedCompYearly'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Column positions within X. These get their own names on purpose: rebinding
# numeric_features / categorical_features to integer lists would break the
# earlier plotting cells (which index survey_data by column name) on re-run.
numeric_feature_idx = [0, 1]
categorical_feature_idx = [2, 3, 4]

# Define preprocessing for numeric columns (scale them)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Define preprocessing for categorical features (one-hot encode them;
# categories unseen during training are ignored at prediction time)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feature_idx),
        ('cat', categorical_transformer, categorical_feature_idx)])

# Regress with a fresh, unfitted copy of the tuned gradient boosting model.
# Taking it from gridsearch rather than from `model` keeps this cell
# re-runnable: `model` is rebound to the pipeline below, so a second run
# would otherwise nest the previous pipeline inside the new one.
regressor = clone(gridsearch.best_estimator_)

# Create preprocessing and training pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])

# Fit the pipeline (preprocessing + tuned regressor) on the training set
model = pipeline.fit(X_train, y_train)

## Evaluate the pipeline model

In [None]:
# Score the fitted preprocessing pipeline on the re-split test data.
# NOTE(review): the ceiling visible in the plot presumably comes from the
# 400000 outlier cut-off applied while cleaning ConvertedCompYearly - confirm.
evaluate_model() # TODO: why the cap

## Store the model

In [None]:
import joblib

import os

# Save the model as a pickle file
filename = './function/yearly-comp.pkl'
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)

# TODO: HDF5 - compression format - what is most performant for pickling/unpickling?
# Caffe and TensorFlow files are other types of model files
# or pickle? ONNX
# TODO: read scikit-learn's page on model persistence

## Use the stored model

In [None]:
# Load the model from the file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files you created yourself.
loaded_model = joblib.load(filename)

# Create a numpy array containing one new respondent, in the column order used
# for training: YearsCode, YearsCodePro, EdLevel, MainBranch, Country.
# dtype=object keeps the two numbers numeric; without it NumPy coerces the
# whole row to strings. The blank country matches no known category, so the
# one-hot encoder ignores it (handle_unknown='ignore').
X_new = np.array([[25, 15, 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)','I am a developer by profession', ' ']], dtype=object)
#X_new = np.array([[8, 6, 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)','I am a developer by profession', 'United States of America']], dtype=object)
print ('New sample: {}'.format(list(X_new[0])))

# Use the model to predict the yearly compensation for the new sample
result = loaded_model.predict(X_new)
print('Prediction: ${:.0f}'.format(np.round(result[0])))

In [None]:
# Build a constant-style name (spaces to underscores, upper-cased) for every
# country present in the data.
uniques = []
# dropna(): a missing country is NaN (a float), which has no .replace method.
for country in survey_data['Country'].dropna().unique():
    var_name = country.replace(' ', '_').upper() # todo: slugify
    # Keep the generated name; previously it was computed and thrown away.
    uniques.append(var_name)
    #print(var_name + ' = "' + country + '"')

survey_data['MainBranch'].unique()
# TODO: generate enums.py and save it