# Part B

In [1]:
import pandas as pd
import numpy as np
from preprocessing import run_pipeline_1, run_pipeline_2
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split


## Exercise 4
What is the simplest baseline model we should aim to beat? In other words: if you
had to guess the salary without knowing anything about the football
player, what would you guess? What is the MAE of such a guess?

In [2]:
# Read the raw wage table and run it through preprocessing pipeline 2.
# The pipeline call is unpacked into three values: the processed features,
# the target, and the preprocessor object.
wages_csv_path = '../football_wages.csv'
df = pd.read_csv(wages_csv_path)
X_processed, y, preprocessor = run_pipeline_2(df)

In [3]:
# Baseline: ignore every feature and predict one constant for all players,
# namely the mean of the target.
# NOTE(review): the constant that minimises MAE is the median, not the mean;
# the mean is kept here as the chosen "simplest guess".
y_mean = y.mean()

# Per-sample absolute error of the constant guess. Despite its name this is not
# yet the MAE -- the MAE is the *mean* of these values, computed below.
mae_baseline = np.absolute(y - y_mean)

baseline_score = mae_baseline.mean()
print(baseline_score)



0.49142986701139596


## Exercise 5

Train the KNN and SGD Regressor with default hyperparameters and fairly estimate
their performance for both preprocessing pipelines. Explain why the performance
estimate is fair and how you estimated the performance.


In [4]:

def evaluate_default_regressors(features, target, test_size=0.2, seed=42):
    """Return the hold-out MAE of a default KNN and a default SGD regressor.

    The data is split once; both models are fitted on the training part only
    and scored on the held-out part, so neither model sees the rows it is
    evaluated on.

    Parameters
    ----------
    features : array-like
        Preprocessed feature matrix.
    target : array-like
        Target values aligned with ``features``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    seed : int, default 42
        Seed for the train/test split and for SGD's data shuffling, so the
        reported numbers are identical on every run.

    Returns
    -------
    dict
        Maps ``"KNN"`` and ``"SGD"`` to the mean absolute error on the
        held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=seed
    )
    regressors = {
        "KNN": KNeighborsRegressor(),
        # random_state only pins the shuffling order used during training;
        # every hyperparameter stays at its default value.
        "SGD": SGDRegressor(random_state=seed),
    }

    scores = {}
    for label, regressor in regressors.items():
        regressor.fit(X_train, y_train)
        abs_errors = np.abs(y_test - regressor.predict(X_test))
        scores[label] = abs_errors.mean()
    return scores


# NOTE(review): both pipelines receive the full dataframe before the split.
# If they fit scalers/encoders on it, information from the held-out rows leaks
# into preprocessing -- verify in preprocessing.py.
X_pipeline_1, y_pipeline_1, preprocessor_pipeline_1 = run_pipeline_1(df)
X_pipeline_2, y_pipeline_2, preprocessor_pipeline_2 = run_pipeline_2(df)

pipeline_runs = [
    ("Results for pipeline 1:", X_pipeline_1, y_pipeline_1),
    ("\nResults for pipeline 2:", X_pipeline_2, y_pipeline_2),
]
for header, pipeline_features, pipeline_target in pipeline_runs:
    print(header)
    pipeline_scores = evaluate_default_regressors(pipeline_features, pipeline_target)
    for model_name, model_mae in pipeline_scores.items():
        print(f'MAE {model_name}: {model_mae}')

Results for pipeline 1:
MAE KNN: 0.2737150948248205
MAE SGD: 0.26130151301954724

Results for pipeline 2:
MAE KNN: 0.280225415740014
MAE SGD: 0.27776175774783707


## Exercise 6
Which pipeline performed the best? Use this pipeline for the next exercises.

### Answer
Pipeline 1 performs better: it reaches a lower test MAE for both KNN and SGD. However, in Exercise 7 we ran into problems because the nationalities in the autograder test set differ from those in the training set. To keep the two sets compatible, the countries must be mapped to continents, which is what pipeline 2 does. We therefore continue with pipeline 2.

## Exercise 7
Submit your work to the autograder to check your work so far.

In [10]:
# Fit both default regressors on the complete labelled dataset and write their
# predictions for the autograder set to text files, one value per line.

# Load and preprocess the autograder set.
test_set = pd.read_csv('../football_autograder.csv')
# NOTE(review): the autograder rows go through their own run_pipeline_2 call.
# If that call fits a fresh preprocessor, the test features may be scaled or
# encoded differently from the training features -- consider reusing
# preprocessor_pipeline_2 instead; verify in preprocessing.py.
X_test, y_test, preprocessor = run_pipeline_2(test_set)

# Use every labelled row for training; no hold-out is needed for the submission.
X_pipeline_2, y_pipeline_2, preprocessor_pipeline_2 = run_pipeline_2(df)
X_train = X_pipeline_2
y_train = y_pipeline_2

# Fit the KNN and SGD regressors.
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train, y_train)

# random_state pins SGD's data shuffling so that re-running the cell
# regenerates the same submission; hyperparameters stay at their defaults.
sgd_regressor = SGDRegressor(random_state=42)
sgd_regressor.fit(X_train, y_train)

# Predict on the autograder set.
knn_pred = knn_regressor.predict(X_test)
SGD_pred = sgd_regressor.predict(X_test)

# Save each model's predictions to its own submission file.
submissions = {
    '../autograder_submission_KNN.txt': knn_pred,
    '../autograder_submission_SGD.txt': SGD_pred,
}
for submission_path, predictions in submissions.items():
    np.savetxt(submission_path, predictions, fmt='%f')
    print(f"Predictions saved to {submission_path}")


Predictions saved to ../autograder_submission_KNN.txt
Predictions saved to ../autograder_submission_SGD.txt
