# Threshold Adjustments

In [4]:
import pandas as pd

# Load the per-player performance table; `target_5y` is the label column.
data = pd.read_csv(filepath_or_buffer='data/player_performance.csv')

# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


ℹ️ Each observation represents a player and each column a characteristic of performance. The target `target_5y` defines whether the player has had a professional career of less than 5 years [0] or 5 years or more [1].

# Preprocessing

In [8]:
from sklearn.preprocessing import RobustScaler

# Instantiate the scaler
scaler = RobustScaler()

# Drop the label column, then fit the scaler on the remaining columns and
# transform them in a single step.
# NOTE(review): the CSV's `Unnamed: 0` column is not dropped and is therefore
# scaled as a feature — confirm this is intended.
X_scaled = scaler.fit_transform(
    data.drop(columns=['target_5y'])
)

# Base modeling

🎯 Goal: Detect players who will last 5 years minimum as professionals, with a 90% guarantee (i.e. a precision of at least 0.9 on the players we recommend).

👇 Use cross-validation to see if a default Logistic Regression model is going to satisfy the coach's requirements.

In [9]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Cross-validate a logistic regression over 10 folds, scoring each fold on precision
log_cv_results = cross_validate(
    estimator=LogisticRegression(max_iter=1000),
    X=X_scaled,
    y=data['target_5y'],
    cv=10,
    scoring=['precision'],
)

# Average the per-fold precision scores into a single baseline figure
fold_precisions = log_cv_results['test_precision']
base_score = fold_precisions.mean()

base_score

np.float64(0.737761327524343)

# Threshold adjustment

- Make cross validated probability predictions with cross_val_predict

- Plug the probabilities into precision_recall_curve to generate precision scores at different thresholds

- Find out which threshold guarantees a precision of 0.9

In [11]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve

# Cross-validated class probabilities (no `cv` is passed, so cross_val_predict's
# default splitting applies). The result is transposed so that the probabilities
# of class 0 and class 1 unpack into two separate arrays.
# NOTE(review): the baseline cell uses LogisticRegression(max_iter=1000) with cv=10,
# whereas this cell uses the defaults — confirm the difference is intended.
y_pred_probas_0, y_pred_probas_1 = cross_val_predict(LogisticRegression(),
                                                     X_scaled, data['target_5y'],
                                                     method = "predict_proba").T

# Generate precision and thresholds (and recalls) using probabilities for class 1
precision, recall, thresholds = precision_recall_curve(data['target_5y'], y_pred_probas_1)

# Pair each threshold with its precision. `precision` carries one trailing entry
# that has no matching threshold, hence the [:-1].
df_precision = pd.DataFrame({"precision" : precision[:-1], "threshold" : thresholds})

print(df_precision)

# Find the lowest threshold whose precision meets the target.
target_precision = 0.9
eligible_thresholds = df_precision.loc[df_precision['precision'] >= target_precision, 'threshold']

# Without this guard, an empty selection would make .min() return NaN, and every
# later `probability vs. threshold` comparison would silently evaluate to False.
if eligible_thresholds.empty:
    raise ValueError(
        f"No decision threshold reaches a precision of {target_precision}"
    )

new_threshold = eligible_thresholds.min()

new_threshold

      precision  threshold
0      0.621988   0.043474
1      0.622457   0.071458
2      0.622926   0.071467
3      0.623396   0.081053
4      0.623112   0.095543
...         ...        ...
1294   1.000000   0.987141
1295   1.000000   0.987451
1296   1.000000   0.987759
1297   1.000000   0.993227
1298   1.000000   0.996932

[1299 rows x 2 columns]


np.float64(0.8666918410449386)

# Using the new threshold

In [13]:
new_player = pd.read_csv("data/ML_New_player.csv")
new_player

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers
0,80,31.4,14.3,5.9,11.1,52.5,0.0,0.1,11.1,2.6,3.9,65.4,3.0,5.0,8.0,2.4,1.1,0.8,2.2


❓ Would you risk recommending the player to the coach?

In [16]:
# Reuse the scaler that was fitted on the training features so the new player
# is transformed with exactly the same parameters
new_player_scaled = scaler.transform(new_player)


# Instantiate the classifier and train it on the full scaled feature set
model = LogisticRegression().fit(X_scaled, data['target_5y'])

# Define custom predict function
def custom_predict(X, custom_threshold, estimator=None):
    """Classify samples as class 1 using a custom probability threshold.

    Parameters
    ----------
    X : array-like
        Samples to classify, already scaled like the training features.
    custom_threshold : float
        Minimum class-1 probability required to predict class 1.
    estimator : object, optional
        Fitted classifier exposing ``predict_proba``. When omitted, the
        notebook-level ``model`` is used.

    Returns
    -------
    Boolean array with one entry per sample: True where the class-1
    probability is greater than or equal to ``custom_threshold``.
    """
    if estimator is None:
        estimator = model  # fall back to the model fitted earlier in the notebook

    probs = estimator.predict_proba(X)  # one column of probabilities per class
    five_year_probs = probs[:, 1]  # only keep probabilities of class [1]

    # Inclusive comparison: precision_recall_curve reports the precision of
    # predictions with score >= threshold, so a sample sitting exactly on the
    # threshold belongs to the positive side.
    return five_year_probs >= custom_threshold
    
# Apply the tuned threshold to the single new player and keep its (only) prediction
custom_prediction = custom_predict(X=new_player_scaled, custom_threshold=new_threshold)[0]
print(custom_prediction)

# Derive the answer from the prediction instead of hard-coding it, so it stays
# correct when the data, model or threshold change.
# NOTE(review): "not recommend" is an assumed label for the negative case —
# adjust it if a different string is expected downstream.
recommendation = "recommend" if custom_prediction else "not recommend"

[[0.05435128 0.94564872]]
[0.94564872]
True
