# 04 F1

In [2]:
import numpy as np

from sklearn.metrics import confusion_matrix

In [3]:
# Seed NumPy's legacy global RNG so the sampled labels are reproducible.
np.random.seed(42)

# Simulated ground truth: 100 binary labels drawn with P(0) = 0.8 and P(1) = 0.2,
# i.e. an imbalanced problem in which the positive class (1) is the minority.
class_labels = [0, 1]
class_probs = [0.8, 0.2]
y_target = np.random.choice(class_labels, size=100, p=class_probs)
print(y_target)

[0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


In [4]:
# Count how many samples fall in each class to check the intended ~80/20 imbalance.
unique, counts = np.unique(y_target, return_counts=True)

# Convert the NumPy scalars to built-in ints before printing: under NumPy >= 2 the
# repr of a NumPy scalar looks like `np.int64(82)`, which would clutter the dictionary.
print({int(label): int(count) for label, count in zip(unique, counts)})

{0: 82, 1: 18}


In [5]:
# Simulate a classifier that gets 20% of the labels wrong.
# NOTE: the positions are drawn from NumPy's global RNG, so the exact result depends
# on the RNG state left by the cells executed before this one.
n_errors = int(0.2 * len(y_target))
error_indices = np.random.choice(len(y_target), size=n_errors, replace=False)

# Mark the sampled positions and flip only those labels (0 -> 1, 1 -> 0);
# y_target itself is left untouched.
flip_mask = np.zeros(len(y_target), dtype=bool)
flip_mask[error_indices] = True
y_pred = np.where(flip_mask, 1 - y_target, y_target)
print(y_pred)

[0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0
 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1
 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0]


In [6]:
# Rows are the true classes and columns the predicted classes. Passing labels=[0, 1]
# pins the layout to [[TN, FP], [FN, TP]] and guarantees a 2x2 matrix even if one
# of the classes is absent from both y_target and y_pred.
cm = confusion_matrix(y_target, y_pred, labels=[0, 1])
print(cm)

[[65 17]
 [ 3 15]]


In [8]:
# Unpack the 2x2 confusion matrix; with the classes ordered 0 then 1, ravel()
# yields the entries in the order TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

# Precision: share of predicted positives that are truly positive.
# Recall: share of actual positives that were found.
# Each ratio falls back to 0.0 when its denominator is zero (no predicted positives /
# no actual positives) instead of producing NaN together with a RuntimeWarning.
predicted_positives = tp + fp
actual_positives = tp + fn
precision = tp / predicted_positives if predicted_positives > 0 else 0.0
recall = tp / actual_positives if actual_positives > 0 else 0.0

# F1 is the harmonic mean of precision and recall; it is defined as 0.0 when both are 0.
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

TP: 15, TN: 65, FP: 17, FN: 3
Precision: 0.47
Recall: 0.83
F1 Score: 0.60


The F1 score is the harmonic mean of precision and recall, so it balances the two.
Ideally, both precision and recall are high, but in real situations that is rarely the case.

Instead of tracking two separate metrics (precision and recall), we can use the F1 score as a single metric that combines both.
It reflects how well the model balances being exact about its positive predictions (precision) with being comprehensive in finding the actual positives (recall).


```
F1 = 2 * (precision * recall) / (precision + recall)
```

The F1 score ranges from 0 to 1. 
* A value close to 1 means the model performs well, with both high precision and recall.
* A value close to 0 means the model performs poorly on one or both metrics.



For example, in illegal drug detection at a checkpoint:
* Precision: Of the bags you stop, how many actually contain illegal drugs?
* Recall: Of all the bags that contain illegal drugs, how many did you catch?

* If you stop every bag, recall is perfect, but precision is awful (you annoy everyone).
* If you stop only the one bag you are sure of, precision is perfect, but recall is awful (you miss most of the illegal bags).

A high F1 score means you stop the right bags and catch most of the illegal ones: a good balance.