<a href="https://colab.research.google.com/github/lizofeta/uni-git/blob/main/%D0%98%D1%82%D0%BE%D0%B3%D0%BE%D0%B2%D1%8B%D0%B9%D0%9F%D1%80%D0%BE%D0%B5%D0%BA%D1%82_%D0%9A%D0%BE%D0%BD%D0%BE%D0%B2%D0%B0%D0%BB%D0%BE%D0%B2%D0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
import pandas as pd

In [90]:
link = r'https://drive.usercontent.google.com/download?id=1TUwsyfVjl15gsf3YsXEgyCgiJYBDfXCS&export=download&authuser=0'
data = pd.read_csv(link)
data.columns = ['test', 'age', 'status']

In [91]:
data.head()

Unnamed: 0,test,age,status
0,Positive,Young,Infected
1,Positive,Young,Infected
2,Positive,Young,Infected
3,Positive,Old,Infected
4,Positive,Old,Infected


In [92]:
data['status'].value_counts() / data.shape[0] * 100

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Infected,51.079137
Not_infected,48.920863


In [98]:
data.shape[0]

278

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [100]:
X = data[['test', 'age']]
y = data['status']

In [101]:
le_status = LabelEncoder()
y_encoded = le_status.fit_transform(y)
print(f'Encoded y: {list(le_status.classes_)} --> {le_status.transform(le_status.classes_)}')


categorical_features = ['test', 'age']
preprocessor = ColumnTransformer(
    transformers=[
        ('ct', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)


Encoded y: ['Infected', 'Not_infected'] --> [0 1]


In [102]:
model = Pipeline([
    ('preprocessing', preprocessor),
    ('clf', BernoulliNB(alpha=0.5))
])

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y, random_state=42, shuffle=True)

In [104]:
X_train_processed = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out(categorical_features)
print(pd.DataFrame(X_train_processed, columns=feature_names).head())

   ct__test_Negative  ct__test_Positive  ct__age_Old  ct__age_Young
0                1.0                0.0          1.0            0.0
1                1.0                0.0          0.0            1.0
2                1.0                0.0          1.0            0.0
3                0.0                1.0          1.0            0.0
4                0.0                1.0          1.0            0.0


In [105]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [106]:
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision: {precision_score(y_test, y_pred):.3f}')
print(f'Recall: {recall_score(y_test, y_pred):.3f}')
print(f'F1-score: {f1_score(y_test, y_pred):.3f}')

Accuracy: 0.786
Precision: 0.742
Recall: 0.852
F1-score: 0.793


In [107]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [108]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    model, X_train, y_train, cv=cv, scoring='f1'
)
print(f'CV F1 mean ± std: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}')

CV F1 mean ± std: 0.703 ± 0.068


In [109]:
alphas = [0.000000001, 0.0000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5, 10]
f1 = {}

for alpha in alphas:

  model = Pipeline([
      ('preprocessor', preprocessor),
      ('clf', BernoulliNB(alpha=alpha))
  ])

  scores = cross_val_score(
      model, X_train, y_train, cv=cv, scoring='f1'
  )
  mean_f1 = scores.mean()
  std_f1 = scores.std()
  f1[alpha] = [mean_f1, std_f1]

best_alpha = -1
best_f1 = -1
for alpha, (mean, std) in f1.items():
  if mean > best_f1:
    best_alpha = alpha
    best_f1 = mean

print(f'Best alpha: {best_alpha}, best f1: {best_f1:.3f}')

Best alpha: 1e-09, best f1: 0.703
