<a href="https://colab.research.google.com/github/pdomins/bayesian-learning/blob/master/ej1_bayes_brits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd

In [16]:
# Load the preferences dataset from the Excel workbook and display it.
df = pd.read_excel("2-PreferenciasBritanicos(TP1).xlsx")
df

Unnamed: 0,scones,cerveza,wiskey,avena,futbol,Nacionalidad
0,0,0,1,1,1,I
1,1,0,1,1,0,I
2,1,1,0,0,1,I
3,1,1,0,0,0,I
4,0,1,0,0,1,I
5,0,0,0,1,0,I
6,1,0,0,1,1,E
7,1,1,0,0,1,E
8,1,1,1,1,0,E
9,1,1,0,1,0,E


### Generic Naive Bayes

In [17]:
def calculate_conditional_probs(df, attributes, out_label, laplace_smoothing=False):
    """Estimate P(attribute = value | class) from the training data.

    Args:
        df: Training DataFrame holding the attribute columns and the class
            column ``out_label``.
        attributes: Iterable of column names to treat as features.
        out_label: Name of the column that holds the class label.
        laplace_smoothing: When True, use the add-one estimate
            ``(count + 1) / (class_size + k)``, where ``k`` is the number of
            distinct values the attribute takes in ``df``. Otherwise use the
            relative frequency of the value within the class.

    Returns:
        Nested dict ``{attribute: {(value, class_label): probability}}`` with
        one entry for every value observed in ``df[attribute]`` combined with
        every label observed in ``df[out_label]``.
    """
    class_labels = df[out_label].unique()
    conditional_probs = {}
    for attribute in attributes:
        # The attribute's domain is taken over the whole training set, so a
        # value that never occurs within a class still gets an entry.
        values = df[attribute].unique()
        k = len(values)
        attribute_probs = {}
        for characteristic in class_labels:
            class_column = df.loc[df[out_label] == characteristic, attribute]
            if laplace_smoothing:
                counts = class_column.value_counts()
                total_samples = len(class_column)
                for value in values:
                    attribute_probs[(value, characteristic)] = (counts.get(value, 0) + 1) / (total_samples + k)
            else:
                # Computed once per (attribute, class) instead of once per value.
                frequencies = class_column.value_counts(normalize=True)
                for value in values:
                    # 0.0 keeps unseen values the same numeric type as the rest.
                    attribute_probs[(value, characteristic)] = frequencies.get(value, 0.0)
        conditional_probs[attribute] = attribute_probs

    return conditional_probs

def get_probabilities_multiplication(df, attributes, out_label, out_label_probs, output_df, conditional_probs):
    """Compute the unnormalized Naive Bayes score of every class for each example.

    For each row of ``output_df`` and each label found in ``df[out_label]``,
    the class prior ``out_label_probs[label]`` is multiplied by
    ``conditional_probs[column][(value, label)]`` for every column of the row.

    Note: ``attributes`` is accepted but not read; the columns of
    ``output_df`` determine which conditional probabilities are used.

    Returns:
        Dict keyed by the row index of ``output_df``. Each entry maps every
        class label to its score and, under the extra key ``'max'``, holds the
        label with the highest score.
    """
    def class_score(example, label):
        # Prior times the product of the per-column conditional probabilities.
        score = out_label_probs[label]
        for column, observed in example.items():
            score *= conditional_probs[column][(observed, label)]
        return score

    results = {}
    for row_id, example in output_df.iterrows():
        scores = {label: class_score(example, label) for label in df[out_label].unique()}
        # The winner is chosen before 'max' is added, so only labels compete.
        scores['max'] = max(scores, key=scores.get)
        results[row_id] = scores

    return results

def get_prediction(df, output_df, laplace_smoothing=False):
    """Fit Naive Bayes on ``df`` and score every row of ``output_df``.

    The last column of ``df`` is taken as the class label and all preceding
    columns as attributes. Class priors are the relative frequencies of the
    label column.

    Returns:
        The per-example score dict produced by
        ``get_probabilities_multiplication``.
    """
    columns = df.columns.tolist()
    out_label = columns[-1]
    attributes = columns[:-1]
    out_label_probs = df[out_label].value_counts(normalize=True)
    conditional_probs = calculate_conditional_probs(df, attributes, out_label, laplace_smoothing)
    return get_probabilities_multiplication(
        df, attributes, out_label, out_label_probs, output_df, conditional_probs
    )

In [18]:
# Two query individuals, one per row, described by the same binary features
# as the training data.
data = {
    "scones": [1, 0],
    "cerveza": [0, 1],
    "wiskey": [1, 1],
    "avena": [1, 0],
    "futbol": [0, 1],
}

output_df = pd.DataFrame(data)

predictions = get_prediction(df, output_df, laplace_smoothing=True)

for row_id, scores in predictions.items():
    print(f"Example {row_id}: Predicted Nationality: {scores['max']}")

Example 0: Predicted Nationality: E
Example 1: Predicted Nationality: I


### Naive Bayes

In [19]:
# Prior probability of each nationality: its relative frequency in the dataset.
nationality_probs = df["Nacionalidad"].value_counts(normalize=True)
# Feature columns iterated by the hand-written Naive Bayes cells that follow.
tastes = ["scones", "cerveza", "wiskey", "avena", "futbol"]

In [20]:
# Conditional probabilities WITHOUT Laplace smoothing:
# P(taste = value | nationality) is the relative frequency of the value
# among the rows of that nationality (0 when the value never occurs there).
conditional_probs = {}

for taste in tastes:
    per_taste = conditional_probs.setdefault(taste, {})
    for nationality in df["Nacionalidad"].unique():
        taste_column = df.loc[df["Nacionalidad"] == nationality, taste]
        frequencies = taste_column.value_counts(normalize=True)
        per_taste.update(
            {(value, nationality): frequencies.get(value, 0) for value in df[taste].unique()}
        )

conditional_probs

{'scones': {(0, 'I'): 0.5, (1, 'I'): 0.5, (0, 'E'): 0, (1, 'E'): 1.0},
 'cerveza': {(0, 'I'): 0.5,
  (1, 'I'): 0.5,
  (0, 'E'): 0.42857142857142855,
  (1, 'E'): 0.5714285714285714},
 'wiskey': {(1, 'I'): 0.3333333333333333,
  (0, 'I'): 0.6666666666666666,
  (1, 'E'): 0.42857142857142855,
  (0, 'E'): 0.5714285714285714},
 'avena': {(1, 'I'): 0.5,
  (0, 'I'): 0.5,
  (1, 'E'): 0.7142857142857143,
  (0, 'E'): 0.2857142857142857},
 'futbol': {(1, 'I'): 0.5,
  (0, 'I'): 0.5,
  (1, 'E'): 0.42857142857142855,
  (0, 'E'): 0.5714285714285714}}

In [21]:
# Conditional probabilities WITH Laplace (add-one) smoothing:
# P(taste = value | nationality) = (count + 1) / (rows of that nationality + k),
# where k is the number of distinct values the taste takes in the whole dataset.
laplace_conditional_probs = {}

for taste in tastes:
    taste_values = df[taste].unique()
    smoothed = {}
    for nationality in df["Nacionalidad"].unique():
        taste_column = df.loc[df["Nacionalidad"] == nationality, taste]
        counts = taste_column.value_counts()
        denominator = len(taste_column) + len(taste_values)
        for value in taste_values:
            smoothed[(value, nationality)] = (counts.get(value, 0) + 1) / denominator
    laplace_conditional_probs[taste] = smoothed

laplace_conditional_probs

{'scones': {(0, 'I'): 0.5,
  (1, 'I'): 0.5,
  (0, 'E'): 0.1111111111111111,
  (1, 'E'): 0.8888888888888888},
 'cerveza': {(0, 'I'): 0.5,
  (1, 'I'): 0.5,
  (0, 'E'): 0.4444444444444444,
  (1, 'E'): 0.5555555555555556},
 'wiskey': {(1, 'I'): 0.375,
  (0, 'I'): 0.625,
  (1, 'E'): 0.4444444444444444,
  (0, 'E'): 0.5555555555555556},
 'avena': {(1, 'I'): 0.5,
  (0, 'I'): 0.5,
  (1, 'E'): 0.6666666666666666,
  (0, 'E'): 0.3333333333333333},
 'futbol': {(1, 'I'): 0.5,
  (0, 'I'): 0.5,
  (1, 'E'): 0.4444444444444444,
  (0, 'E'): 0.5555555555555556}}

In [30]:
# Two individuals to classify, each described by their binary preferences.
example1 = {
    "scones": 1,
    "cerveza": 0,
    "wiskey": 1,
    "avena": 1,
    "futbol": 0,
}
example2 = {
    "scones": 0,
    "cerveza": 1,
    "wiskey": 1,
    "avena": 0,
    "futbol": 1,
}

for ex in (example1, example2):
    # Unnormalized score per nationality: prior times the product of the
    # conditional probabilities of each observed taste.
    probabilities = {}
    for nationality in df["Nacionalidad"].unique():
        score = nationality_probs[nationality]
        for taste in tastes:
            # Use conditional_probs instead of laplace_conditional_probs here
            # to get the probabilities without Laplace smoothing.
            score *= laplace_conditional_probs[taste][(ex[taste], nationality)]
        probabilities[nationality] = score

    print(probabilities)
    # Normalize so that the scores of all nationalities add up to 1.
    normalizer = sum(probabilities.values())
    for label, score in probabilities.items():
        print(f"{label} {score/normalizer}")
    print("Predicted Nationality:", max(probabilities, key=probabilities.get))

{'I': 0.010817307692307692, 'E': 0.03501655079158508}
I 0.2360112818367491
E 0.7639887181632509
Predicted Nationality: E
{'I': 0.010817307692307692, 'E': 0.0021885344244740672}
I 0.8317268190077328
E 0.1682731809922671
Predicted Nationality: I


### Classification Error

In [23]:
def get_single_prediction(df, output_df : pd.Series, laplace_smoothing=False):
    """Score one example, given as a Series, with the model fitted on ``df``.

    The Series is turned into a one-row DataFrame (its index becomes the
    columns) and handed to ``get_prediction``, whose result is returned as is.
    """
    single_row_frame = output_df.to_frame().T
    return get_prediction(df, single_row_frame, laplace_smoothing)

In [24]:
def get_single_prediction_label(df, output_df : pd.Series, laplace_smoothing=False):
    """Return only the predicted label (the ``"max"`` entry) for one example."""
    per_example_scores = get_single_prediction(df, output_df, laplace_smoothing)
    # Only one example was scored, so the first entry is the only one.
    first_scores = next(iter(per_example_scores.values()))
    return first_scores["max"]

In [25]:
from error_functions import compute_classification_error
from df_utils        import get_column_value_dict

ModuleNotFoundError: ignored

In [None]:
# Feature-only view of the dataset: the "Nacionalidad" label column is left out.
train_df = df[["scones", "cerveza", "wiskey", "avena", "futbol"]]
train_df

In [None]:
# presumably maps each row of df to its "Nacionalidad" value; get_column_value_dict
# is imported from df_utils, which is not shown here — TODO confirm
train_df_label_dict = get_column_value_dict(df, "Nacionalidad")

In [None]:
# The predictor passed here fits on the full df (no Laplace smoothing, the default)
# and is applied to train_df, whose rows come from that same df.
# NOTE(review): compute_classification_error comes from error_functions (not shown);
# presumably it compares each predicted label against train_df_label_dict — confirm.
compute_classification_error(train_df, train_df_label_dict, lambda s : get_single_prediction_label(df, s))

### Cross Validation

In [None]:
from error_functions import k_fold_cross_validation

In [None]:
# 3-fold cross validation using "Nacionalidad" as the label column.
# NOTE(review): the predictor lambda closes over the full df, so unless
# k_fold_cross_validation (from error_functions, not shown) re-fits on each
# training split internally, the test rows are also used for fitting — confirm.
k_splits = k_fold_cross_validation(df, "Nacionalidad", lambda s : get_single_prediction_label(df, s), 3)

In [None]:
# Report, for every split, the train and test frames with their errors.
for split_number, k_split in enumerate(k_splits, start=1):
    print(f"Split #{split_number}")
    train, test = k_split["train"], k_split["test"]
    display(train["df"])
    print("Train error:", train["err"])
    display(test["df"])
    print("Test error:", test["err"])
    print("\n------------------------------------------------\n")