In [None]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
import itertools


class Node:
    """A discrete variable (node) of the Bayesian network.

    Attributes:
        name: Name of the data column that holds this variable's values.
        parents: List of parent ``Node`` objects, or ``None`` for a root node.
        states: List of the values this variable can take.
        probabilities_table: Table attached through
            ``set_probabilities_table``; ``None`` until one is attached.
    """

    def __init__(self, name, parents, states):
        self.name = name
        self.parents = parents
        self.states = states
        # Defined up front so that reading the attribute before a table is
        # attached yields None instead of raising AttributeError.
        self.probabilities_table = None

    def set_probabilities_table(self, probabilities_table):
        """Attach (or replace) the probability table of this node."""
        self.probabilities_table = probabilities_table

def get_child_probabilities_table(data_df, parents_node_names, child_node_name):
    """Estimate P(child | parents) from observed frequencies.

    The previous implementation divided every joint count by the *number of
    distinct groups*, which is not a probability. Counts are now normalised
    within each parent configuration so that, for fixed parent values, the
    returned probabilities sum to 1.

    Args:
        data_df: DataFrame with one column per variable.
        parents_node_names: Sequence of parent column names (may be empty).
        child_node_name: Name of the child column.

    Returns:
        A Series indexed by ``parents_node_names + [child_node_name]`` holding
        the relative frequency of each child value given the parent values.
        With no parents this is the marginal distribution of the child.
        Only combinations observed in ``data_df`` appear (no smoothing).
    """
    parent_columns = list(parents_node_names)
    joint_counts = data_df.groupby(parent_columns + [child_node_name]).size()

    if not parent_columns:
        # Root variable: normalise by the total number of counted rows.
        return joint_counts / joint_counts.sum()

    # Total rows per parent configuration, broadcast back onto the joint index.
    parent_totals = joint_counts.groupby(level=parent_columns).transform("sum")
    return joint_counts / parent_totals

def categorize_column(data_df, column_name, categories):
    """Bucket a numeric column into labelled ranges, in place.

    A new column named ``column_name + ' category'`` is written on
    ``data_df``. Each entry of ``categories`` is a dict with a ``"floor"``
    and, for every entry except the last, a ``"ceiling"``. Rows whose value
    lies in ``[floor, ceiling)`` receive the bucket position as a *string*
    (``"0"``, ``"1"``, ...). The last bucket is open-ended: only its
    ``"floor"`` is read. Rows matching no bucket are not assigned a label.

    Args:
        data_df: DataFrame to annotate (mutated in place).
        column_name: Name of the numeric column to bucket.
        categories: Non-empty list of bucket dicts, ordered by position.

    Returns:
        The same ``data_df`` object, for call chaining.
    """
    label_column = column_name + ' category'
    bounded_buckets = categories[:-1]

    for label, bucket in enumerate(bounded_buckets):
        at_or_above_floor = data_df[column_name] >= bucket["floor"]
        below_ceiling = data_df[column_name] < bucket["ceiling"]
        data_df.loc[at_or_above_floor & below_ceiling, label_column] = str(label)

    # The final bucket has no upper bound.
    open_bucket = categories[-1]
    in_open_bucket = data_df[column_name] >= open_bucket["floor"]
    data_df.loc[in_open_bucket, label_column] = str(len(categories) - 1)

    return data_df


def get_simple_probabilities_table(data_df, node):
    """Build the Laplace-smoothed table P(node | parent) for a single-parent node.

    Every (parent state, node state) pair from the cartesian product of the
    declared states gets a row, even if it never occurs in the data; its
    probability is ``(count + 1) / (parent_count + len(node.states))``.

    Args:
        data_df: DataFrame containing the parent column and the node column.
        node: Node-like object exposing ``name``, ``states`` and ``parents``;
            only ``node.parents[0]`` is used.

    Returns:
        DataFrame with columns ``[parent name, node name, "frequency"]``,
        one row per state combination, node state varying fastest.
    """
    parent = node.parents[0]
    counts_df = (
        data_df.groupby([parent.name, node.name])
        .size()
        .reset_index(name="appearances")
    )
    # The node column may hold its labels as strings ("0", "1", ...) while
    # node.states are ints, so cast before comparing against the states.
    counts_df[node.name] = counts_df[node.name].astype(int)

    # Index the observed counts once; unseen combinations default to 0 below
    # instead of relying on a bare `except` to mask failed lookups.
    joint_counts = {}
    parent_totals = {}
    for parent_state, node_state, appearances in counts_df.itertuples(index=False, name=None):
        pair = (parent_state, node_state)
        joint_counts[pair] = joint_counts.get(pair, 0) + appearances
        parent_totals[parent_state] = parent_totals.get(parent_state, 0) + appearances

    n_states = len(node.states)
    # The node state is the last factor of the product so it ends up last in each row.
    rows = [
        [
            parent_state,
            node_state,
            (joint_counts.get((parent_state, node_state), 0) + 1)
            / (parent_totals.get(parent_state, 0) + n_states),
        ]
        for parent_state, node_state in itertools.product(parent.states, node.states)
    ]

    # Build the frame in one go rather than appending row by row.
    return pd.DataFrame(rows, columns=[parent.name, node.name, "frequency"])

def get_3parents_probabilities_table(data_df, node):
    """Build the Laplace-smoothed table P(node | parents) for a multi-parent node.

    Written for three parents originally; it now works for any number of
    parents (the three-parent output is unchanged). Every combination from
    the cartesian product of the declared parent states and node states gets
    a row, with probability
    ``(count + 1) / (parents_count + len(node.states))``.

    Args:
        data_df: DataFrame containing one column per parent and the node column.
        node: Node-like object exposing ``name``, ``states`` and ``parents``
            (each parent exposing ``name`` and ``states``).

    Returns:
        DataFrame with columns ``[*parent names, node name, "frequency"]``,
        one row per state combination, node state varying fastest.
    """
    parent_names = [parent.name for parent in node.parents]
    key_columns = parent_names + [node.name]

    counts_df = data_df.groupby(key_columns).size().reset_index(name="appearances")
    # Columns may hold their labels as strings ("0", "1", ...) while the
    # declared states are ints, so cast every key column before comparing.
    for column in key_columns:
        counts_df[column] = counts_df[column].astype(int)

    # Index the observed counts once; unseen combinations default to 0 below
    # instead of relying on a bare `except` to mask failed lookups.
    joint_counts = {}
    parent_totals = {}
    for *key, appearances in counts_df.itertuples(index=False, name=None):
        full_key = tuple(key)
        parents_key = full_key[:-1]
        joint_counts[full_key] = joint_counts.get(full_key, 0) + appearances
        parent_totals[parents_key] = parent_totals.get(parents_key, 0) + appearances

    n_states = len(node.states)
    parents_states = [parent.states for parent in node.parents]
    # The node state is the last factor of the product so it ends up last in each row.
    rows = [
        list(state)
        + [(joint_counts.get(state, 0) + 1) / (parent_totals.get(state[:-1], 0) + n_states)]
        for state in itertools.product(*parents_states, node.states)
    ]

    # Build the frame in one go rather than appending row by row.
    return pd.DataFrame(rows, columns=key_columns + ["frequency"])



In [None]:
# Load the admissions data set and build the Bayesian network:
#   rank -> gre category, rank -> gpa category,
#   (rank, gre category, gpa category) -> admit
data_df = pd.read_csv("./binary.csv", header=0)

# Root case: RANK
ROOT_CATEGORY = [1,2,3,4]
ranks_count_df = data_df.groupby(["rank"]).size().reset_index(name="appearances")
# Align the observed counts with ROOT_CATEGORY in one vectorised step; a rank
# that never occurs in the data gets frequency 0 instead of raising IndexError.
rank_counts = (
    ranks_count_df.set_index("rank")["appearances"]
    .reindex(ROOT_CATEGORY, fill_value=0)
)
ranks_frequency = pd.DataFrame({
    "rank": ROOT_CATEGORY,
    "frequency": (rank_counts / len(data_df)).to_numpy(),
})

root_node = Node("rank", None, ROOT_CATEGORY)
root_node.set_probabilities_table(ranks_frequency)

# Case 1: GRE, split into [0, 500) -> "0" and [500, inf) -> "1"
GRE_CATEGORY =[{
    "floor": 0,
    "ceiling": 500
},{
    "floor": 500,
}]
data_df = categorize_column(data_df, "gre", GRE_CATEGORY)
gre_node = Node("gre category", [root_node], list(range(len(GRE_CATEGORY))))

gre_frequency = get_simple_probabilities_table(data_df, gre_node)
gre_node.set_probabilities_table(gre_frequency)

# Case 2: GPA, split into [0, 3) -> "0" and [3, inf) -> "1"
GPA_CATEGORY =[{
    "floor": 0,
    "ceiling": 3
},{
    "floor": 3,
}]
data_df = categorize_column(data_df, "gpa", GPA_CATEGORY)
gpa_node = Node("gpa category", [root_node], list(range(len(GPA_CATEGORY))))
gpa_frequency = get_simple_probabilities_table(data_df, gpa_node)
gpa_node.set_probabilities_table(gpa_frequency)

# Case 3: ADMIT, conditioned on rank, GRE category and GPA category
admit_node = Node("admit", [root_node, gre_node, gpa_node], [0,1])
admit_frequency = get_3parents_probabilities_table(data_df, admit_node)
admit_node.set_probabilities_table(admit_frequency)


In [None]:
def _frequency(table, conditions):
    """Return the "frequency" of the first row of `table` matching all `conditions`.

    `conditions` maps column name -> required value. The lookup is positional
    (`.iloc[0]`), so it does not depend on the index labels of the matching
    row. Raises IndexError when no row matches.
    """
    mask = pd.Series(True, index=table.index)
    for column, value in conditions.items():
        mask &= table[column] == value
    return table.loc[mask, "frequency"].iloc[0]


# a) Compute the probability that a person coming from a rank-1 school was
#    NOT admitted to the university.
rank_1_probability = _frequency(root_node.probabilities_table, {"rank": 1})

numerator = 0
for gpa_state in gpa_node.states:
    for gre_state in gre_node.states:
        # Factorisation of the joint probability over the network:
        # P(admit, gpa, gre, rank) = P(admit | rank, gre, gpa) * P(gpa | rank) * P(gre | rank) * P(rank)
        numerator += (
            _frequency(admit_node.probabilities_table, {
                "rank": 1,
                "gpa category": gpa_state,
                "gre category": gre_state,
                "admit": 0,
            })
            * _frequency(gpa_node.probabilities_table, {"rank": 1, "gpa category": gpa_state})
            * _frequency(gre_node.probabilities_table, {"rank": 1, "gre category": gre_state})
            * rank_1_probability
        )

# P(rank = 1); looked up positionally rather than through index label 0.
denominator = rank_1_probability

print("prob admitido´nt: ", numerator/denominator)

# b) Compute the probability that a person coming from a rank-2 school, with
#    GPA category 1 and GRE category 0, was admitted to the university.
#    This is a direct read of the admit node's conditional table.
prob_admitted = _frequency(admit_node.probabilities_table, {
    "rank": 2,
    "gpa category": 1,
    "gre category": 0,
    "admit": 1,
})
print("prob admitido: ", prob_admitted)