In [19]:
import pandas as pd
from itertools import product

In [20]:
def list_then_eliminate(train: pd.DataFrame, outcome: str, Hall: pd.DataFrame) -> pd.DataFrame:
    """Filter a hypothesis table down to the rows consistent with the training data.

    Each row of ``Hall`` is treated as one hypothesis. A hypothesis is
    eliminated when it agrees with a training example on every non-outcome
    column but carries a different value in the ``outcome`` column.

    Parameters
    ----------
    train : pd.DataFrame
        Training examples; must contain the ``outcome`` column.
    outcome : str
        Name of the outcome (label) column.
    Hall : pd.DataFrame
        Candidate hypotheses; must contain every column of ``train``.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        The surviving rows of ``Hall``, keeping their original index labels.
    """
    # Version space, initially with all possibilities
    version_space = Hall.copy()

    # Every training column except the outcome is an attribute to match on
    attributes = [column for column in train.columns if column != outcome]

    # For each training example x...
    for _, x in train.iterrows():

        # ...remove from version_space any non-compatible hypothesis.
        # Values are compared directly instead of through a string-built
        # query, so string outcomes, numeric attributes and values that
        # contain quote characters are all handled correctly.
        contradicts = version_space[outcome] != x[outcome]
        for column in attributes:
            contradicts &= version_space[column] == x[column]
        version_space = version_space[~contradicts]

    return version_space


In [21]:
# Load the training examples from CSV; the frame is displayed below.
df = pd.read_csv("../data/play_sport.csv")

df

Unnamed: 0,sky,airtemp,humidity,wind,water,forecast,enjoysport
0,sunny,warm,normal,strong,warm,same,True
1,sunny,warm,high,strong,warm,same,True
2,rainy,cold,high,strong,warm,change,False
3,sunny,warm,high,strong,cool,change,True


In [22]:
# Complete hypothesis space given training data (compacted)
attrs = df.keys()
H = {key: df[key].unique() for key in attrs}
H

{'sky': array(['sunny', 'rainy'], dtype=object),
 'airtemp': array(['warm', 'cold'], dtype=object),
 'humidity': array(['normal', 'high'], dtype=object),
 'wind': array(['strong'], dtype=object),
 'water': array(['warm', 'cool'], dtype=object),
 'forecast': array(['same', 'change'], dtype=object),
 'enjoysport': array([ True, False])}

In [23]:
# Complete hypothesis space given training data (enumerated)
combinations = list(product(*H.values()))
Hall = pd.DataFrame(combinations, columns=H.keys())

Hall

Unnamed: 0,sky,airtemp,humidity,wind,water,forecast,enjoysport
0,sunny,warm,normal,strong,warm,same,True
1,sunny,warm,normal,strong,warm,same,False
2,sunny,warm,normal,strong,warm,change,True
3,sunny,warm,normal,strong,warm,change,False
4,sunny,warm,normal,strong,cool,same,True
...,...,...,...,...,...,...,...
59,rainy,cold,high,strong,warm,change,False
60,rainy,cold,high,strong,cool,same,True
61,rainy,cold,high,strong,cool,same,False
62,rainy,cold,high,strong,cool,change,True


In [24]:
# Run the elimination over the enumerated hypotheses and report how many
# survive compared with the starting count.
version_space = list_then_eliminate(train=df, outcome='enjoysport', Hall=Hall)

print(f"Number of original hypothesis   : {len(Hall)}")
print(f"Number of consistent hypothesis : {len(version_space)}")

version_space

Number of original hypothesis   : 64
Number of consistent hypothesis : 60


Unnamed: 0,sky,airtemp,humidity,wind,water,forecast,enjoysport
0,sunny,warm,normal,strong,warm,same,True
2,sunny,warm,normal,strong,warm,change,True
3,sunny,warm,normal,strong,warm,change,False
4,sunny,warm,normal,strong,cool,same,True
5,sunny,warm,normal,strong,cool,same,False
6,sunny,warm,normal,strong,cool,change,True
7,sunny,warm,normal,strong,cool,change,False
8,sunny,warm,high,strong,warm,same,True
10,sunny,warm,high,strong,warm,change,True
11,sunny,warm,high,strong,warm,change,False
