In [2]:
import induceC45
import classify
import pandas as pd
import numpy as np

from typing import List, Any, Optional, Tuple

In [72]:
# Load the heart table. File lines 1 and 2 (0-indexed, i.e. the two lines
# directly after the header) are skipped instead of being parsed as rows.
data = pd.read_csv('data/heart.csv', header=0, skiprows=[1, 2])

# Separate the class label from the predictor columns.
C = data['HeartDisease']
D = data.drop(columns='HeartDisease')

# Map every predictor name to the distinct values observed in its column.
A = {column: list(D[column].unique()) for column in D.columns}

In [73]:
# Show the dtype and non-null count of every predictor column.
D.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 79.0+ KB


AttributeError: module 'numpy' has no attribute 'numerictypes'

In [49]:
def dataset_selection(D: pd.DataFrame, A: List[str], C: pd.Series,
                      num_attrs: int, num_obs: int) -> Tuple[pd.DataFrame, pd.Series]:
    """Draw one training sample: a random attribute subset and resampled rows.

    Args:
        D: Predictor columns, one row per observation.
        A: Candidate attribute names. Any iterable of column names works; when a
            mapping keyed by attribute name is passed, only its keys are used.
        C: Class labels, aligned to ``D`` by index.
        num_attrs: Number of attributes to pick (sampled without replacement).
        num_obs: Number of rows to draw (sampled with replacement).

    Returns:
        A ``(predictors, labels)`` pair. ``predictors`` holds the ``num_obs``
        sampled rows restricted to the chosen attributes; ``labels`` is the
        matching label Series, named ``'class'``. Both keep the index labels of
        the rows that were drawn, so labels may repeat.

    Raises:
        ValueError: Raised by pandas when ``num_attrs`` exceeds the number of
            candidate attributes.
    """
    # list(A) yields the keys of a mapping and the items of a list. Feeding a
    # dict straight into pd.Series would sample its *values* instead of the
    # attribute names.
    A_rand = pd.Series(list(A)).sample(n=num_attrs).to_list()

    # Work on a private copy of just the chosen columns so the caller's frame
    # is never touched and the full table is not re-labelled on every draw.
    DC = D[A_rand].copy()
    DC['class'] = C

    # Sampling predictors and label together keeps each row paired with its label.
    DC_rand = DC.sample(n=num_obs, replace=True)
    return DC_rand[A_rand], DC_rand['class']

In [50]:
# Try the sampler once: 5 randomly chosen attributes, 100 rows drawn with replacement.
dataset_selection(D, A, C, 5, 100)

(     gill-color stalk-root gill-attachment ring-type cap-color
 6050          b          ?               f         e         n
 770           w          c               f         p         w
 1295          k          e               f         e         g
 325           k          c               f         p         w
 4286          p          b               f         l         g
 ...         ...        ...             ...       ...       ...
 5859          w          ?               f         e         b
 4249          p          b               f         l         g
 2694          p          b               f         p         g
 8022          b          ?               f         e         e
 5871          b          ?               f         e         n
 
 [100 rows x 5 columns],
 6050    p
 770     e
 1295    e
 325     e
 4286    p
        ..
 5859    e
 4249    p
 2694    e
 8022    p
 5871    p
 Name: class, Length: 100, dtype: object)

In [51]:
def random_forest(D: pd.DataFrame, A: List[str], C: pd.Series,
                  num_attrs: int, num_obs: int, num_trees: int,
                  threshold: float, gratio: bool = False) -> List[dict]:
    """Grow ``num_trees`` trees, each on its own random draw of the data.

    Args:
        D: Predictor columns.
        A: Candidate attribute names handed to ``dataset_selection``.
        C: Class labels for the rows of ``D``.
        num_attrs: Attributes sampled per tree.
        num_obs: Rows sampled per tree.
        num_trees: Number of trees to build.
        threshold: Passed unchanged to ``induceC45.C45``.
        gratio: Passed unchanged to ``induceC45.C45`` as ``gratio``.

    Returns:
        The trees returned by ``induceC45.C45``, in the order they were built.
    """
    def grow_one():
        # Every tree gets a fresh attribute subset and a fresh row sample.
        sample_X, sample_y = dataset_selection(D, A, C, num_attrs, num_obs)
        return induceC45.C45(sample_X, list(sample_X.columns), sample_y,
                             threshold, gratio=gratio)

    return [grow_one() for _ in range(num_trees)]

In [52]:
def rf_predict(df: pd.DataFrame, trees: List[dict]) -> pd.DataFrame:
    """Classify every row of ``df`` by majority vote over ``trees``.

    Each tree labels each row through ``classify.search_tree``; the most common
    label per row becomes the prediction. When several labels tie, the first
    entry returned by ``Series.mode`` is used.

    Args:
        df: Rows to classify. A ``'pred'`` column is added to this frame in
            place (an existing ``'pred'`` column is overwritten).
        trees: Trees to consult, e.g. the list returned by ``random_forest``.

    Returns:
        The same ``df`` object, now carrying the ``'pred'`` column.

    Raises:
        ValueError: If ``trees`` is empty, since there is nothing to vote with.
    """
    if not trees:
        raise ValueError("rf_predict needs at least one tree to vote")

    # Classify the rows of the frame that was passed in. Reading a notebook
    # global here instead would score the wrong rows whenever a caller hands
    # in a different frame, such as a cross-validation fold.
    votes = pd.DataFrame({
        idx: df.apply(lambda row, tree=tree: classify.search_tree(row, tree), axis=1)
        for idx, tree in enumerate(trees)
    })

    def majority(row):
        winners = row.mode()
        # mode() ignores missing votes. If no tree produced a label for this
        # row, return None instead of failing on an empty result.
        # NOTE(review): whether search_tree can return a missing value is not
        # visible here - confirm against classify.search_tree.
        return winners.iloc[0] if len(winners) else None

    df['pred'] = votes.apply(majority, axis=1)
    return df

In [53]:
# Grow a 5-tree forest: each tree gets 5 random attributes and 100 resampled rows,
# and 0.5 is passed through as the induction threshold.
trees = random_forest(D, A, C, 5, 100, 5, 0.5)
trees

[{'dataset': '', 'leaf': {'decision': 'p', 'p': 0.54}},
 {'dataset': '',
  'node': {'var': 'odor',
   'edges': [{'edge': {'value': 'f', 'leaf': {'decision': 'p', 'p': 1}}},
    {'edge': {'value': 's', 'leaf': {'decision': 'p', 'p': 1}}},
    {'edge': {'value': 'n',
      'leaf': {'decision': 'e', 'p': 0.9761904761904762}}},
    {'edge': {'value': 'y', 'leaf': {'decision': 'p', 'p': 1}}},
    {'edge': {'value': 'l', 'leaf': {'decision': 'e', 'p': 1}}},
    {'edge': {'value': 'c', 'leaf': {'decision': 'p', 'p': 1}}},
    {'edge': {'value': 'a', 'leaf': {'decision': 'e', 'p': 1}}},
    {'edge': {'value': 'm', 'leaf': {'decision': 'p', 'p': 1}}}]}},
 {'dataset': '', 'leaf': {'decision': 'e', 'p': 0.52}},
 {'dataset': '', 'leaf': {'decision': 'e', 'p': 0.51}},
 {'dataset': '', 'leaf': {'decision': 'e', 'p': 0.51}}]

In [19]:
# Inspect the first tree of the forest.
trees[0]

{'dataset': '',
 'node': {'var': 'health',
  'edges': [{'edge': {'value': 'priority',
     'leaf': {'decision': 'spec_prior', 'p': 0.6153846153846154}}},
   {'edge': {'value': 'not_recom', 'leaf': {'decision': 'not_recom', 'p': 1}}},
   {'edge': {'value': 'recommended',
     'leaf': {'decision': 'priority', 'p': 0.6}}}]}}

In [55]:
# Predict with the forest; rf_predict adds a 'pred' column to the frame it is given.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# recorded run was stopped before it finished.
pred = rf_predict(data, trees)
pred

KeyboardInterrupt: 

In [33]:
# Pass the 'class' (truth) and 'pred' (prediction) columns of `pred` to classify.report.
# NOTE(review): the frame loaded at the top of this notebook keeps its label in
# 'HeartDisease' - confirm that `pred` really has a 'class' column before running.
classify.report(pred, 'class', 'pred')

{'df':           parents   has_nurs      form children     housing     finance  \
 0           usual     proper  complete        1  convenient  convenient   
 1           usual     proper  complete        1  convenient  convenient   
 2           usual     proper  complete        1  convenient  convenient   
 3           usual     proper  complete        1  convenient  convenient   
 4           usual     proper  complete        1  convenient  convenient   
 ...           ...        ...       ...      ...         ...         ...   
 12955  great_pret  very_crit    foster     more    critical      inconv   
 12956  great_pret  very_crit    foster     more    critical      inconv   
 12957  great_pret  very_crit    foster     more    critical      inconv   
 12958  great_pret  very_crit    foster     more    critical      inconv   
 12959  great_pret  very_crit    foster     more    critical      inconv   
 
               social       health       class        pred  
 0            nonpr

In [None]:
def rf_report(splits: List[pd.DataFrame], classvar: str,
              num_attrs: int, num_obs: int, num_trees: int,
              threshold: float, gratio: bool = False,
              restrict: Optional[List[int]] = None) -> dict:
    """Cross-validate a random forest over pre-made folds.

    Each fold is held out once: a forest is grown on the remaining folds, the
    held-out fold is predicted, and ``classify.report`` is run on it.

    Args:
        splits: The folds; every frame contains the predictors and ``classvar``.
        classvar: Name of the class-label column.
        num_attrs: Attributes sampled per tree.
        num_obs: Rows sampled per tree.
        num_trees: Trees per forest.
        threshold: Forwarded to ``random_forest``.
        gratio: Forwarded to ``random_forest``.
        restrict: Optional 0/1 flags, one per predictor column in column order;
            only columns flagged ``1`` are offered to the forest.

    Returns:
        The ``classify.report`` result for the predictions of all folds
        combined, with an extra ``'avg_accuracy'`` entry holding the mean of
        the per-fold ``'accuracy'`` values.
    """
    fold_preds = []
    fold_reports = []

    for held_out, fold in enumerate(splits):
        # The held-out fold is the test set; .loc[:] hands rf_predict its own
        # frame object to extend with the 'pred' column.
        test = fold.loc[:]

        # Everything except the held-out fold forms the training data.
        train = pd.concat(splits[:held_out] + splits[held_out + 1:])

        features = train.drop(classvar, axis=1)
        labels = train[classvar]

        attrs = list(features.columns)
        if restrict is not None:
            attrs = [name for pos, name in enumerate(attrs) if restrict[pos] == 1]

        forest = random_forest(features, attrs, labels,
                               num_attrs, num_obs, num_trees,
                               threshold, gratio=gratio)

        scored = rf_predict(test, forest)
        fold_preds.append(scored)
        fold_reports.append(classify.report(scored, classvar, 'pred'))

    all_preds = pd.concat(fold_preds)
    per_fold = pd.DataFrame(fold_reports)
    cv_report = classify.report(all_preds, classvar, 'pred')
    cv_report['avg_accuracy'] = per_fold['accuracy'].mean()
    return cv_report

In [44]:
# Hand-written tree with a single split on `odor`; every edge ends in a leaf.
# Note: there is no edge for the odor value 'p', which does appear in the
# value_counts output recorded further down this notebook.
tree = {
    "dataset": "",
    "node": {
        "var": "odor",
        "edges": [
            {"edge": {"value": "n", "leaf": {"decision": "e", "p": 1}}},
            {"edge": {"value": "f", "leaf": {"decision": "p", "p": 1}}},
            {"edge": {"value": "a", "leaf": {"decision": "e", "p": 1}}},
            {"edge": {"value": "y", "leaf": {"decision": "p", "p": 1}}},
            {"edge": {"value": "l", "leaf": {"decision": "e", "p": 1}}},
            {"edge": {"value": "c", "leaf": {"decision": "p", "p": 1}}},
            {"edge": {"value": "s", "leaf": {"decision": "p", "p": 1}}},
            {"edge": {"value": "m", "leaf": {"decision": "p", "p": 1}}},
        ],
    },
}


In [56]:
# Display the tree dictionary.
tree

{'dataset': '',
 'node': {'var': 'odor',
  'edges': [{'edge': {'value': 'n', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'f', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'a', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'y', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'l', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'c', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 's', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'm', 'leaf': {'decision': 'p', 'p': 1}}}]}}

In [58]:
# Count how often each odor value occurs in `data`.
data['odor'].value_counts()

odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64

In [57]:
# Run classify.predict with the single tree over `data`.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# recorded run was stopped before it finished.
classify.predict(data, tree)

KeyboardInterrupt: 

In [13]:
# Load the crx table. "?" entries are parsed as missing values (NaN), and file
# lines 1 and 2 (0-indexed, i.e. the two lines directly after the header) are skipped.
crx = pd.read_csv('../data/crx.data.csv', skiprows=[1, 2], header=0, na_values="?")
crx

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [26]:
# `df` is an alias of `crx`, not a copy: every fill below lands in `crx` itself.
df = crx

# Split the columns by dtype: object (string) columns versus numeric columns.
string_columns = df.select_dtypes(include=[object]).columns
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Numeric gaps are filled with the median of their own column.
numeric_medians = df[numeric_columns].median()
df[numeric_columns] = df[numeric_columns].fillna(numeric_medians)

# String gaps are filled with the column's most frequent value
# (the first entry returned by mode()).
for col in string_columns:
    most_frequent_value = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent_value)

crx

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [27]:
# Persist the imputed table back to the file it was loaded from.
# index=False: the loading cell calls read_csv without index_col, so a written
# row index would come back as an extra 'Unnamed: 0' column on the next load.
# NOTE(review): this overwrites the source data in place, and the loading cell
# skips file lines 1-2 (skiprows=[1, 2]). Confirm the rewritten file still has
# the layout that loader expects, or write to a separate path instead.
crx.to_csv('../data/crx.data.csv', index=False)

In [21]:
# Show the most frequent value(s) of each string column.
crx[string_columns].mode()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A16
0,b,u,g,c,v,t,f,f,g,-
