<a href="https://colab.research.google.com/github/mbrudd/csci290/blob/main/notebooks/Attribute_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the [Breast Cancer Wisconsin (Original) dataset](https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original) from the UCI Machine Learning Repository

In [73]:
pip install ucimlrepo



In [74]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset id 15 (the breast cancer dataset linked in the heading above).
# NOTE(review): presumably this is a network download on every run — confirm
# whether ucimlrepo caches locally.
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# Features and target, each exposed by ucimlrepo as a pandas DataFrame.
# `X` and `y` are notebook-global names that the cells below read.
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# Uncomment to inspect the dataset-level metadata.
# print(breast_cancer_wisconsin_original.metadata)

# Uncomment to inspect the per-variable information.
# print(breast_cancer_wisconsin_original.variables)


In [75]:
import pandas as pd
import math

## Explore the data a bit

In [76]:
# Column dtypes and non-null counts for the features; any column whose
# non-null count is below the row count contains missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Clump_thickness              699 non-null    int64  
 1   Uniformity_of_cell_size      699 non-null    int64  
 2   Uniformity_of_cell_shape     699 non-null    int64  
 3   Marginal_adhesion            699 non-null    int64  
 4   Single_epithelial_cell_size  699 non-null    int64  
 5   Bare_nuclei                  683 non-null    float64
 6   Bland_chromatin              699 non-null    int64  
 7   Normal_nucleoli              699 non-null    int64  
 8   Mitoses                      699 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 49.3 KB


In [77]:
# Same summary for the target frame (a single `Class` column).
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Class   699 non-null    int64
dtypes: int64(1)
memory usage: 5.6 KB


In [78]:
# Put features and target side by side in one frame (rows are aligned on the index),
# then preview the first few rows.
df = pd.concat(
    [X, y],
    axis="columns",
)
df.head()

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [79]:
# Distinct labels of the target column.
# NOTE(review): presumably 2 = benign and 4 = malignant — confirm against the dataset metadata.
df["Class"].unique()

array([2, 4])

In [80]:
# Every column except the target, selected with an explicit boolean mask.
features = df.columns[ [ name != "Class" for name in df.columns ] ]
features

Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses'],
      dtype='object')

In [81]:
# Alternative: the target is the last column, so drop it positionally.
df.columns[:-1]

Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses'],
      dtype='object')

## The number of unique values can help distinguish between categorical and quantitative features

In [82]:
# Count of distinct values per column; a small count suggests the column can be
# treated as categorical for splitting purposes.
df.nunique()

Unnamed: 0,0
Clump_thickness,10
Uniformity_of_cell_size,10
Uniformity_of_cell_shape,10
Marginal_adhesion,10
Single_epithelial_cell_size,10
Bare_nuclei,10
Bland_chromatin,10
Normal_nucleoli,10
Mitoses,9
Class,2


## Specify the target and feature of interest

In [83]:
# Column to predict, and the categorical column whose split is evaluated below.
target, attribute = "Class", "Clump_thickness"

In [84]:
# Distinct values of the chosen attribute, in order of first appearance (not sorted).
vals = pd.unique( df[attribute] )
vals

array([ 5,  3,  6,  4,  8,  1,  2,  7, 10,  9])

In [85]:
# Class proportions among the rows whose attribute equals one particular value.
# vals[8] is the ninth distinct value in order of first appearance.
props = df.loc[ df[attribute] == vals[8], target ].value_counts( normalize=True )
props

Unnamed: 0_level_0,proportion
Class,Unnamed: 1_level_1
4,1.0


In [86]:
# Shannon entropy (in bits) of the class proportions: -sum(p * log2(p)).
# value_counts never reports a zero proportion, so log2 is always defined.
entropy = 0
for share in props.array:
  entropy -= share * math.log2( share )
entropy

0.0

In [87]:
# Same entropy, this time walking the proportions by position.
entropy = 0
for position in range( props.size ):
  share = props.iloc[position]
  entropy -= share * math.log2( share )
entropy

0.0

## Calculate the weighted (conditional) entropy of the target after splitting on a categorical feature

In [None]:
# Weighted entropy of `target` after splitting on `attribute`:
#   sum over values v of  (n_v / n) * H(target | attribute == v)
# Lower is better: 0 means every branch of the split is pure.
entropy = 0
overall = len( df )
for val in vals:
  # Filter once per value and reuse the result for both the branch weight and
  # the class proportions (previously the same mask was evaluated twice).
  branch = df.loc[ df[attribute] == val, target ]
  weight = len( branch ) / overall
  # value_counts only reports classes that occur, so p > 0 and log2 is defined.
  for p in branch.value_counts( normalize=True ).array:
    entropy =  entropy - weight*(p*math.log2(p))

entropy

0.4645896346043592

# Load the [Blood Transfusion dataset](https://archive.ics.uci.edu/dataset/176/blood+transfusion+service+center) from the UCI Machine Learning Repository

In [88]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset id 176 (the blood transfusion dataset linked in the heading above).
blood_transfusion_service_center = fetch_ucirepo(id=176)

# Features and target as pandas DataFrames.
# NOTE: this rebinds the notebook-global names `X` and `y`, which until now
# held the breast cancer data; earlier cells re-run after this point will see
# the blood transfusion frames instead.
X = blood_transfusion_service_center.data.features
y = blood_transfusion_service_center.data.targets

# Uncomment to inspect the dataset-level metadata.
# print(blood_transfusion_service_center.metadata)

# Uncomment to inspect the per-variable information.
# print(blood_transfusion_service_center.variables)

## Explore the data a bit

In [89]:
# Column dtypes and non-null counts for the blood transfusion features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Recency    748 non-null    int64
 1   Frequency  748 non-null    int64
 2   Monetary   748 non-null    int64
 3   Time       748 non-null    int64
dtypes: int64(4)
memory usage: 23.5 KB


In [90]:
# Distinct values per feature; large counts suggest treating a feature as
# quantitative and splitting it on a threshold rather than per value.
X.nunique()

Unnamed: 0,0
Recency,31
Frequency,33
Monetary,33
Time,78


In [91]:
# Combine features and target column-wise (aligned on the index) and preview.
# This rebinds `df`, which previously held the breast cancer data.
df = pd.concat(
    [X, y],
    axis="columns",
)
df.head()

Unnamed: 0,Recency,Frequency,Monetary,Time,Donated_Blood
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


## Determine the best split point and calculate entropy for a quantitative feature

In [92]:
# Column to predict, and the quantitative column to search for a split threshold.
target, feature = "Donated_Blood", "Frequency"

In [94]:
# Candidate split thresholds: the distinct values of the chosen feature, ascending.
# Read the column through `feature` (not a hard-coded name) so the thresholds
# always come from the same column that the split loop filters on.
vals = df[ feature ].unique()
vals.sort()  # in-place sort of the array returned by unique()
vals

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 26, 33, 34, 38, 41, 43, 44, 46, 50])

In [101]:
# For each candidate threshold `val`, split the rows into
#   left:  feature <= val      right: feature > val
# and record the size-weighted entropy of `target` across the two sides.
# entropies[i] is the score for threshold vals[i]; lower is better.
overall = len( df )
entropies = []
for val in vals:
  entropy = 0
  # Only the target column is needed on each side. Left is processed before
  # right so the floating-point accumulation order is unchanged.
  sides = ( df.loc[ df[feature] <= val, target ],
            df.loc[ df[feature] > val, target ] )
  for side in sides:
    weight = len( side ) / overall
    # An empty side (e.g. right of the largest value) has no proportions and
    # therefore contributes nothing; present proportions are always > 0.
    for prop in side.value_counts( normalize = True ).array:
      entropy = entropy - weight*prop*math.log2( prop )
  entropies.append(entropy)

In [103]:
# Position of the smallest weighted entropy, i.e. the best candidate threshold.
pd.Series( entropies ).argmin()

3

In [104]:
# Weighted entropy at the best split. Computed directly rather than via a
# hard-coded index, so it stays correct if the feature or data changes.
min( entropies )

0.7589210065850729

In [105]:
# Threshold value achieving the minimum weighted entropy. The position is looked
# up from `entropies` rather than hard-coded, so it cannot go stale.
vals[ pd.Series( entropies ).idxmin() ]

4