In [7]:
# Import Statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import math

In [45]:
#data cleaning and preprocessing


def process_data_to_dataframe(file_path,nc):
    # Read the contents of the .data file
    with open(file_path, 'r') as data_file:
        data_content = data_file.readlines()

    # Process the data_content and convert it to a list of lists representing the tabular data
    tabular_data = []
    for line in data_content:
        # Split the line using commas to get individual data entries
        entry = line.strip().split(',')
        tabular_data.append(entry)

    # Convert the list of lists to a pandas DataFrame
    df = pd.DataFrame(tabular_data)

    # Separate the first column (labels) from the rest (features)
    labels = df.iloc[:, nc]
    features = df.iloc[:, :nc]

    # Reset the index of the features DataFrame to ensure correct alignment during concatenation
    features.reset_index(drop=True, inplace=True)

    # Transpose the DataFrame to get the desired output, with the labels as the first row and the features as columns
    df= pd.concat([labels, features], axis=1)

    return df
df_train = process_data_to_dataframe('breast-cancer-wisconsin.data',10)

df_train = df_train.rename(columns={10: 'Class',9:'Mitoses',8: 'Normal Nucleoli',7:'Bland Chromatin',6:'Bare Nuclei',5:'Single Epithelial Cell Size',4:'Marginal Adhesion',3: 'Uniformity of Cell Shape', 2:'Uniformity of Cell Size',1:'Clump Thickness',0:'Sample Code number'})

df_train.head(20)

Unnamed: 0,Class,Sample Code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,2,1000025,5,1,1,1,2,1,3,1,1
1,2,1002945,5,4,4,5,7,10,3,2,1
2,2,1015425,3,1,1,1,2,2,3,1,1
3,2,1016277,6,8,8,1,3,4,3,7,1
4,2,1017023,4,1,1,3,2,1,3,1,1
5,4,1017122,8,10,10,8,7,10,9,7,1
6,2,1018099,1,1,1,1,2,10,3,1,1
7,2,1018561,2,1,2,1,2,1,3,1,1
8,2,1033078,2,1,1,1,2,1,1,1,5
9,2,1033078,4,2,1,1,2,1,2,1,1


In [48]:
df_test = process_data_to_dataframe('test.data',9)
df_test = df_test.rename(columns={9:'Mitoses',8: 'Normal Nucleoli',7:'Bland Chromatin',6:'Bare Nuclei',5:'Single Epithelial Cell Size',4:'Marginal Adhesion',3: 'Uniformity of Cell Shape', 2:'Uniformity of Cell Size',1:'Clump Thickness',0:'Sample Code number'})

df_test.head

<bound method NDFrame.head of     Mitoses Sample Code number Clump Thickness Uniformity of Cell Size  \
0         1            1002945               5                       4   
1         1            1016277               6                       8   
2         1            1018099               1                       1   
3         5            1033078               2                       1   
4         1            1035283               1                       1   
..      ...                ...             ...                     ...   
195       1            1076352               3                       6   
196       1            1119189               5                       8   
197       7            1286943               8                      10   
198       1            1313325               4                      10   
199       1             412300              10                       4   

    Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size  \
0  

In [34]:
def Entropy(data):
    data['Class'] = data['Class'].astype(int)
    n2 = np.sum(data['Class'] == 2)  # number of 2's
    n4 = np.sum(data['Class'] == 4)  # number of 4's
    n = n2 + n4
    if n == 0:
        return 0  # Handle the case when n is zero to avoid division by zero
    p2 = n2 / n
    p4 = n4 / n

    if p2 == 0 or p4 == 0:
        return 0  # Handle the case when p2 or p4 is zero to avoid NaN in logarithm
    entropy = -(p2)*np.log2(p2)-(p4)*np.log2(p4) 
    
    return entropy
          

In [None]:
#Function by Hugh Liu
def infogain(data, feature, threshold):
    ''' Calculates the information gain for a given feature and threshold'''
    count = len(data)
    d1 = data[data[:, feature - 1] <= threshold] 
    d2 = data[data[:, feature - 1] > threshold]
    proportion_d1 = len(d1) / count
    proportion_d2 = len(d2) / count
    return Entropy(data) - proportion_d1 * Entropy(d1) - proportion_d2 * Entropy(d2)



In [None]:
#Function by Hugh Liu
def get_best_split(data, feature_list, threshold_list):
    ''' Calculates Max Info Gain, computes the threshold and returns the feature, threshold, and predictions for left and right nodes'''
    c = len(data)
    
    # c0 is the number of instances with class label 2
    c0 = sum(b[-1] == 2 for b in data)

    # if all instances have class label 2, return 2
    # else if all instances have class label 4, return 4
    if c0 == c: return 2, None, None, None
    if c0 == 0: return 4, None, None, None

    # compute possible information gain for all features and thresholds
    # pairwise combinations
    ig = [[infogain(
        data, feature, threshold) for threshold in threshold_list] for feature in feature_list]
    
    # convert ig to numpy array
    ig = np.array(ig)
    
    # find the maximum information gain
    max_ig = max(max(i) for i in ig)

    # if max_ig is 0, return 2 if c0 >= c - c0, else return 4
    # remember c0 is the number of instances with class label 2
    # and c - c0 is the number of instances with class label 4
    if max_ig == 0:
        if c0 >= c - c0:
            return 2, None, None, None
        else:
            return 4, None, None, None

    # can also return max_ig in case you need it for debugging
    
    # find the index of the maximum information gain
    idx = np.unravel_index(np.argmax(ig, axis=None), ig.shape)

    # return the feature, threshold, and predictions for left and right nodes
    feature, threshold = feature_list[idx[0]], threshold_list[idx[1]]

    # binary split: split the data into two parts based on the threshold
    dl = data[data[:, feature - 1] <= threshold]
    dr = data[data[:, feature - 1] > threshold]

    # get the number of instances with class label 2 and 4 in the left node
    dl_n2 = np.sum(dl[:,-1] == 2)
    dl_n4 = np.sum(dl[:,-1] == 4)

    # if the number of instances with class label 2 is greater than or equal to 4, predict 2
    if dl_n2 >= dl_n4:
        dl_prediction = 2
    else:
        # else predict 4
        dl_prediction = 4
    
    # get the number of instances with class label 2 and 4 in the left node
    dr_n2 = np.sum(dr[:,-1] == 2)
    dr_n4 = np.sum(dr[:,-1] == 4)

    # if the number of instances with class label 2 is greater than or equal to 4, predict 2
    if dr_n2 >= dl_n4:
        dr_prediction = 2
    else:
        # else predict 4
        dr_prediction = 4
    return feature, threshold, dl_prediction, dr_prediction

In [37]:
#Question 1
df_train['Class'] = df_train['Class'].astype(int)
n2 = df_train['Class'].value_counts().get(2, 0)  # number of 2's
n4 = df_train['Class'].value_counts().get(4, 0)  # number of 4's
print(n2)
print(n4)


458
241


In [36]:
# Question 2
print(Entropy(df_train))

0.9293179372497982


In [51]:
#Question 3
threshold_list =range(0,10)
feature_list = [7, 3, 9, 5, 8, 10]
for threshold in threshold_list:
    print(infogain(df_train,feature_list,threshold))

NameError: name 'infogain' is not defined