PART 1

In [2]:
import numpy as np
import pandas as pd
import os

# Load Data

def load_data(filename):
    file_path = os.path.join(os.getcwd(), filename)  
    df = pd.read_csv(file_path)
    return df

# Load all datasets
df_A = load_data("rtg_A.csv")
df_B = load_data("rtg_B.csv")
df_C = load_data("rtg_C.csv")  

# Entropy Calculation
def entropy(y):
    values, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

# Information Gain
def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[feature], return_counts=True)
    weighted_entropy = sum((counts[i] / sum(counts)) * entropy(data[data[feature] == values[i]][target]) for i in range(len(values)))
    return total_entropy - weighted_entropy


class DecisionTree:
    def __init__(self, threshold=0.00001):
        self.tree = {}
        self.threshold = threshold

    def build_tree(self, data, features, target):
        y = data[target]
        if len(np.unique(y)) == 1:  
            return y.iloc[0]

        if not features:
            return y.value_counts().idxmax()

        best_feature = max(features, key=lambda f: information_gain(data, f, target))
        ig = information_gain(data, best_feature, target)

        if ig < self.threshold:
            return y.value_counts().idxmax()

        tree = {best_feature: {}}
        for value in np.unique(data[best_feature]):
            subset = data[data[best_feature] == value]
            tree[best_feature][value] = self.build_tree(subset, [f for f in features if f != best_feature], target)

        return tree

    def fit(self, data, target):
        self.tree = self.build_tree(data, list(data.columns[:-1]), target)

    def print_tree(self, tree=None, indent=""):
        if tree is None:
            tree = self.tree
        if isinstance(tree, dict):
            for key, value in tree.items():
                print(indent + str(key))
                self.print_tree(value, indent + "  ")
        else:
            print(indent + "->", tree)

# Train and print decision tree for each dataset
for df, name in zip([df_A, df_B, df_C], ["rtg_A", "rtg_B", "rtg_C"]):
    print(f"\nDecision Tree for {name}:")
    dt = DecisionTree()
    dt.fit(df, target=df.columns[-1])
    dt.print_tree()



Decision Tree for rtg_A:
att0
  0
    att1
      0
        -> 1
      1
        att2
          0
            -> 1
          1
            -> 0
  1
    -> 1

Decision Tree for rtg_B:
att3
  0
    att0
      0
        att2
          0
            -> 1
          1
            att1
              0
                att4
                  0
                    -> 0
                  1
                    -> 1
              1
                -> 1
      1
        att2
          0
            att1
              0
                -> 1
              1
                -> 0
          1
            att1
              0
                att4
                  0
                    -> 0
                  1
                    -> 1
              1
                -> 1
  1
    att4
      0
        att2
          0
            att1
              0
                att0
                  0
                    -> 0
                  1
                    -> 1
              1
                -> 1
          1


PART 2


In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data
def load_data(filename):
    file_path = os.path.join(os.getcwd(), filename)  # Use current working directory
    df = pd.read_csv(file_path)
    return df

# Load breast cancer dataset
df_bc = load_data("breast-cancer.csv")

# Handle missing values (mean and median imputation)
imputer_mean = SimpleImputer(strategy="mean")
imputer_median = SimpleImputer(strategy="median")
df_bc.iloc[:, 1:] = imputer_mean.fit_transform(df_bc.iloc[:, 1:])  # Apply mean imputation

# Normalize data (standardization & min-max scaling)
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
X_standard = scaler_standard.fit_transform(df_bc.iloc[:, 1:])
X_minmax = scaler_minmax.fit_transform(df_bc.iloc[:, 1:])

y = df_bc.iloc[:, 0].apply(lambda x: 1 if x == "M" else 0)  # Convert labels to binary
X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=0.3, random_state=42)

# Train classifiers with different hyperparameters
models = {
    "KNN": [KNeighborsClassifier(n_neighbors=k) for k in [3, 9, 15, 21]],
    "Decision Tree": [DecisionTreeClassifier(max_depth=d) for d in [2, 8, 14]],
    "AdaBoost": [AdaBoostClassifier(n_estimators=n) for n in [10, 20, 30]],
    "Random Forest": [RandomForestClassifier(n_estimators=n) for n in [10, 30, 50, 60]]
}

# Evaluate classifiers
results = []
for model_name, model_list in models.items():
    for model in model_list:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results.append([model_name, str(model.get_params()),
                        accuracy_score(y_test, y_pred),
                        precision_score(y_test, y_pred),
                        recall_score(y_test, y_pred),
                        f1_score(y_test, y_pred)])

# Convert results to DataFrame and display
results_df = pd.DataFrame(results, columns=["Classifier", "Hyperparameters", "Accuracy", "Precision", "Recall", "F1 Score"])
print(results_df)

# Pearson Correlation for Feature Selection
correlation_matrix = df_bc.corr()
selected_features = correlation_matrix[abs(correlation_matrix["diagnosis"]) > 0.6].index
X_selected = df_bc[selected_features]

# PCA for Dimensionality Reduction
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_standard)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# k-Fold Cross Validation
for model_name, model_list in models.items():
    for model in model_list[:2]:  # Only first two hyperparameters for efficiency
        scores = cross_val_score(model, X_standard, y, cv=10, scoring="accuracy")
        print(f"{model_name} ({model.get_params()}): Mean Accuracy: {scores.mean():.4f}, Std Dev: {scores.std():.4f}")


In [4]:
import pandas as pd
data = pd.read_csv('breast-cancer.csv') from sklearn.model_selection import train_test_split

X = data.drop(columns='target')  # Assuming 'target' is the column for cancer type (M/B)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Example for KNN with different n_neighbors values
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='binary'))
print('Recall:', recall_score(y_test, y_pred, average='binary'))
print('F1 Score:', f1_score(y_test, y_pred, average='binary'))
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': [3, 9, 15, 21]}
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_
from sklearn.model_selection import cross_val_score

knn_cv = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=10, scoring='accuracy')
print("KNN Accuracy:", knn_cv.mean())

from sklearn.impute import SimpleImputer

imputer_mean = SimpleImputer(strategy='mean')
imputer_median = SimpleImputer(strategy='median')

data_imputed_mean = imputer_mean.fit_transform(data)
data_imputed_median = imputer_median.fit_transform(data)
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

data_standardized = scaler_standard.fit_transform(data)
data_minmax_scaled = scaler_minmax.fit_transform(data)
from sklearn.model_selection import train_test_split

X = data.drop(columns='target')  # Assuming 'target' is the column for cancer type (M/B)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Example for KNN with different n_neighbors values
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='binary'))
print('Recall:', recall_score(y_test, y_pred, average='binary'))
print('F1 Score:', f1_score(y_test, y_pred, average='binary'))
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': [3, 9, 15, 21]}
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_
from sklearn.model_selection import cross_val_score

knn_cv = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=10, scoring='accuracy')
print("KNN Accuracy:", knn_cv.mean())


KeyError: "['target'] not found in axis"