<a href="https://colab.research.google.com/github/nogaklein00/CodeTest_326364007/blob/main/CodeForTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Question 5

import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import mode

X_all = np.vstack((X_labeled, X_unlabeled))

# K-Means Clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_all)

# Assign Labels Based on Clusters
cluster_labels = {}
for cluster_id in range(2):
    indices = np.where(clusters[:200] == cluster_id)[0] # Only labeled data
    majority_label = mode(y_labeled[indices])[0][0] # Majority vote in the cluster
    cluster_labels[cluster_id] = majority_label # Assign majority label to cluster

pseudo_labels = np.array([cluster_labels[c] for c in clusters[200:]]) # Only for 800 unlabed samples

# Train a New Classifier
X_train = np.vstack((X_labeled, X_unlabeled))
y_train = np.concatenate((y_labeled, pseudo_labels))

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate on the Test Set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [64]:
# Question 6

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

class ContinuousRandomForest:
    def __init__(self, n_estimators_per_phase=50):
        self.n_estimators = n_estimators_per_phase
        self.phase1_model = RandomForestClassifier(n_estimators=n_estimators_per_phase, oob_score=True, random_state=42)
        self.phase2_model = RandomForestClassifier(n_estimators=n_estimators_per_phase, oob_score=True, random_state=43)
        self.is_phase2_trained = False

    def fit_phase1(self, X, y):
        self.phase1_model.fit(X, y)
        return self.phase1_model.oob_score_

    def continue_phase2(self, X, y):
        self.phase2_model.fit(X, y)
        self.is_phase2_trained = True
        return self.get_combined_oob_score(X, y)

    def predict_proba(self, X):
        # Get predictions from phase 1
        phase1_pred = self.phase1_model.predict_proba(X)

        if self.is_phase2_trained:
            # If phase 2 is trained, combine predictions from both phases
            phase2_pred = self.phase2_model.predict_proba(X)
            return (phase1_pred + phase2_pred) / 2
        return phase1_pred

    def get_combined_oob_score(self, X, y):
        preds = self.predict_proba(X)
        predictions = np.argmax(preds, axis=1)
        return np.mean(predictions == y)

data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_full = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_full.fit(X_train, y_train)
gb_full = GradientBoostingClassifier(n_estimators=100, random_state=42, subsample=0.8)
gb_full.fit(X_train, y_train)

# Find most important features
rf_importance = rf_full.feature_importances_
gb_importance = gb_full.feature_importances_
rf_top_feature = np.argmax(rf_importance)
gb_top_feature = np.argmax(gb_importance)

print("\nMost Important Features:")
print(f"Random Forest - Most Important Feature: {data.feature_names[rf_top_feature]}")
print(f"Gradient Boosting - Most Important Feature: {data.feature_names[gb_top_feature]}")

# Prepare data for phase 1 (zero out important feature)
X_train_phase1 = X_train.copy()
X_train_phase1[:, rf_top_feature] = 0

# Train continuous random forest
cont_rf = ContinuousRandomForest(n_estimators_per_phase=50)
rf_phase1_oob = cont_rf.fit_phase1(X_train_phase1, y_train)
rf_phase2_oob = cont_rf.continue_phase2(X_train, y_train)

# Train gradient boosting
gb_model = GradientBoostingClassifier(n_estimators=50, random_state=42, subsample=0.8, warm_start=True)
X_train_gb_phase1 = X_train.copy()
X_train_gb_phase1[:, gb_top_feature] = 0
gb_model.fit(X_train_gb_phase1, y_train)
gb_phase1_oob_error = 1 - gb_model.train_score_[-1]

gb_model.n_estimators = 100
gb_model.fit(X_train, y_train)
gb_phase2_oob_error = 1 - gb_model.train_score_[-1]

print("\nModel Performance:")
print("Random Forest:")
print(f"Reference Full Model OOB Score: {rf_full.oob_score_:.4f}")
print(f"Phase 1 (50 iterations, zeroed important feature) OOB Score: {rf_phase1_oob:.4f}")
print(f"Phase 2 (100 iterations combined, restored feature) OOB Score: {rf_phase2_oob:.4f}")

print("\nGradient Boosting:")
print(f"Reference Full Model OOB Score: {1 - gb_full.train_score_[-1]:.4f}")
print(f"Phase 1 (50 iterations, zeroed important feature) OOB-like Error: {gb_phase1_oob_error:.4f}")
print(f"Final (100 iterations, restored feature) OOB-like Error: {gb_phase2_oob_error:.4f}")


Most Important Features:
Random Forest - Most Important Feature: worst area
Gradient Boosting - Most Important Feature: worst concave points

Model Performance:
Random Forest:
Reference Full Model OOB Score: 0.9560
Phase 1 (50 iterations, zeroed important feature) OOB Score: 0.9626
Phase 2 (100 iterations combined, restored feature) OOB Score: 1.0000

Gradient Boosting:
Reference Full Model OOB Score: 0.9933
Phase 1 (50 iterations, zeroed important feature) OOB-like Error: 0.9601
Final (100 iterations, restored feature) OOB-like Error: 0.9921
