### Name=Niharika Navin Nashine 
### mail= niharikanashine2004@gmail.com


## AutoMl

In [1]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [2]:

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
columns = ['ID', 'Diagnosis', 'Radius_mean', 'Texture_mean', 'Perimeter_mean', 'Area_mean', 
           'Smoothness_mean', 'Compactness_mean', 'Concavity_mean', 'Concave_points_mean', 
           'Symmetry_mean', 'Fractal_dimension_mean', 'Radius_se', 'Texture_se', 'Perimeter_se', 
           'Area_se', 'Smoothness_se', 'Compactness_se', 'Concavity_se', 'Concave_points_se', 
           'Symmetry_se', 'Fractal_dimension_se', 'Radius_worst', 'Texture_worst', 'Perimeter_worst', 
           'Area_worst', 'Smoothness_worst', 'Compactness_worst', 'Concavity_worst', 
           'Concave_points_worst', 'Symmetry_worst', 'Fractal_dimension_worst']

df = pd.read_csv(url, header=None, names=columns)
df.drop('ID', axis=1, inplace=True)

In [3]:

X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis'].map({'M': 1, 'B': 0})


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
tpot = TPOTClassifier(verbosity=2, generations=3, population_size=10)
tpot.fit(X_train, y_train)

                                                                           
Generation 1 - Current best internal CV score: 0.9714285714285713
                                                                           
Generation 2 - Current best internal CV score: 0.9758241758241757
                                                                           
Generation 3 - Current best internal CV score: 0.9758241758241757
                                                                           
Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=9, max_features=0.4, min_samples_leaf=19, min_samples_split=5, n_estimators=100, subsample=0.35000000000000003)


In [6]:
y_pred = tpot.predict(X_test)
print(tpot.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
tpot.export('best_pipeline.py')

0.9736842105263158
[[67  1]
 [ 2 44]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        68
           1       0.98      0.96      0.97        46

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



Meta-Learning Setup

In [24]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import ModuleList, Sequential, Linear, Flatten, ReLU
from sklearn.model_selection import train_test_split
import torch.nn as nn
import pandas as pd
import numpy as np

# Function to load and preprocess breast cancer dataset
def load_breast_cancer_dataset(url, columns):
    df = pd.read_csv(url, header=None, names=columns)
    df.drop('ID', axis=1, inplace=True)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis'].map({'M': 1, 'B': 0})

    X = X.to_numpy().astype(np.float32)  # Convert to float32 for PyTorch
    y = y.to_numpy()
    return X, y

# Define MAML model (you can customize the architecture as needed)
class MAML(torch.nn.Module):
    def __init__(self, input_size, output_size, meta_batch_size, num_inner_steps):
        super(MAML, self).__init__()
        self.meta_batch_size = meta_batch_size
        self.num_inner_steps = num_inner_steps
        self.model = nn.Sequential(
            Flatten(),
            Linear(input_size, 64),
            ReLU(),
            Linear(64, 32),
            ReLU(),
            Linear(32, output_size)
        )

    def forward(self, support_x, support_y, query_x, query_y):
        inner_losses = []
        for i in range(self.meta_batch_size):
            # Create a copy of the model by copying its state_dict
            inner_model = MAML(input_size=support_x[i].size(1), output_size=2, meta_batch_size=1, num_inner_steps=5)
            inner_model.load_state_dict(self.state_dict())  # Copy current model parameters

            inner_optimizer = Adam(inner_model.parameters(), lr=0.01)

            for _ in range(self.num_inner_steps):
                # Apply forward pass with support data
                inner_output = inner_model.model(support_x[i])  # Use inner model's architecture
                inner_loss = torch.nn.functional.cross_entropy(inner_output, support_y[i])  # Compute loss
                inner_optimizer.zero_grad()
                inner_loss.backward()
                inner_optimizer.step()

            query_output = inner_model.model(query_x[i])  # Forward pass on query
            inner_losses.append(torch.nn.functional.cross_entropy(query_output, query_y[i]))  # Store the query loss

        return torch.mean(torch.stack(inner_losses))  # Return the mean loss over all tasks

# Load breast cancer dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
columns = ['ID', 'Diagnosis', 'Radius_mean', 'Texture_mean', 'Perimeter_mean', 'Area_mean', 
           'Smoothness_mean', 'Compactness_mean', 'Concavity_mean', 'Concave_points_mean', 
           'Symmetry_mean', 'Fractal_dimension_mean', 'Radius_se', 'Texture_se', 'Perimeter_se', 
           'Area_se', 'Smoothness_se', 'Compactness_se', 'Concavity_se', 'Concave_points_se', 
           'Symmetry_se', 'Fractal_dimension_se', 'Radius_worst', 'Texture_worst', 'Perimeter_worst', 
           'Area_worst', 'Smoothness_worst', 'Compactness_worst', 'Concavity_worst', 
           'Concave_points_worst', 'Symmetry_worst', 'Fractal_dimension_worst']
X, y = load_breast_cancer_dataset(url, columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define dataset class for few-shot learning
# class FewShotDataset(torch.utils.data.Dataset):
#     def __init__(self, X, y, num_classes_per_task, num_samples_per_class):
#         self.X = X
#         self.y = y
#         self.num_classes_per_task = num_classes_per_task
#         self.num_samples_per_class = num_samples_per_class

#     def __len__(self):
#         return int(len(self.X) / (self.num_classes_per_task * self.num_samples_per_class))
#     class FewShotDataset(torch.utils.data.Dataset):
#     def __init__(self, X, y, num_classes_per_task, num_samples_per_class):
#         self.X = X
#         self.y = y
#         self.num_classes_per_task = num_classes_per_task
#         self.num_samples_per_class = num_samples_per_class

#     def __len__(self):
#         return int(len(self.X) / (self.num_classes_per_task * self.num_samples_per_class))

#     def __getitem__(self, idx):
#         start_idx = idx * self.num_classes_per_task * self.num_samples_per_class
#         end_idx = start_idx + self.num_classes_per_task * self.num_samples_per_class

#         task_data = self.X[start_idx:end_idx]
#         task_labels = self.y[start_idx:end_idx]

#         # Ensure the split between support and query sets
#         total_samples = len(task_data)
#         support_size = int(0.7 * total_samples)  # Use 70% for support and 30% for query

#         support_indices = np.random.choice(range(total_samples), support_size, replace=False)
#         query_indices = np.setdiff1d(range(total_samples), support_indices)

#         support_x = task_data[support_indices]
#         support_y = task_labels[support_indices]
#         query_x = task_data[query_indices]
#         query_y = task_labels[query_indices]

        # return support_x, support_y, query_x, query_y
# Define dataset class for few-shot learning
class FewShotDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, num_classes_per_task, num_samples_per_class):
        self.X = X
        self.y = y
        self.num_classes_per_task = num_classes_per_task
        self.num_samples_per_class = num_samples_per_class

    def __len__(self):
        return int(len(self.X) / (self.num_classes_per_task * self.num_samples_per_class))

    def __getitem__(self, idx):
        start_idx = idx * self.num_classes_per_task * self.num_samples_per_class
        end_idx = start_idx + self.num_classes_per_task * self.num_samples_per_class

        task_data = self.X[start_idx:end_idx]
        task_labels = self.y[start_idx:end_idx]

        # Ensure the split between support and query sets
        total_samples = len(task_data)
        support_size = int(0.7 * total_samples)  # Use 70% for support and 30% for query

        support_indices = np.random.choice(range(total_samples), support_size, replace=False)
        query_indices = np.setdiff1d(range(total_samples), support_indices)

        support_x = task_data[support_indices]
        support_y = task_labels[support_indices]
        query_x = task_data[query_indices]
        query_y = task_labels[query_indices]

        return support_x, support_y, query_x, query_y


    # def __getitem__(self, idx):
    #     start_idx = idx * self.num_classes_per_task * self.num_samples_per_class
    #     end_idx = start_idx + self.num_classes_per_task * self.num_samples_per_class

    #     task_data = self.X[start_idx:end_idx]
    #     task_labels = self.y[start_idx:end_idx]

    #     # Randomly sample support and query sets
    #     support_indices = np.random.choice(range(len(task_data)), self.num_samples_per_class * self.num_classes_per_task, replace=False)
    #     query_indices = np.setdiff1d(range(len(task_data)), support_indices)

    #     support_x = task_data[support_indices]
    #     support_y = task_labels[support_indices]
    #     query_x = task_data[query_indices]
    #     query_y = task_labels[query_indices]

    #     return support_x, support_y, query_x, query_y

# Create dataloader for few-shot learning
num_classes_per_task = 5
num_samples_per_class = 1
few_shot_dataloader = DataLoader(FewShotDataset(X_train, y_train, num_classes_per_task, num_samples_per_class), batch_size=1)

# Create MAML model and optimizer
maml_model = MAML(input_size=X.shape[1], output_size=2, meta_batch_size=1, num_inner_steps=5)
maml_optimizer = Adam(maml_model.parameters(), lr=0.001)
# # Meta-training loop
# num_epochs = 10  # Adjust as needed
# for epoch in range(num_epochs):
#     for support_x, support_y, query_x, query_y in few_shot_dataloader:
#         maml_optimizer.zero_grad()  # Reset gradients
#         meta_loss = maml_model(support_x, support_y, query_x, query_y)  # Forward pass
#         meta_loss.backward()  # Backpropagation to compute gradients
#         maml_optimizer.step()  # Update model parameters


# # Evaluate on few-shot tasks
# # Evaluate on few-shot tasks
# for support_x, support_y, query_x, query_y in few_shot_dataloader:
#     with torch.no_grad():  # Use no_grad only during evaluation
#         # Forward pass through the MAML model
#         query_output = maml_model.model(query_x[0])  # Get predictions

#         # Calculate accuracy
#         predictions = query_output.argmax(dim=1)  # Get predicted classes
#         accuracy = (predictions == query_y[0]).float().mean()  # Calculate accuracy
#         print("Accuracy:", accuracy.item())


# Meta-training loop
num_epochs = 10  # Adjust as needed
for epoch in range(num_epochs):
    for support_x, support_y, query_x, query_y in few_shot_dataloader:
        print(f"Support Y: {support_y}")
        print(f"Query Y: {query_y}")
        
        maml_optimizer.zero_grad()  # Reset gradients
        meta_loss = maml_model(support_x, support_y, query_x, query_y)  # Forward pass
        
        print(f"Meta Loss: {meta_loss.item()}")
        
        if torch.isnan(meta_loss):  # Check if loss becomes NaN
            print("Loss is NaN. Stopping.")
            break
        
        meta_loss.backward()  # Backpropagation to compute gradients
        maml_optimizer.step()  # Update model parameters

# Evaluate on few-shot tasks
for support_x, support_y, query_x, query_y in few_shot_dataloader:
    with torch.no_grad():  # Use no_grad only during evaluation
        query_output = maml_model.model(query_x[0])  # Get predictions
        predictions = query_output.argmax(dim=1)  # Get predicted classes
        print(f"Predictions: {predictions}")
        print(f"True Query Y: {query_y[0]}")
        accuracy = (predictions == query_y[0]).float().mean()  # Calculate accuracy
        print("Accuracy:", accuracy.item())



Support Y: tensor([[0, 0, 0]])
Query Y: tensor([[0, 0]])
Meta Loss: 0.0
Support Y: tensor([[0, 0, 1]])
Query Y: tensor([[0, 1]])
Meta Loss: 9.184491157531738
Support Y: tensor([[0, 1, 0]])
Query Y: tensor([[1, 0]])
Meta Loss: 3.008078098297119
Support Y: tensor([[0, 1, 1]])
Query Y: tensor([[0, 0]])
Meta Loss: 23.591876983642578
Support Y: tensor([[1, 0, 0]])
Query Y: tensor([[0, 0]])
Meta Loss: 3.1590361686539836e-06
Support Y: tensor([[0, 1, 1]])
Query Y: tensor([[0, 1]])
Meta Loss: 19.193586349487305
Support Y: tensor([[1, 0, 1]])
Query Y: tensor([[1, 0]])
Meta Loss: 3.082970142364502
Support Y: tensor([[1, 1, 0]])
Query Y: tensor([[0, 0]])
Meta Loss: 24.574447631835938
Support Y: tensor([[1, 0, 1]])
Query Y: tensor([[0, 0]])
Meta Loss: 14.253547668457031
Support Y: tensor([[1, 0, 1]])
Query Y: tensor([[0, 0]])
Meta Loss: 43.76853561401367
Support Y: tensor([[0, 0, 0]])
Query Y: tensor([[0, 0]])
Meta Loss: 0.0
Support Y: tensor([[0, 1, 0]])
Query Y: tensor([[0, 0]])
Meta Loss: 4.768

## multi modal learning

In [31]:
import os
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Disable the symlink warning
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load the image from your local directory
image_path = "cat.jpg"  # Update this with the correct path if needed
text_description = "A cute cat sitting on a table."  # Example text description

# Load image
image = Image.open(image_path)

# Initialize CLIP processor and model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")

# Process inputs
inputs = processor(text=text_description, images=image, return_tensors="pt", padding=True)

# Forward pass through the model
outputs = model(**inputs)

# Extract logits and compute probabilities
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

# Print the probabilities
print("Probabilities:", probs)


Probabilities: tensor([[1.]], grad_fn=<SoftmaxBackward0>)


## self improvement

In [3]:
import pandas as pd
from tpot import TPOTClassifier  # or TPOTRegressor for regression tasks
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load breast cancer dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
columns = ['ID', 'Diagnosis', 'Radius_mean', 'Texture_mean', 'Perimeter_mean', 'Area_mean', 
           'Smoothness_mean', 'Compactness_mean', 'Concavity_mean', 'Concave_points_mean', 
           'Symmetry_mean', 'Fractal_dimension_mean', 'Radius_se', 'Texture_se', 'Perimeter_se', 
           'Area_se', 'Smoothness_se', 'Compactness_se', 'Concavity_se', 'Concave_points_se', 
           'Symmetry_se', 'Fractal_dimension_se', 'Radius_worst', 'Texture_worst', 'Perimeter_worst', 
           'Area_worst', 'Smoothness_worst', 'Compactness_worst', 'Concavity_worst', 
           'Concave_points_worst', 'Symmetry_worst', 'Fractal_dimension_worst']

# Read the dataset
data = pd.read_csv(url, header=None, names=columns)
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})  # Convert Diagnosis to binary

# Split data into features and target
X = data.drop(columns=['ID', 'Diagnosis']).values
y = data['Diagnosis'].values

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define AutoML system
class AutoMLSystem:
    def __init__(self):
        self.model_performance_history = {}

    def evaluate_model(self, model, X_test, y_test):
        y_pred = model.predict(X_test)  # Make sure your model has a predict method
        accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
        return accuracy

    def update_model_selection(self, model_name, performance):
        if model_name in self.model_performance_history:
            self.model_performance_history[model_name].append(performance)
        else:
            self.model_performance_history[model_name] = [performance]

# Initialize TPOT with 10 generations
tpot = TPOTClassifier(generations=10, population_size=20, random_state=42)

# Fit TPOT model
tpot.fit(X_train, y_train)

# Initialize AutoML system
automl_system = AutoMLSystem()

# Evaluate the TPOT model
model_performance = automl_system.evaluate_model(tpot, X_test, y_test)
automl_system.update_model_selection("TPOT", model_performance)

# Print the model performance history
print("Model Performance History:", automl_system.model_performance_history)


Model Performance History: {'TPOT': [0.9824561403508771]}


## Conclusion:- In this project, I implemented an AutoML system utilizing TPOT to optimize the classification of breast cancer using the Wisconsin dataset. By evaluating different models over ten generations, I aimed to identify the most effective algorithm for accurate predictions. This approach highlights the potential of automated machine learning in enhancing model selection and performance in real-world applications.