# Talha Mahmood Awan (21B-126-SE)

# TASK 1

# SECTION 2

In [None]:
# Importing necessary libraries and functions
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from matplotlib import pyplot

# Function to generate a synthetic classification dataset
def get_dataset():
    """Build the synthetic classification dataset used by the experiments.

    The data comes from ``make_classification`` with 5000 samples and 20
    features (10 informative, 10 redundant), seeded with ``random_state=1``.

    Returns:
        The ``(X, y)`` pair produced by ``make_classification``.
    """
    return make_classification(
        n_samples=5000,
        n_features=20,
        n_informative=10,
        n_redundant=10,
        random_state=1,
    )

# Function to define and return a list of classifier models
def get_models():
    """Return the base classifiers as a list of ``(name, estimator)`` tuples.

    A fresh, unfitted estimator is created for every entry on each call.

    Returns:
        A new list with the entries 'lr', 'knn', 'tree', 'nb' and 'svm',
        in that order.
    """
    # NOTE(review): no random_state is passed to DecisionTreeClassifier or
    # SVC, so scores may vary slightly between runs -- confirm this is intended.
    return [
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('tree', DecisionTreeClassifier()),
        ('nb', GaussianNB()),
        # probability=True so the SVM can contribute class probabilities
        # to the soft-voting ensembles built elsewhere in this file.
        ('svm', SVC(probability=True)),
    ]

# Function to evaluate a single model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold CV.

    Uses 10 splits repeated 3 times (seeded with ``random_state=1``),
    accuracy as the metric, and all available cores (``n_jobs=-1``).

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.

    Returns:
        The per-fold accuracy scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
    )

# Generating dataset and models
X, y = get_dataset()
models = get_models()

# Evaluating each model and printing mean accuracy and standard deviation
results, names = [], []
for name, model in models:
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('> %s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# Visualizing the performance of models using boxplots.
# Tick labels are set through xticks instead of boxplot(labels=...): that
# keyword was deprecated in Matplotlib 3.9 (renamed to tick_labels), while
# xticks works on both older and newer releases. Boxes are drawn at the
# default positions 1..N, so the labels line up with them.
pyplot.boxplot(results, showmeans=True)
pyplot.xticks(list(range(1, len(names) + 1)), names)
pyplot.show()

# Creating a voting ensemble of classifiers
ensemble = VotingClassifier(estimators=models, voting='soft')

# Evaluating the ensemble through the same helper as the individual models,
# so both are always scored under one cross-validation protocol.
scores = evaluate_model(ensemble, X, y)

# Printing mean accuracy and standard deviation of the ensemble
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# SECTION 3

In [None]:
# Function to evaluate an ensemble of models
def evaluate_ensemble(models, X, y):
    """Return the mean CV accuracy of a soft-voting ensemble of ``models``.

    Args:
        models: List of ``(name, estimator)`` tuples to combine.
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``0.0`` when ``models`` is empty; otherwise the mean accuracy over
        10-fold stratified cross-validation repeated 3 times.
    """
    # An empty ensemble cannot be fitted; treat it as the worst possible score.
    if not models:
        return 0.0
    voter = VotingClassifier(estimators=models, voting='soft')
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_scores = cross_val_score(
        voter, X, y, scoring='accuracy', cv=splitter, n_jobs=-1
    )
    return mean(fold_scores)

# Function to prune models from the ensemble
def prune_round(models_in, X, y):
    """Find the single member whose removal improves the ensemble the most.

    ``models_in`` itself is not modified; each trial runs on a new list.

    Args:
        models_in: List of ``(name, estimator)`` tuples in the ensemble.
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``(best_score, removed)``. ``removed`` is the member to drop, or
        ``None`` when no removal beats the full ensemble, in which case
        ``best_score`` is the score of the unmodified ensemble.
    """
    top_score = evaluate_ensemble(models_in, X, y)
    to_drop = None
    for position, member in enumerate(models_in):
        # Ensemble with just this one member left out.
        trial = models_in[:position] + models_in[position + 1:]
        trial_score = evaluate_ensemble(trial, X, y)
        # Strict comparison: a removal must improve the score, not just tie.
        if trial_score > top_score:
            top_score, to_drop = trial_score, member
    return top_score, to_drop

# Function to iteratively prune models from the ensemble until no further improvement
def prune_ensemble(models, X, y):
    """Greedily remove members from the ensemble while accuracy improves.

    Each round calls ``prune_round`` and drops the member it reports, until
    a round finds no removal that improves the score.

    Args:
        models: List of ``(name, estimator)`` tuples. It is modified in
            place: pruned members are removed from this very list.
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``(best_score, models)``: the score of the final ensemble and the
        same (now pruned) list object that was passed in.
    """
    # None means "no removal has been accepted yet".
    best_score = None
    while True:
        score, removed = prune_round(models, X, y)
        if removed is None:
            print('>no further improvement')
            # When nothing was ever pruned, prune_round's score is the
            # baseline of the untouched ensemble. Report that instead of a
            # 0.0 placeholder, which would misstate the accuracy.
            if best_score is None:
                best_score = score
            break
        best_score = score
        models.remove(removed)
        print('>%.3f (removed: %s)' % (score, removed[0]))
    return best_score, models

# Build the dataset and the full pool of candidate models
X, y = get_dataset()
models = get_models()

# Prune the ensemble, then report the surviving members and their score
score, model_list = prune_ensemble(models, X, y)
names = ','.join(label for label, _ in model_list)
print('Models: %s' % names)
print('Final Mean Accuracy: %.3f' % score)


# SECTION 4

In [None]:
# Function to add models to the ensemble
def grow_round(models_in, models_candidate, X, y):
    """Find the single candidate whose addition improves the ensemble the most.

    Neither input list is modified; each trial runs on a new list.

    Args:
        models_in: List of ``(name, estimator)`` tuples already selected.
        models_candidate: List of ``(name, estimator)`` tuples to try adding.
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``(best_score, addition)``. ``addition`` is the candidate to add, or
        ``None`` when no candidate beats the current ensemble, in which case
        ``best_score`` is the score of the current ensemble.
    """
    top_score = evaluate_ensemble(models_in, X, y)
    winner = None
    for candidate in models_candidate:
        # Current ensemble extended by this one candidate.
        trial = [*models_in, candidate]
        trial_score = evaluate_ensemble(trial, X, y)
        # Strict comparison: an addition must improve the score, not just tie.
        if trial_score > top_score:
            top_score, winner = trial_score, candidate
    return top_score, winner

# Function to iteratively add models to the ensemble until no further improvement
def grow_ensemble(models, X, y):
    """Greedily build an ensemble by adding one candidate per round.

    Starting from an empty ensemble, each round calls ``grow_round`` and
    adds the candidate it reports, until no addition improves the score.

    Args:
        models: Candidate ``(name, estimator)`` tuples. It is modified in
            place: every selected candidate is removed from this list.
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``(best_score, best_list)``: the score of the final ensemble and its
        members in the order they were added. When nothing is ever added
        this is ``(0.0, [])``.
    """
    top_score = 0.0
    chosen = []
    while True:
        round_score, pick = grow_round(chosen, models, X, y)
        if pick is None:
            print('>no further improvement')
            return top_score, chosen
        top_score = round_score
        # Move the winner from the candidate pool into the ensemble.
        models.remove(pick)
        chosen.append(pick)
        labels = ','.join([label for label, _ in chosen])
        print('> %.3f (%s)' % (round_score, labels))

# Growing the ensemble and printing the best ensemble and its score.
# The candidate pool is rebuilt first: prune_ensemble (Section 3) removes
# entries from the global `models` list in place, so reusing it here would
# silently hide the pruned classifiers from the growing search.
models = get_models()
best_score, best_list = grow_ensemble(models, X, y)
print("Best ensemble:", best_list)
print("Best score:", best_score)

# TASK 2

In [None]:
# Importing necessary libraries
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from matplotlib import pyplot

# Function to get the dataset
def get_dataset():
    """Generate the synthetic classification dataset for Task 2.

    Returns:
        The ``(X, y)`` pair from ``make_classification`` with 5000 samples
        and 20 features (10 informative, 10 redundant), ``random_state=1``.
    """
    features, labels = make_classification(
        n_samples=5000,
        n_features=20,
        n_informative=10,
        n_redundant=10,
        random_state=1,
    )
    return features, labels

# Function to get a list of models
def get_models():
    """Return the models to compare as a list of ``(name, estimator)`` tuples.

    Returns:
        A new list with freshly constructed estimators named 'lr', 'knn',
        'tree', 'nb' and 'svm', in that order.
    """
    labels = ('lr', 'knn', 'tree', 'nb', 'svm')
    estimators = (
        LogisticRegression(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        GaussianNB(),
        SVC(probability=True),
    )
    # Pair every short name with its estimator, preserving the order above.
    return list(zip(labels, estimators))

# Function to evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the fold accuracies.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.

    Returns:
        The scores from ``cross_val_score`` using accuracy and 10-fold
        stratified CV repeated 3 times (``random_state=1``), ``n_jobs=-1``.
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=-1,
    )

# Build the dataset and the models to compare
X, y = get_dataset()
models = get_models()

# Map each model name to its (mean accuracy, standard deviation) pair
model_scores = {}
for name, model in models:
    scores = evaluate_model(model, X, y)
    mean_accuracy, std_accuracy = mean(scores), std(scores)
    model_scores[name] = (mean_accuracy, std_accuracy)
    print('>%s Mean Accuracy: %.3f (%.3f)' % (name, mean_accuracy, std_accuracy))

# Pick the entry with the highest mean accuracy (the first one wins on ties)
best_model_name, (best_model_mean_accuracy, best_model_std_accuracy) = max(
    model_scores.items(), key=lambda entry: entry[1][0]
)

print("\nBest Model:")
print("Model:", best_model_name)
print("Mean Accuracy:", best_model_mean_accuracy)
print("Standard Deviation of Accuracy:", best_model_std_accuracy)