# Q1

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer()

# Split the dataset into features (X) and target variable (y)
X = data.data
y = data.target

feature_names = data.feature_names


# Get data types of features
feature_types = [np.issubdtype(X[:, i].dtype, np.number) for i in range(X.shape[1])]

# Select numerical features
numerical_features = [feature_names[i] for i, is_numerical in enumerate(feature_types) if is_numerical]

# Select categorical features
categorical_features = [feature_names[i] for i, is_numerical in enumerate(feature_types) if not is_numerical]

# Define the feature selection step
feature_selector = SelectKBest()

# Define the numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define the categorical pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Combine the numerical and categorical pipelines
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)
])

# Create the final pipeline with feature selection and classifier
pipeline = Pipeline([
    ('feature_selection', feature_selector),
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
"""
Possible improvements for the pipeline:

Explore different feature selection methods and choose the most appropriate one for your dataset.
Try different imputation strategies and evaluate their impact on the model's performance.
Experiment with different preprocessing techniques such as different scaling methods or encoding strategies.
Consider adding more advanced techniques like feature engineering or dimensionality reduction to improve model performance.
"""

# Q2

In [7]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
data = load_iris()
X, y = data.data, data.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define individual classifiers
rf_clf = RandomForestClassifier()
lr_clf = LogisticRegression()

# Create the voting classifier pipeline
voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('lr', lr_clf)],
    voting='hard'
)

# Train the pipeline
voting_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = voting_clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
