# Install Requirements

In [None]:
!pip install numpy
!pip install scikit-learn

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# Assuming X is your features and y is your labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# K-Fold

In [7]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load a sample dataset for demonstration (Iris dataset)
data = load_iris()
X, y = data.data, data.target

# Number of folds
k = 5

# Create KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize an array to store accuracy for each fold
accuracies = []

# K-Fold Cross-Validation
for train_index, test_index in kf.split(X):
    # Split data into training and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and train the model
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(accuracy)
# Calculate average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {average_accuracy}")


1.0
0.9666666666666667
0.9333333333333333
0.9333333333333333
0.9333333333333333
Average Accuracy: 0.9533333333333335


In [5]:
kf

KFold(n_splits=5, random_state=42, shuffle=True)

# Classification

## Binary Classification


In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Binary Classification Accuracy:", accuracy_score(y_test, predictions))


Binary Classification Accuracy: 0.9649122807017544


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Multiclass Classification

In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Multiclass Classification Accuracy:", accuracy_score(y_test, predictions))


Multiclass Classification Accuracy: 1.0


## Multilabel Classification

This strategy consists of fitting one classifier per target. This is a simple strategy for extending classifiers that do not natively support multi-target classification.

In [12]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Generate synthetic data
X, y = make_multilabel_classification(n_samples=1000, n_features=20, n_classes=3, n_labels=2, random_state=42)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a MultiOutputClassifier with DecisionTreeClassifier
model = MultiOutputClassifier(DecisionTreeClassifier())
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Multilabel Classification Accuracy:", accuracy_score(y_test, predictions))


Multilabel Classification Accuracy: 0.405


## Multiclass-Multilabel Classification



In [14]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate a synthetic multilabel dataset
X, y = make_multilabel_classification(n_samples=1000, n_features=20, n_classes=5, n_labels=3, random_state=42)

# Each label can belong to multiple classes, hence multiclass-multilabel
# Here, 'y' will be a 2D array where each row has multiple labels, and each label can have multiple classes.

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use MultiOutputClassifier with RandomForestClassifier
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100))

# Train model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model - Using accuracy here for simplicity, but in real scenarios, consider more appropriate metrics
accuracy = accuracy_score(y_test, predictions)
print(f"Multiclass-Multilabel Classification Accuracy: {accuracy}")


Multiclass-Multilabel Classification Accuracy: 0.35


## Hierarchical Classification Example


# on-hot-encoding

In [1]:
pip install pandas



In [2]:
import pandas as pd

# Example dataset
data = {'color': ['blue', 'green', 'red', 'green']}
df = pd.DataFrame(data)

# Applying one-hot encoding
encoded_df = pd.get_dummies(df, columns=['color'])

print(encoded_df)

   color_blue  color_green  color_red
0           1            0          0
1           0            1          0
2           0            0          1
3           0            1          0


## KNNImputer

In [3]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

# Example data with missing values
data = {
    'Age': [25, np.nan, 27, 29, 30],
    'Salary': [50000, 55000, np.nan, 60000, 65000]
}
df = pd.DataFrame(data)

# Create KNNImputer instance
imputer = KNNImputer(n_neighbors=2)

# Fit the imputer and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the imputed data back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print(df_imputed)


    Age   Salary
0  25.0  50000.0
1  27.0  55000.0
2  27.0  55000.0
3  29.0  60000.0
4  30.0  65000.0
