# Install Requirements

In [None]:
!pip install numpy
!pip install scikit-learn

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# Assuming X is your features and y is your labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# K-Fold

In [7]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load a sample dataset for demonstration (Iris dataset)
data = load_iris()
X, y = data.data, data.target

# Number of folds
k = 5

# Create KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize an array to store accuracy for each fold
accuracies = []

# K-Fold Cross-Validation
for train_index, test_index in kf.split(X):
    # Split data into training and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and train the model
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(accuracy)
# Calculate average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {average_accuracy}")


1.0
0.9666666666666667
0.9333333333333333
0.9333333333333333
0.9333333333333333
Average Accuracy: 0.9533333333333335


In [5]:
kf

KFold(n_splits=5, random_state=42, shuffle=True)

# Classification

## Binary Classification


In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Binary Classification Accuracy:", accuracy_score(y_test, predictions))


Binary Classification Accuracy: 0.9649122807017544


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Multiclass Classification

In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Multiclass Classification Accuracy:", accuracy_score(y_test, predictions))


Multiclass Classification Accuracy: 1.0


## Multilabel Classification

In [12]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Generate synthetic data
X, y = make_multilabel_classification(n_samples=1000, n_features=20, n_classes=3, n_labels=2, random_state=42)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a MultiOutputClassifier with DecisionTreeClassifier
model = MultiOutputClassifier(DecisionTreeClassifier())
model.fit(X_train, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test)
print("Multilabel Classification Accuracy:", accuracy_score(y_test, predictions))


Multilabel Classification Accuracy: 0.405


## Multiclass-Multilabel Classification



In [14]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate a synthetic multilabel dataset
X, y = make_multilabel_classification(n_samples=1000, n_features=20, n_classes=5, n_labels=3, random_state=42)

# Each label can belong to multiple classes, hence multiclass-multilabel
# Here, 'y' will be a 2D array where each row has multiple labels, and each label can have multiple classes.

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use MultiOutputClassifier with RandomForestClassifier
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100))

# Train model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model - Using accuracy here for simplicity, but in real scenarios, consider more appropriate metrics
accuracy = accuracy_score(y_test, predictions)
print(f"Multiclass-Multilabel Classification Accuracy: {accuracy}")


Multiclass-Multilabel Classification Accuracy: 0.35


## Imbalanced Classification

In [15]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Create an imbalanced dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=42)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to generate synthetic samples for balancing
smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train a RandomForest model on the balanced dataset
model = RandomForestClassifier()
model.fit(X_train_balanced, y_train_balanced)

# Make predictions and evaluate
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00       187

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



## Hierarchical Classification Example


In [16]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Function to generate synthetic data
def generate_data():
    # Level 1 data (Animal vs Plant)
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_classes=2, random_state=42)
    # Level 2 data (Mammal vs Bird), only for 'Animal' class (assumed to be labeled as '1')
    X_sub, y_sub = make_classification(n_samples=np.sum(y), n_features=10, n_informative=5, n_classes=2, random_state=42)
    return X, y, X_sub, y_sub

# Generate synthetic dataset
X, y, X_sub, y_sub = generate_data()

# Split Level 1 data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Level 1 classifier (Animal vs Plant)
model_level_1 = RandomForestClassifier()
model_level_1.fit(X_train, y_train)

# Predict Level 1 on test data
predictions_level_1 = model_level_1.predict(X_test)

# Prepare Level 2 data (only for 'Animal' class)
X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(X_sub, y_sub, test_size=0.3, random_state=42)

# Train Level 2 classifier (Mammal vs Bird)
model_level_2 = RandomForestClassifier()
model_level_2.fit(X_sub_train, y_sub_train)

# Predict Level 2 for samples classified as 'Animal' in Level 1
animal_indices = np.where(predictions_level_1 == 1)  # Assuming '1' is for 'Animal'
predictions_level_2 = np.zeros_like(predictions_level_1)
predictions_level_2[animal_indices] = model_level_2.predict(X_test[animal_indices])

# predictions_level_1 contains 'Animal' vs 'Plant'
# predictions_level_2 contains 'Mammal' vs 'Bird' for 'Animal' class and zeros for 'Plant'

# Note: In a real dataset, you would use actual hierarchical labels and potentially have more levels and categories.


# on-hot-encoding

In [1]:
pip install pandas



In [2]:
import pandas as pd

# Example dataset
data = {'color': ['blue', 'green', 'red', 'green']}
df = pd.DataFrame(data)

# Applying one-hot encoding
encoded_df = pd.get_dummies(df, columns=['color'])

print(encoded_df)

   color_blue  color_green  color_red
0           1            0          0
1           0            1          0
2           0            0          1
3           0            1          0


## KNNImputer

In [3]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

# Example data with missing values
data = {
    'Age': [25, np.nan, 27, 29, 30],
    'Salary': [50000, 55000, np.nan, 60000, 65000]
}
df = pd.DataFrame(data)

# Create KNNImputer instance
imputer = KNNImputer(n_neighbors=2)

# Fit the imputer and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the imputed data back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print(df_imputed)


    Age   Salary
0  25.0  50000.0
1  27.0  55000.0
2  27.0  55000.0
3  29.0  60000.0
4  30.0  65000.0
