In [48]:
import pandas as pd
import os

# Assuming 'Iris.csv' is in the same directory as the script
# If not, replace with the correct path
iris_path = '/content/Iris.csv'  # Or the full path if it's elsewhere

# Load the Iris dataset
try:
    iris_df = pd.read_csv(iris_path)
    # Display basic information and a preview of the dataset
    iris_info = iris_df.info()
    iris_head = iris_df.head()
    iris_info, iris_head
except FileNotFoundError:
    print(f"Error: File not found at {iris_path}. Please check the file path.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [49]:
print(iris_info)
print(iris_head)


None
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


# **Data Preprocessing**
Drop non-informative columns like Id.
Encode categorical variables (the Species column) if not numeric.
Scale the features using standardization for better performance in clustering and classification.

In [50]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
iris_df = pd.read_csv('/content/Iris.csv')

# Drop Id column
iris_df = iris_df.drop(columns=['Id'])

# Encode target column 'Species'
label_encoder = LabelEncoder()
iris_df['Species'] = label_encoder.fit_transform(iris_df['Species'])

# Separate features and target variable
X = iris_df.drop(columns=['Species'])
y = iris_df['Species']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [51]:
# Print labels dictionary
labels_dict = {index: label for index, label in enumerate(label_encoder.classes_)}
print("Labels dictionary:", labels_dict)


Labels dictionary: {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}


# **Apply Unsupervised Learning Techniques (Clustering)**
Use K-means and Hierarchical Clustering. Then validate using Silhouette Score and Davies-Bouldin Index.

In [52]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# K-means Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical_labels = hierarchical.fit_predict(X_scaled)

# Validation metrics
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
kmeans_davies_bouldin = davies_bouldin_score(X_scaled, kmeans_labels)
hierarchical_silhouette = silhouette_score(X_scaled, hierarchical_labels)
hierarchical_davies_bouldin = davies_bouldin_score(X_scaled, hierarchical_labels)

print("KMeans Silhouette Score:", kmeans_silhouette)
print("KMeans Davies-Bouldin Index:", kmeans_davies_bouldin)
print("Hierarchical Silhouette Score:", hierarchical_silhouette)
print("Hierarchical Davies-Bouldin Index:", hierarchical_davies_bouldin)


KMeans Silhouette Score: 0.4787241921049546
KMeans Davies-Bouldin Index: 0.7868006762339902
Hierarchical Silhouette Score: 0.44553956399200406
Hierarchical Davies-Bouldin Index: 0.805940469032997


# **Feature Selection**
You can use Variance Threshold or Recursive Feature Elimination (RFE) to select important features.

In [53]:
from sklearn.feature_selection import VarianceThreshold

# Apply Variance Threshold to remove low-variance features
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X_scaled)


# **Apply Supervised Learning Classifiers**
Choose five classifiers, potentially using boosting algorithms.
"Logistic Regression"
"Random Forest"
"Gradient Boosting"
"AdaBoost"
"K-Nearest Neighbors"

In [54]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"Results for {name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n")

# Select the best model based on accuracy
best_model_name = max(results, key=results.get)
best_model = classifiers[best_model_name]
print(f"Best model: {best_model_name} with accuracy {results[best_model_name]:.2f}")

# Save the best model to a file
joblib.dump(best_model, 'model.pkl')
print("Model saved as model.pkl")

Results for Logistic Regression:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



Results for Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg   

## **Titanic Survival Datasets  **                                  
Principal Component Analysis (PCA):                                                                          Import PCA:

1. Import PCA:

In [55]:
from sklearn.decomposition import PCA


2. Fit PCA:

Retain components that explain about 95% of the variance:

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic dataset - replace with your actual file path
titanic_data = pd.read_csv('/content/titanic.csv')  # Replace with the correct path

# Split the data into training and test sets
X = titanic_data.drop(columns=['Survived'])
y = titanic_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
#Identify non numeric columns
print(X_train.dtypes)


PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [58]:
#Remove Non-Numeric Columns
X = X.drop(columns=['Cabin'], errors='ignore')


In [59]:
# Reprocess missing values for numeric features only
numeric_features = X.select_dtypes(include=['number']).columns
X[numeric_features] = X[numeric_features].fillna(X[numeric_features].median())

In [60]:
#Proceed with Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [61]:
#Cleaned Preprocessing Code:
# Drop irrelevant columns and handle missing values
X = titanic_data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'], errors='ignore')
X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
X['Embarked'] = X['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

# Fill missing values with median
X.fillna(X.median(), inplace=True)

# Split the data into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [62]:
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


In [63]:
from sklearn.linear_model import LogisticRegression


In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression
logreg_pca = LogisticRegression(max_iter=1000, random_state=42)
logreg_pca.fit(X_train_pca, y_train)

# Predictions and Accuracy
y_pred_pca_logreg = logreg_pca.predict(X_test_pca)
accuracy_pca_logreg = accuracy_score(y_test, y_pred_pca_logreg)

print("Logistic Regression with PCA Accuracy:", accuracy_pca_logreg)


Logistic Regression with PCA Accuracy: 0.6071428571428571


The Logistic Regression with PCA Accuracy is 60.71%, which is significantly lower than the accuracy achieved with manual feature engineering (100%).     

Recursive Feature Elimination (RFE)
Let's apply Recursive Feature Elimination (RFE) to see if it can identify a subset of features that improves accuracy. I will now proceed to implement RFE.

In [65]:
from sklearn.feature_selection import RFE

# Initialize logistic regression as the base estimator for RFE
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=5)

# Fit RFE on the training data
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Train Logistic Regression on RFE-selected features
logreg_rfe = LogisticRegression(max_iter=1000, random_state=42)
logreg_rfe.fit(X_train_rfe, y_train)

# Predictions and Accuracy
y_pred_rfe_logreg = logreg_rfe.predict(X_test_rfe)
accuracy_rfe_logreg = accuracy_score(y_test, y_pred_rfe_logreg)

accuracy_rfe_logreg


1.0

Recursive Feature Elimination (RFE)

In [66]:
from sklearn.feature_selection import RFE


In [67]:
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)


In [68]:
# Import necessary libraries
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X and y are already defined (your features and target variables)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply RFE with Logistic Regression as estimator to rank features
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Train and evaluate Logistic Regression
logreg_rfe.fit(X_train_rfe, y_train)
y_pred_rfe_logreg = logreg_rfe.predict(X_test_rfe)
accuracy_rfe_logreg = accuracy_score(y_test, y_pred_rfe_logreg)
print(f"Logistic Regression Accuracy with RFE: {accuracy_rfe_logreg:.4f}")

# Train and evaluate Decision Tree
tree_rfe = DecisionTreeClassifier(random_state=42)
tree_rfe.fit(X_train_rfe, y_train)
y_pred_rfe_tree = tree_rfe.predict(X_test_rfe)
accuracy_rfe_tree = accuracy_score(y_test, y_pred_rfe_tree)
print(f"Decision Tree Accuracy with RFE: {accuracy_rfe_tree:.4f}")


Logistic Regression Accuracy with RFE: 1.0000
Decision Tree Accuracy with RFE: 1.0000
