<a href="https://colab.research.google.com/github/mohsina680/BuildablesDataScienceFellowship/blob/main/Task12_DS_FellowShip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Task 12 - Classification Algorithms II

# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# 1. Employee Attrition - Logistic Regression

hr = pd.read_csv("ibm-hr-analytics-attrition.csv")
hr = hr.fillna(hr.mean(numeric_only=True))

for col in hr.select_dtypes(include="object"):
    hr[col] = LabelEncoder().fit_transform(hr[col])

X = hr.drop("Attrition", axis=1)
y = hr["Attrition"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Employee Attrition - Logistic Regression")
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))

Employee Attrition - Logistic Regression
Precision: 0.6842105263157895
Recall: 0.3333333333333333
F1: 0.4482758620689655


In [25]:

# 2. Heart Disease - KNN

heart = pd.read_csv("heart.csv")
heart = heart.fillna(heart.mean(numeric_only=True))
for col in heart.select_dtypes(include="object"):
    heart[col] = LabelEncoder().fit_transform(heart[col])

X = heart.drop("target", axis=1)
y = heart["target"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Heart Disease - KNN")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))


Heart Disease - KNN
Accuracy: 0.8341463414634146
ROC-AUC: 0.8338568437083572


In [26]:
# 3. Hospital Readmission - Logistic Regression

hosp = pd.read_csv("hospital_readmissions.csv")
for col in hosp.select_dtypes(include="object"):
    hosp[col] = hosp[col].fillna(hosp[col].mode()[0])
    hosp[col] = LabelEncoder().fit_transform(hosp[col])

X = hosp.drop("readmitted", axis=1)
y = hosp["readmitted"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Hospital Readmission - Logistic Regression")
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall:", recall_score(y_test, y_pred, average="macro"))
print("F1:", f1_score(y_test, y_pred, average="macro"))



Hospital Readmission - Logistic Regression
Precision: 0.6146121547497694
Recall: 0.5970995187818995
F1: 0.5875319652172504


In [27]:
# 4. Credit Card Fraud - Decision Tree

fraud = pd.read_csv("creditcard.csv")
fraud = fraud.fillna(fraud.mean(numeric_only=True))

X = fraud.drop("Class", axis=1)
y = fraud["Class"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Credit Card Fraud - Decision Tree")
print("ROC-AUC:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Credit Card Fraud - Decision Tree
ROC-AUC: 0.9079258567523802
Confusion Matrix:
 [[56837    27]
 [   18    80]]


In [28]:
# 5. Wine Quality - Decision Tree

# Read with semicolon delimiter
wine = pd.read_csv("winequality-red.csv", sep=";")
wine = wine.fillna(wine.mean(numeric_only=True))

X = wine.drop("quality", axis=1)
y = wine["quality"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Wine Quality - Decision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test), multi_class="ovr"))



Wine Quality - Decision Tree
Accuracy: 0.5625
ROC-AUC: 0.5937578399776532


In [29]:
# 6. SMS Spam - Naive Bayes


sms = pd.read_csv("spam.csv", encoding="latin-1")
sms = sms.rename(columns={sms.columns[0]: "label", sms.columns[1]: "message"})
sms = sms[["label", "message"]]

sms["label"] = LabelEncoder().fit_transform(sms["label"])

X = sms["message"]
y = sms["label"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("SMS Spam - Naive Bayes")
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))

SMS Spam - Naive Bayes
Precision: 1.0
Recall: 0.72
F1: 0.8372093023255814
ROC-AUC: 0.86


In [30]:
# 7. Diabetes - Random Forest


diab = pd.read_csv("diabetes.csv")
diab = diab.fillna(diab.mean(numeric_only=True))

X = diab.drop("Outcome", axis=1)
y = diab["Outcome"]

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Diabetes - Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))
print("Feature Importances:", model.feature_importances_)


Diabetes - Random Forest
Accuracy: 0.7337662337662337
ROC-AUC: 0.7080808080808081
Feature Importances: [0.08357648 0.25799424 0.08965987 0.06918187 0.07747954 0.15991547
 0.12011654 0.142076  ]


In [31]:
# 8. Iris - SVM (Kaggle version)

iris = pd.read_csv("iris.csv")
iris = iris.fillna(iris.mean(numeric_only=True))

X = iris.drop("Species", axis=1)
y = iris["Species"]

X = StandardScaler().fit_transform(X)
y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Iris - SVM")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall:", recall_score(y_test, y_pred, average="macro"))
print("F1:", f1_score(y_test, y_pred, average="macro"))


Iris - SVM
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0


In [32]:
# 9. Breast Cancer - KNN (Kaggle version)

cancer = pd.read_csv("breast_cancer.csv")

print(cancer.shape)
print(cancer.columns)


# Drop unnecessary columns
cancer = cancer.drop(["id", "Unnamed: 32"], axis=1)

# Features and target
X = cancer.drop("diagnosis", axis=1)
y = cancer["diagnosis"]

# Encode labels (M = malignant, B = benign)
y = LabelEncoder().fit_transform(y)

# Scale features
X = StandardScaler().fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Results
print("Breast Cancer - KNN")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


(569, 33)
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
Breast Cancer - KNN
Accuracy: 0.9473684210526315
Confusion Matrix:
 [[68  3]
 [ 3 40]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95      