<a href="https://colab.research.google.com/github/mikakia/Fertility/blob/main/Fertility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import  missingno as msno

import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier




#Exploring dataset - Preprocessing

In [None]:
# Read the fertility data from the GitHub raw URL. It is parsed with
# header=None, so the column labels are supplied by hand.
url = "https://raw.githubusercontent.com/mikakia/Fertility/main/fertilitydataset.txt"
df = pd.read_csv(
    url,
    sep=",",
    header=None,
    names=[
        "season",
        "age",
        "childish_diseases",
        "accident_trauma",
        "sergical_intervention",
        "fever",
        "alcohol_freq",
        "smoking",
        "hours_sitting",
        "diagnosis",
    ],
)
df

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics of the dataset's columns.
df.describe()

In [None]:
# Pairwise correlation matrix, restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:
# Pairplots: every feature against every other, coloured by diagnosis,
# with histograms on the diagonal.
numeric_cols = [
    "season",
    "age",
    "childish_diseases",
    "accident_trauma",
    "sergical_intervention",
    "fever",
    "alcohol_freq",
    "smoking",
    "hours_sitting",
]

sns.pairplot(
    df,
    vars=numeric_cols,
    hue='diagnosis',
    diag_kind='hist',
    palette='Set1',
)
plt.show()

In [None]:
# Replace the diagnosis labels with integer codes, in place.
# Intended mapping: 0 -> Normal (N), 1 -> Altered (O).
# NOTE(review): LabelEncoder assigns codes in sorted label order, so this
# mapping assumes the raw labels are exactly 'N' and 'O' — confirm.
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])
df.head()

In [None]:
# Number of samples per encoded diagnosis class (shows the class balance).
df['diagnosis'].value_counts()

#Training and Test Phase

In [None]:
# Separate the target column from the predictors.
y = df['diagnosis']                # target
X = df.drop(columns='diagnosis')   # every remaining column is a feature

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Balance the imbalanced training data with SMOTE.
# Only the training split is resampled; X_test / y_test are left untouched.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

##Logistic Regression
Accuracy: 65%

In [None]:
# class_weight='balanced' is used because the data is imbalanced; this model
# is therefore fitted on the original training split, not the SMOTE-resampled one.
log_reg = LogisticRegression(
    class_weight='balanced',
    random_state=42,
)
log_reg.fit(X_train, y_train)

In [None]:
# Score the logistic regression on the held-out test split.
y_pred_log = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_log))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_log),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, y_pred_log),
)

##Decision Tree
Accuracy: 85%

In [None]:
# Decision tree with capped depth and a minimum leaf size, fitted on the
# SMOTE-resampled training data.
des_tree = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
)
des_tree.fit(X_train_res, y_train_res)


In [None]:
# Score the decision tree on the held-out test split.
y_pred_tree = des_tree.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_tree))
# sep="\n" puts each heading on its own line above its report.
print("Classification Report:", classification_report(y_test, y_pred_tree), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_tree), sep="\n")

In [None]:
# Show the first 20 test labels next to the decision tree's predictions.
# The loop variables get their own names: reusing `y_pred_tree` as the loop
# variable would rebind the predictions array to a single scalar, breaking
# any later cell (or a re-run of this one) that reads `y_pred_tree`.
for true_label, predicted_label in zip(y_test.iloc[:20], y_pred_tree[:20]):
    print(f"True: {true_label} → Predicted: {predicted_label}")

##Random Forest
Accuracy: 85%

In [None]:
# Random forest with default hyper-parameters (seeded), fitted on the
# SMOTE-resampled training data.
rand_fo = RandomForestClassifier(
    random_state=42,
)
rand_fo.fit(X_train_res, y_train_res)

In [None]:
# Score the random forest on the held-out test split.
y_pred_rf = rand_fo.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
# sep="\n" puts each heading on its own line above its report.
print("Classification Report:", classification_report(y_test, y_pred_rf), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")

##Gradient Boosting
Accuracy: 85%

In [None]:
# Gradient boosting: 40 depth-3 trees with a 0.05 learning rate, fitted on
# the SMOTE-resampled training data.
gb_model = GradientBoostingClassifier(
    n_estimators=40,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gb_model.fit(X_train_res, y_train_res)

In [None]:
# Score the gradient boosting model on the held-out test split.
y_pred_gb = gb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))
print("Confusion Matrix:")
# Use this model's own predictions: the matrix was previously computed from
# the random forest's `y_pred_rf`, so it did not describe gradient boosting.
print(confusion_matrix(y_test, y_pred_gb))

##Neural Network
Accuracy: 75%

In [None]:

# Multi-layer perceptron with one hidden layer of 100 tanh units, trained
# for at most 300 iterations on the SMOTE-resampled training data.
nn_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='tanh',
    max_iter=300,
    random_state=42,
)
nn_model.fit(X_train_res, y_train_res)

In [None]:
# Score the neural network on the held-out test split.
y_pred_nn = nn_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Classification Report:\n", classification_report(y_test, y_pred_nn))
# Label corrected from the misspelled "Confusion Matric" so the output reads
# like the other models' reports.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nn))

##KNN
Accuracy: 80%

In [None]:
# k-nearest neighbours (k=4, Manhattan distance), fitted on the
# SMOTE-resampled training data.
knn_model = KNeighborsClassifier(
    n_neighbors=4,
    metric='manhattan',
)
knn_model.fit(X_train_res, y_train_res)

In [None]:
# Score the KNN model on the held-out test split.
y_pred_knn = knn_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))
# Label corrected from the misspelled "Confusion Matric" so the output reads
# like the other models' reports.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

###KNN with kfolds
Accuracy: 88%

In [None]:
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_val_predict,
)

# 5 shuffled folds with a fixed seed so the fold assignment is reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Cross-validate the KNN configuration with the folds defined by `kf`,
# scoring each fold by accuracy.
# NOTE(review): this runs on the full, un-resampled X / y, whereas the KNN
# above was fitted on the SMOTE-resampled training split — the two accuracy
# figures are therefore not directly comparable.
scores = cross_val_score(knn_model, X, y, cv=kf, scoring='accuracy')

print("Accuracy for each fold:", scores)
print("Mean accuracy:", scores.mean())