# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, precision_recall_curve

# Read the diabetes dataset from disk
df_diabetes = pd.read_csv('Data.csv')

# Quick structural overview: dimensions first ...
print("Data Shape:", df_diabetes.shape)

# ... then a sample of rows, the column dtypes, and the null count per column,
# each printed under its own heading
for heading, summary in (
    ("First 5 Rows:", df_diabetes.head()),
    ("Data Types:", df_diabetes.dtypes),
    ("Missing Values:", df_diabetes.isnull().sum()),
):
    print(heading)
    print(summary)

# Visualize the count of each value in the 'Prediction' column.
# The column is selected by keyword: current seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
ax = sns.countplot(x='Prediction', data=df_diabetes)
ax.yaxis.grid()

# Strip the frame: hide the top/right spines outright ...
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
# ... and give the bottom/left spines an explicit line width of 0
for side in ('bottom', 'left'):
    ax.spines[side].set_linewidth(0)
plt.show()

# Feature-only copy of the data (target column removed) for the histograms
df_diabetes1 = df_diabetes.drop(columns='Prediction')

# Histograms of the feature columns
ax = df_diabetes1.hist(figsize=(10, 10))
plt.show()

# Pairplot of the full dataset, colored by the 'Prediction' class
ax = sns.pairplot(df_diabetes, hue='Prediction')
# Render the pairplot here rather than leaving it pending until the next
# plt.show() call made for an unrelated figure
plt.show()

# Define features (X) and target (y)
y = df_diabetes['Prediction']
X = df_diabetes.drop(columns='Prediction')

# Column labels for the scaled frames. They are assigned positionally, so this
# list must match the number and order of the feature columns in the dataset.
feature_names = ['Age', 'Pregnancy No', 'Weight', 'Height', 'BMI', 'Heredity']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features: fit on the training split only, then apply the same
# transformation to the test split
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names, index=X_test.index)

# Keep X as a scaled DataFrame (using the training-set statistics) so code
# that uses the full feature matrix still receives standardized values
X = pd.DataFrame(scaler.transform(X), columns=feature_names, index=X.index)

# Logistic Regression model: fit on the training split, predict the test split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Decision Tree model.
# A fixed seed makes the fitted tree (and therefore its reported accuracy)
# reproducible from run to run.
dectree = DecisionTreeClassifier(random_state=0)
dectree.fit(X_train, y_train)
y_pred_dectree = dectree.predict(X_test)

# K-Nearest Neighbors model: sweep k = 1..14 and record, for each k, the
# accuracy on the training split and on the test split
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # score() returns the mean accuracy on the given data and labels
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

# Plot k-NN Accuracy for different numbers of neighbors
plt.title('k-NN Accuracy for different number of neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

# Final k-NN classifier with 13 neighbors; fit() returns the estimator itself,
# so construction and training are chained
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Random Forest: 1000 trees with a fixed seed, trained on the training split
# and used to predict the test split
ranfor = RandomForestClassifier(random_state=0, n_estimators=1000).fit(X_train, y_train)
y_pred_ranfor = ranfor.predict(X_test)

# AdaBoost model with 1000 boosting rounds.
# A fixed seed keeps the fitted ensemble, and the accuracy reported for it,
# reproducible between runs.
abc = AdaBoostClassifier(n_estimators=1000, random_state=0)
abc.fit(X_train, y_train)
y_pred_abc = abc.predict(X_test)

# (name, estimator) pairs shared by both voting ensembles below
base_estimators = [
    ('logreg', logreg),
    ('dectree', dectree),
    ('ranfor', ranfor),
    ('knn', knn),
    ('abc', abc),
]

# Voting Classifier without weights (soft voting)
vc = VotingClassifier(estimators=list(base_estimators), voting='soft')
vc.fit(X_train, y_train)
y_pred_vc = vc.predict(X_test)

# Voting Classifier with weights, given in the same order as base_estimators
member_weights = [2, 1, 2, 2, 1]
vc1 = VotingClassifier(estimators=list(base_estimators), voting='soft', weights=member_weights)
vc1.fit(X_train, y_train)
y_pred_vc1 = vc1.predict(X_test)

# Model Accuracy report: every score is printed as a percentage of correctly
# classified test rows, rounded to 2 decimals
def _accuracy_pct(y_pred):
    """Return the accuracy of *y_pred* against y_test as a percentage (2 decimals)."""
    return round(accuracy_score(y_test, y_pred) * 100, 2)


print('Model Accuracy')
print('\n')
print('Logistic Regression:', _accuracy_pct(y_pred_logreg), '%')
print('Decision Tree:', _accuracy_pct(y_pred_dectree), '%')
print('KNN:', _accuracy_pct(y_pred_knn), '%')
print('\n')
print('Averaging Method')
print('Random Forest:', _accuracy_pct(y_pred_ranfor), '%')
print('\n')
print('Boosting Method')
print('AdaBoost:', _accuracy_pct(y_pred_abc), '%')
print('\n')
print('Voting Classifiers')
# Reported on the same percentage scale as the other models; rounding the raw
# 0..1 accuracy to an integer would only ever print 0 or 1
print('Voting Classifier without Weights:', _accuracy_pct(y_pred_vc), '%')
