In [1]:
%history

%history


In [2]:
%history -g -f lab1_rec.py


In [3]:
%history

%history -g -f lab1_rec.ipynb
%history -g -f lab1_rec.py
%history


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC

In [None]:
# Load the examination records from the CSV in the working directory.
data = pd.read_csv('medical_examination.csv')

# info() prints every column name together with its dtype and non-null count.
# The former bare `data.columns` expression was dropped: it was not the last
# expression of the cell, so Jupyter never displayed it, and info() already
# lists the columns.
data.info()

# Last expression of the cell, so it is rendered as (n_rows, n_columns).
data.shape

In [None]:
# Count the missing values in every column of the dataset.
missing_values = data.isnull().sum()

# Report only the affected columns. Jupyter renders a bare expression only when
# it is the last top-level statement of a cell, so inside this if/else the
# result has to be printed explicitly or nothing would be shown.
if missing_values.any():
    missing_values = missing_values[missing_values > 0]
    print(missing_values)
else:
    print("No missing values")

In [None]:
# Correlation matrix of the raw `data` frame (pandas default method).
corr_matrix = data.corr()

# Render it as an annotated heatmap with square cells on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar=True,
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Medical Examination Data')
plt.show()

In [None]:
# Features shown in both figures below.
key_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']

# Figure 1: a 30-bin histogram per feature, placed on a 3x3 grid. Panels are
# added one by one so that only as many axes as features are created.
hist_fig = plt.figure(figsize=(12, 10))
for position, feature in enumerate(key_features, start=1):
    hist_ax = hist_fig.add_subplot(3, 3, position)
    hist_ax.hist(data[feature], bins=30, color='skyblue', edgecolor='black')
    hist_ax.set_title(f'Distribution of {feature}')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')
hist_fig.tight_layout()
plt.show()

# Figure 2: each feature's distribution split by the target column 'cardio'.
box_fig = plt.figure(figsize=(12, 10))
for position, feature in enumerate(key_features, start=1):
    box_ax = box_fig.add_subplot(3, 3, position)
    sns.boxplot(x='cardio', y=feature, data=data, ax=box_ax)
    box_ax.set_title(f'{feature} by Cardiovascular Disease')
    box_ax.set_xlabel('Cardiovascular Disease')
    box_ax.set_ylabel(feature)
box_fig.tight_layout()
plt.show()

In [None]:
# Columns that get standardised; every other column is carried over unchanged.
features_to_normalize = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']

# Fit the scaler on those columns of the full `data` frame and transform them.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[features_to_normalize])

# Write the scaled values into a copy so the raw `data` frame is not modified.
normalized_data = data.copy()
normalized_data[features_to_normalize] = scaled_values

# Preview the first rows of the result.
normalized_data.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) restricted to the
# columns that were standardised.
summary_statistics = normalized_data.loc[:, features_to_normalize].describe()

# Last expression of the cell: rendered as a table.
summary_statistics

In [None]:
# Splitting the data into training and testing sets.
#
# The target ('cardio') and the columns that are not used as predictors.
non_feature_columns = ['cardio', 'id', 'gender', 'smoke', 'alco', 'active']

# Full feature matrix / target, kept under their original names for any cell
# that reads them.
X = normalized_data.drop(columns=non_feature_columns)
y = normalized_data['cardio']

# Split the *unscaled* features. test_size and random_state are unchanged, so
# the same rows end up in the training and test partitions as before.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=non_feature_columns), y, test_size=0.3, random_state=42
)

# Standardise using statistics of the training rows only. Previously the
# features came from a scaler fitted on the whole dataset, which lets test-set
# means/variances leak into training and makes test scores optimistic.
split_scaler = StandardScaler().fit(X_train[features_to_normalize])

# Copy before assigning so the frames returned by the split are not modified
# through a view (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features_to_normalize] = split_scaler.transform(X_train[features_to_normalize])
X_test[features_to_normalize] = split_scaler.transform(X_test[features_to_normalize])

In [None]:
# 1. k-Nearest Neighbors (kNN): 5-fold cross-validated grid search over a
#    deliberately small set of candidate k values.
knn_params = dict(n_neighbors=[3, 5, 7])
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
)
knn_grid.fit(X_train, y_train)

# Predictions of the fitted search object on the held-out split.
predict = knn_grid.predict(X_test)

In [None]:
# Evaluation of the grid-searched kNN on the test split. Printed in order:
# confusion matrix, per-class report, overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predict))

In [None]:
# Choosing the K value: misclassification rate on the test split for
# k = 1 .. 29, stored in order of increasing k.
# NOTE(review): k is judged on the same split that is later used for the final
# report; consider cross-validating on the training data instead.
error_rate = []

for i in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=i)
    pred_i = knn.fit(X_train, y_train).predict(X_test)
    # Share of test rows whose predicted label differs from the true label.
    error_rate.append((pred_i != y_test).mean())

In [None]:
# Plot the test error collected for each k.
fig, ax = plt.subplots(figsize=(10, 6))

# Derive the x-axis from the number of recorded errors instead of repeating the
# loop bounds, so the plot cannot fail with a length mismatch if the range of
# tried k values changes. Assumes the first entry belongs to k = 1.
k_values = range(1, len(error_rate) + 1)
ax.plot(k_values, error_rate, marker='o', markersize=10)

ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')
ax.grid()

# Explicit show(), consistent with the other plotting cells.
plt.show()

In [None]:
# Refit kNN with a fixed k = 15 (presumably read off the error-rate curve --
# TODO confirm) and predict the test split.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
pred = knn.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, pred))

In [None]:
# Decision tree with a fixed seed so the fitted tree is reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Keep the predictions instead of discarding them, and report the same metrics
# as for the kNN and SVM models so the classifiers can be compared.
tree_pred = decision_tree.predict(X_test)

print(confusion_matrix(y_test, tree_pred))
print(classification_report(y_test, tree_pred))

In [None]:
# Support vector classifier with an RBF kernel; C and gamma are set explicitly.
model = SVC(kernel='rbf', C=1.0, gamma=1.0)

# fit() returns the estimator itself, so training and prediction can be chained.
svm_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluation of the SVM on the test split: confusion matrix, per-class report
# and overall accuracy (same three metrics as the first kNN evaluation).
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))
# The last line used to print the confusion matrix of `pred`, i.e. the kNN
# predictions from an earlier cell, which is misleading in the SVM cell.
print(accuracy_score(y_test, svm_pred))