In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# --- Load and prepare the data ---------------------------------------------
data_path = "Cleaned_Dataset.csv"
cleaned_data = pd.read_csv(data_path)

# Age has to be numeric; entries that cannot be parsed become NaN ...
cleaned_data["RIDAGEYR"] = pd.to_numeric(
    cleaned_data["RIDAGEYR"],
    errors="coerce",
)
# ... and rows with a missing age or a missing gender are dropped in place.
cleaned_data.dropna(subset=["RIDAGEYR", "RIAGENDR"], inplace=True)

# Replace the gender column with integer class codes. The fitted encoder is
# kept so the codes can be translated back to the original labels.
label_encoder = LabelEncoder()
cleaned_data["RIAGENDR"] = label_encoder.fit_transform(cleaned_data["RIAGENDR"])

# Encoded target, and a single feature (age) kept as a one-column DataFrame.
y = cleaned_data["RIAGENDR"]
X = cleaned_data[["RIDAGEYR"]]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 5-nearest-neighbours classifier and fit it on the training portion
# (fit() hands back the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the encoded gender for the held-out rows and tally how often each
# class code was predicted.
y_pred = knn.predict(X_test)
predicted_counts = pd.Series(y_pred).value_counts()

# Original labels behind the encoded class codes 0 and 1.
categories = label_encoder.inverse_transform([0, 1])

# value_counts() orders its result by descending frequency and leaves out
# classes that were never predicted. Realign the tallies to class codes 0, 1
# (filling absent classes with 0) so every count is paired with its own label.
aligned_counts = predicted_counts.reindex([0, 1], fill_value=0)

# NOTE(review): the "(Female)"/"(Male)" text assumes LabelEncoder assigned
# code 0 to the female label and code 1 to the male label. LabelEncoder
# numbers classes in sorted order of the raw values, so confirm this against
# the actual contents of the RIAGENDR column.
print(f"Predicted Counts:\n{categories[0]} (Female): {aligned_counts.loc[0]}\n{categories[1]} (Male): {aligned_counts.loc[1]}")

# String tick labels give a categorical x-axis even when the original class
# labels are numeric.
bar_labels = [str(label) for label in categories]

plt.figure(figsize=(10, 5))
plt.bar(bar_labels, aligned_counts.to_numpy(), color=['blue', 'pink'])
plt.title("Predicted Gender Counts (KNN)")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

# Scatter of test-set age against the predicted class code; each point is
# coloured by its predicted class using the blue-white-red colormap.
fig, ax = plt.subplots(figsize=(10, 5))
points = ax.scatter(X_test, y_pred, c=y_pred, cmap='bwr', alpha=0.6)

ax.set_title("Scatter Plot of Age vs Predicted Gender (KNN)")
ax.set_xlabel("Age (RIDAGEYR)")
ax.set_ylabel("Predicted Gender (0: Female, 1: Male)")

# Colour scale for the class codes drawn above.
fig.colorbar(points, ax=ax, label="Gender")
plt.show()
