-
Notifications
You must be signed in to change notification settings - Fork 0
/
diabete_prediction.R
146 lines (100 loc) · 3.23 KB
/
diabete_prediction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
title: "R Notebook"
output: html_notebook
---
### Question 1 — Load the dataset and summarize it
```{r}
# 1. Load the diabetes dataset and take a first look at it.
# readr is attached here, but the file is read with base R's read.csv(),
# so `data` is a plain data.frame.
library(readr)
# Load the dataset
# NOTE(review): this is an absolute, machine-specific path -- the notebook
# will not run elsewhere until it points at a local copy of diabetes.csv.
data <- read.csv("/Users/nishabled/Desktop/Ms NEU/DA5020/diabetes.csv")
# Preview the first rows of the loaded data frame.
head(data)
# Summary statistics for every column
summary(data)
```
### Question 2 — Min-max normalize the explanatory variables
```{r}
# 2. Min-max normalize the explanatory variables.
library(tidyverse)
# Rescale every predictor to [0, 1]. Outcome is dropped first so the response
# keeps its original values.
# NOTE: a constant column (max == min) would divide by zero and yield NaN.
normalized_diabetes <- data %>%
  select(-Outcome) %>%
  mutate(across(everything(), ~ (.x - min(.x)) / (max(.x) - min(.x))))
# Re-attach the response as the LAST column (later code locates the label by
# position). The loaded data frame is `data`, and the column is named
# explicitly so it stays `Outcome` instead of being named after the expression.
normalized_diabetes <- cbind(normalized_diabetes, Outcome = data$Outcome)
# Inspect the first few rows of the normalized dataset
head(normalized_diabetes)
# Summary statistics of the normalized dataset
summary(normalized_diabetes)
```
```{r}
# 3. Split the normalized data into an 80% training and 20% test set.
# Set seed for reproducibility
set.seed(123)
# The normalized data frame is called `normalized_diabetes`.
n_obs <- nrow(normalized_diabetes)
# Sample 80% of the row indices for the training set. floor() makes the
# truncation of the fractional size explicit; seq_len() is safe for n_obs == 0.
train_index <- sample(seq_len(n_obs), floor(0.8 * n_obs))
# Create training set
train_data <- normalized_diabetes[train_index, ]
# Create test set from the remaining rows
test_data <- normalized_diabetes[-train_index, ]
# Check the dimensions of training and test sets
dim(train_data)
dim(test_data)
```
```{r}
#4.
# Euclidean (straight-line) distance between two equally shaped numeric
# inputs: the square root of the sum of squared element-wise differences.
euclidean_distance <- function(x1, x2) {
  squared_gaps <- (x1 - x2)^2
  sqrt(sum(squared_gaps))
}
# k-nearest-neighbours classifier for a binary label.
#
# Arguments:
#   train.data  data frame or matrix of numeric columns; the LAST column is
#               the class label, all other columns are features.
#               (assumes labels are coded 0/1 -- the vote below sums them)
#   test.data   same column layout as train.data; its last column is ignored
#               for prediction.
#   k           number of neighbours to consult, 1 <= k <= nrow(train.data).
#
# Returns a numeric vector with one prediction (0 or 1) per row of test.data;
# numeric(0) when test.data has no rows. A tie in the vote (sum == k / 2)
# is resolved in favour of class 1.
knn_predict <- function(train.data, test.data, k) {
  n_train <- nrow(train.data)
  label_col <- ncol(train.data)
  stopifnot(
    is.numeric(k), length(k) == 1L, !is.na(k),
    k >= 1, k <= n_train,
    label_col >= 2L,
    ncol(test.data) == label_col
  )

  n_test <- nrow(test.data)
  if (n_test == 0L) {
    return(numeric(0))
  }

  # Work on matrices: row arithmetic on data frames is slow, and
  # drop = FALSE keeps a single feature column two-dimensional.
  train_mat <- as.matrix(train.data)
  test_mat <- as.matrix(test.data)
  stopifnot(is.numeric(train_mat), is.numeric(test_mat))
  train_x <- train_mat[, -label_col, drop = FALSE]
  train_y <- train_mat[, label_col]
  test_x <- test_mat[, -label_col, drop = FALSE]

  # Preallocate instead of growing the result inside the loop.
  predictions <- numeric(n_test)
  for (i in seq_len(n_test)) {
    # Euclidean distance from test row i to every training row at once.
    gaps <- sweep(train_x, 2, test_x[i, ])
    distances <- sqrt(rowSums(gaps^2))
    # order() is stable, so equal distances keep training-row order.
    nearest <- order(distances)[seq_len(k)]
    # Majority vote over the 0/1 labels of the k nearest neighbours.
    predictions[i] <- if (sum(train_y[nearest]) >= k / 2) 1 else 0
  }
  predictions
}
```
```{r}
# 5. Classify the test set with k = 6 and evaluate the result.
# The predictions get their own name so the knn_predict() function is not
# overwritten by its own result and stays callable afterwards.
predicted_classes <- knn_predict(
  train.data = train_data,
  test.data = test_data,
  k = 6
)
cat("knn_predict", "\n", predicted_classes, "\n")
# Analyze the results using a confusion matrix. Both factors share one set of
# levels so the table stays square even if only one class is predicted;
# otherwise diag() below would not pick out the correct cells.
actual_classes <- test_data[, ncol(test_data)]
class_levels <- sort(unique(c(actual_classes, predicted_classes)))
conf_matrix <- table(
  Actual = factor(actual_classes, levels = class_levels),
  Predicted = factor(predicted_classes, levels = class_levels)
)
# print() keeps the row/column labels that cat() would flatten away.
cat("conf_matrix k=6:", "\n")
print(conf_matrix)
# Calculate accuracy: correctly classified observations / all observations
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("acuracy k=6:", accuracy, "\n")
```
```{r}
# 6. Test-set mean squared error of the kNN classifier for a range of k.
library(ggplot2)
# K values: odd numbers from 1 to 39
k_values <- seq(1, 40, by = 2)
# Vector to store MSE for all k
mse_values <- numeric(length(k_values))
# True labels sit in the last column of the test set.
actual_classes <- test_data[, ncol(test_data)]
# Loop for k
for (i in seq_along(k_values)) {
  k <- k_values[i]
  # knn_predict() returns one prediction per test row, so it has to be
  # reduced to a single error value before being stored in mse_values[i].
  predicted <- knn_predict(train.data = train_data, test.data = test_data, k = k)
  mse_values[i] <- mean((actual_classes - predicted)^2)
}
# k values and their MSEs
data_frame_k_mse <- data.frame(k = k_values, MSE = mse_values)
print(data_frame_k_mse)
```