In [None]:
# The iris data set ships in the datasets package, which R attaches by default;
# loading it explicitly just makes the dependency visible.
# For a full list of these datasets, type library(help = "datasets")
library(datasets)
# Bring the iris data frame into the workspace, then show the per-column
# summary statistics (quartiles for measurements, counts for Species)
data("iris")
summary(iris)

In [None]:
# First five rows of the data set (iris has five columns, so all are shown)
head(iris, n = 5L)

In [None]:
# Number of observations recorded for each species
table(iris[["Species"]])

In [None]:
# Distribution of sepal length within each species.
# The formula is resolved against `data = iris`, so the terms are plain column
# names and the grouping axis is labelled "Species" rather than "iris$Species"
# (on R versions that label that axis automatically).
boxplot(Sepal.Length ~ Species, data = iris, ylab = "Sepal.Length")

In [None]:
# Distribution of sepal width within each species.
# The formula is resolved against `data = iris`, so the terms are plain column
# names and the grouping axis is labelled "Species" rather than "iris$Species"
# (on R versions that label that axis automatically).
boxplot(Sepal.Width ~ Species, data = iris, ylab = "Sepal.Width")

In [None]:
# Distribution of petal length within each species.
# The formula is resolved against `data = iris`, so the terms are plain column
# names and the grouping axis is labelled "Species" rather than "iris$Species"
# (on R versions that label that axis automatically).
boxplot(Petal.Length ~ Species, data = iris, ylab = "Petal.Length")

In [None]:
# Distribution of petal width within each species.
# The formula is resolved against `data = iris`, so the terms are plain column
# names and the grouping axis is labelled "Species" rather than "iris$Species"
# (on R versions that label that axis automatically).
boxplot(Petal.Width ~ Species, data = iris, ylab = "Petal.Width")

In [None]:
# Petal length against petal width, one colour per species.
# `col = iris$Species` colours each point by the integer code of its factor
# level, looked up in the active palette.
plot(
  iris$Petal.Length, iris$Petal.Width,
  col = iris$Species, pch = 16,
  xlab = "Petal.Length", ylab = "Petal.Width"
)

# The legend uses the same palette indices (1, 2, 3) as the points, so its
# colours always match the plot. Hard-coded colour names do not: palette
# entry 3 is not "green" in any R version, and entry 2 is no longer "red"
# since the default palette changed in R 4.0.
legend(
  x = "topleft",
  legend = levels(iris$Species),
  col = seq_len(nlevels(iris$Species)),
  pch = 16
)

In [None]:
# Calling plot() on the whole data frame draws a scatterplot matrix: every
# column is plotted against every other one (Species appears as numeric codes)
plot(iris)

In [None]:
# Scatterplot matrix of the four measurement columns only.
# Each point is a filled circle (pch 21) whose fill colour is picked by
# species: level 1 -> red, level 2 -> green3, level 3 -> blue.
pairs(
  iris[, 1:4],
  main = "Iris Data",
  pch = 21,
  bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)

In [None]:
# Work on a copy so the original iris data stays untouched
iris_copy <- iris

# R marks missing values with the symbol NA.
# Blank out a handful of measurements in three of the columns.
iris_copy[c(15, 20, 50, 67, 97, 118), "Sepal.Length"] <- NA
iris_copy[c(4, 80, 97, 106), "Sepal.Width"] <- NA
iris_copy[c(5, 17, 35, 49), "Petal.Length"] <- NA

# The summary now reports a count of NA's for those columns
summary(iris_copy)

In [None]:
# Start by counting how many missing values the table contains:
# is.na() flags every NA cell and sum() counts the TRUE flags
sum(is.na(iris_copy))

In [None]:
# The count confirms the 14 missing values we introduced (6 + 4 + 4)

# There are several ways to find the rows that contain NA's.
# First option: complete.cases() (see ?complete.cases). It returns TRUE for
# every row that has no NA, so negating it with ! selects exactly the rows
# that contain at least one NA.

rows_with_na <- !complete.cases(iris_copy)
iris_NA <- iris_copy[rows_with_na, ]
iris_NA

In [None]:
# 13 rows contain missing values (row 97 is missing two of them)

# Second option: count the TRUE values that is.na() gives for each row
# and keep the rows where that count is at least one

na_per_row <- rowSums(is.na(iris_copy))
iris_NA <- iris_copy[na_per_row >= 1, ]
iris_NA

In [None]:
# Sometimes we do not want to throw away a whole row just because one of
# its values is missing. Instead we replace the NA with a numerical value.

# A simple choice is the mean of the column, but the median is safer
# because outliers do not affect it.
# Better still, use the median of the species the flower belongs to.
# We do this one column (and one species) at a time.

in_species <- iris_copy$Species == "setosa"
fill_value <- median(iris_copy$Sepal.Length[in_species], na.rm = TRUE)
iris_copy$Sepal.Length[in_species & is.na(iris_copy$Sepal.Length)] <- fill_value

# Rows that still contain at least one NA
iris_NA <- iris_copy[!complete.cases(iris_copy), ]
iris_NA

In [None]:
# The previous step filled 3 NA's, so 11 remain.
# Next: Sepal.Length of the versicolor flowers, using that species' median.

in_species <- iris_copy$Species == "versicolor"
fill_value <- median(iris_copy$Sepal.Length[in_species], na.rm = TRUE)
iris_copy$Sepal.Length[in_species & is.na(iris_copy$Sepal.Length)] <- fill_value

# Rows that still contain at least one NA
iris_NA <- iris_copy[!complete.cases(iris_copy), ]
iris_NA

In [None]:
# The previous step filled 2 NA's, so 9 remain.
# Next: Sepal.Length of the virginica flowers, using that species' median.

in_species <- iris_copy$Species == "virginica"
fill_value <- median(iris_copy$Sepal.Length[in_species], na.rm = TRUE)
iris_copy$Sepal.Length[in_species & is.na(iris_copy$Sepal.Length)] <- fill_value

# Rows that still contain at least one NA
iris_NA <- iris_copy[!complete.cases(iris_copy), ]
iris_NA

In [None]:
# The previous step filled 1 NA, so 8 remain.
# Next: Sepal.Width of the setosa flowers, using that species' median.

in_species <- iris_copy$Species == "setosa"
fill_value <- median(iris_copy$Sepal.Width[in_species], na.rm = TRUE)
iris_copy$Sepal.Width[in_species & is.na(iris_copy$Sepal.Width)] <- fill_value

# Rows that still contain at least one NA
iris_NA <- iris_copy[!complete.cases(iris_copy), ]
iris_NA

In [None]:
# The previous step filled 1 NA, so 7 remain.
# Next: Petal.Length of the setosa flowers, using that species' median.
in_species <- iris_copy$Species == "setosa"
fill_value <- median(iris_copy$Petal.Length[in_species], na.rm = TRUE)
iris_copy$Petal.Length[in_species & is.na(iris_copy$Petal.Length)] <- fill_value

# Rows that still contain at least one NA
iris_NA <- iris_copy[!complete.cases(iris_copy), ]
iris_NA