In [1]:
# Open the help page describing the built-in airquality data set.
help("airquality")

- `Ozone`: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island
- `Solar.R`: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park
- `Wind`: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport
- `Temp`: Maximum daily temperature in degrees Fahrenheit at LaGuardia Airport

In [2]:
# Compact overview of the data set: dimensions, column types, first values.
str(airquality)

In [3]:
# Per-column summary statistics; the NA counts per column are listed here too.
summary(airquality)

- Missing values (`NA`)
    - **Ozone**: 37, **Solar.R**: 7

In [4]:
# Work on a copy named `df`; the built-in `airquality` stays unmodified.
df <- airquality

## 1. Checking Missing-Value Patterns

In [5]:
# Total number of missing cells across all columns of df.
sum(is.na(df))

In [6]:
library(ggplot2)

In [9]:
# Draw a tile map showing where values are missing.
#
# Every column of `data_in` becomes one row of tiles and every observation one
# position on the x axis. Observed cells are drawn black, missing cells white.
# Columns are ordered by their number of observed values.
#
# Args:
#   data_in: A data frame, matrix or single vector to inspect.
#   title:   Optional plot title; NULL (the default) draws no title.
#
# Returns:
#   A ggplot object.
plot_Missing <- function(data_in, title = NULL) {
  # A single column selected with `[` arrives as a bare vector; coerce so the
  # rest of the function can always rely on two dimensions.
  data_in <- as.data.frame(data_in)

  # 1 = observed, 0 = missing.
  observed <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  # drop = FALSE keeps a one-column input as a data frame.
  observed <- observed[, order(colSums(observed)), drop = FALSE]

  # One row per (observation, column) cell, in column-major order so that it
  # lines up with as.vector(as.matrix(observed)).
  tiles <- expand.grid(
    x = seq_len(nrow(observed)),
    y = colnames(observed)
  )
  # Fix both levels so the colour mapping does not depend on which values
  # happen to be present in the data.
  tiles$m <- factor(as.vector(as.matrix(observed)), levels = c(0, 1))

  ggplot(tiles) +
    geom_tile(aes(x = x, y = y, fill = m)) +
    scale_fill_manual(
      # Named values: "0" (missing) is always white, "1" always black.
      values = c("0" = "white", "1" = "black"),
      name = "Missing\n(0=Yes, 1=No)"
    ) +
    theme_light() +
    ylab("") +
    xlab("") +
    ggtitle(title)
}

In [10]:
# Plot only the columns that contain at least one NA. drop = FALSE keeps the
# selection a data frame even if just one column qualifies.
has_na <- colSums(is.na(df)) > 0
plot_Missing(df[, has_na, drop = FALSE])

## 2. Imputation

### 2.1. Ozone

In [11]:
# Row permutation that sorts by Ozone from highest to lowest; order() puts
# NA values last by default.
o <- order(df$Ozone, decreasing = TRUE)
# Show the ten rows with the highest Ozone readings.
head(df[o, ], n = 10)

In [12]:
# Reorder df itself using the permutation `o`.
df <- df[o, ]

In [13]:
# Replace every missing Ozone value with the value found at position 10 of
# the Ozone column.
# NOTE(review): this depends on df's current row order, and the choice of
# position 10 looks arbitrary -- confirm it is intended.
ozone_missing <- is.na(df$Ozone)
df$Ozone[ozone_missing] <- df$Ozone[10]

In [14]:
# Inspect the last rows of df.
tail(df)

In [15]:
# Box plot of the Ozone column of df.
boxplot(df$Ozone)

In [20]:
# Mean of the original (non-imputed) Ozone values, ignoring NAs.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
m1 <- mean(airquality$Ozone, na.rm = TRUE)
m1

In [18]:
# Mean of the Ozone column of df, i.e. after imputation.
m2 <- mean(df$Ozone)
m2

In [21]:
# Absolute difference between the two means, rounded to two decimals.
abs(round(m1 - m2, digits = 2))

### 2.2. Solar.R

In [23]:
# Mean imputation: replace missing Solar.R values with the mean of the
# observed ones. TRUE is spelled out because `T` can be reassigned.
solar_mean <- mean(df$Solar.R, na.rm = TRUE)
df$Solar.R[is.na(df$Solar.R)] <- solar_mean

## 3. Scaling

### 3.1 Min-Max scaling

In [24]:
# Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
# maps to 1.
#
# Args:
#   x:     A numeric vector.
#   na.rm: If TRUE (the default), NAs are ignored when computing the range and
#          stay NA in the result. If FALSE, any NA makes the whole result NA.
#
# Returns:
#   A vector of the same length as `x`. A constant input yields zeros instead
#   of NaN; an empty or all-NA input is returned unchanged (as numeric).
min_max_norm <- function(x, na.rm = TRUE) {
  # Nothing to scale; also avoids the warnings range() emits on empty input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # A constant vector has zero span; return zeros rather than dividing by zero.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }

  (x - rng[1]) / span
}

In [25]:
# Min-max scale the first two columns of df (Ozone and Solar.R) and collect
# the results in a new data frame.
df_norm <- as.data.frame(
  lapply(df[c("Ozone", "Solar.R")], min_max_norm)
)

In [26]:
# Preview the first rows of the scaled data.
head(df_norm)

In [27]:
# Largest value of the first scaled column minus the smallest value of the
# second scaled column.
# NOTE(review): mixing two different columns here may be unintentional.
max(df_norm[[1]]) - min(df_norm[[2]])