# 14 Manipulating Data

See: [Cookbook for R: Manipulating Data by Winston Chang](http://www.cookbook-r.com/Manipulating_data/) 

In [1]:
# Build a vector holding the integers 101..110 in random order
v <- sample(101:110)

# Ascending sort (sort()'s default direction)
sort(v)
#>  [1] 101 102 103 104 105 106 107 108 109 110

# Descending sort
sort(v, decreasing = TRUE)
#>  [1] 110 109 108 107 106 105 104 103 102 101

In [7]:
# plyr supplies arrange(), used below next to the base R equivalents
library(plyr)

# Example data frame: four rows with a numeric and a text column
df <- data.frame(
  id = 1:4,
  weight = c(20, 27, 24, 22),
  size = c("small", "large", "medium", "large")
)
df

# Sort rows by the weight column. Both forms give the same row order;
# arrange() renumbers the rows, base indexing keeps the original row names.
arrange(df, weight)            # plyr
df[with(df, order(weight)), ]  # base R
# Listing as printed by arrange():
#>   id weight   size
#> 1  1     20  small
#> 2  4     22  large
#> 3  3     24 medium
#> 4  2     27  large


# Sort rows by size first, breaking ties by weight
arrange(df, size, weight)            # plyr
df[with(df, order(size, weight)), ]  # base R
# Listing as printed by the base R form (original row names kept):
#>   id weight   size
#> 4  4     22  large
#> 2  2     27  large
#> 3  3     24 medium
#> 1  1     20  small


# Sort on every column, using them as keys from left to right.
# as.list(df) hands each column to order() as a separate argument.
df[do.call(order, as.list(df)), ]
# id is the first key and is already ascending, so the order is unchanged here

id,weight,size
1,20,small
2,27,large
3,24,medium
4,22,large


id,weight,size
1,20,small
4,22,large
3,24,medium
2,27,large


Unnamed: 0,id,weight,size
1,1,20,small
4,4,22,large
3,3,24,medium
2,2,27,large


id,weight,size
4,22,large
2,27,large
3,24,medium
1,20,small


Unnamed: 0,id,weight,size
4,4,22,large
2,2,27,large
3,3,24,medium
1,1,20,small


id,weight,size
1,20,small
2,27,large
3,24,medium
4,22,large


In [3]:
# Descending sort on the weight column -- three equivalent spellings
arrange(df, -weight)                              # plyr: negate the key
df[with(df, order(weight, decreasing = TRUE)), ]  # base R: decreasing flag
df[with(df, order(-weight)), ]                    # base R: negate the key
# Listing as printed by the base R forms (original row names kept):
#>   id weight   size
#> 2  2     27  large
#> 3  3     24 medium
#> 4  4     22  large
#> 1  1     20  small


# size ascending, then weight descending within each size
arrange(df, size, -weight)            # plyr
df[with(df, order(size, -weight)), ]  # base R
#>   id weight   size
#> 2  2     27  large
#> 4  4     22  large
#> 3  3     24 medium
#> 1  1     20  small


# size descending, then weight ascending within each size.
# size is not numeric (character, or a factor under older R defaults), so it
# cannot be negated directly; xtfrm() first maps it to numbers that sort the
# same way, and those can be negated.
arrange(df, -xtfrm(size), weight)            # plyr
df[with(df, order(-xtfrm(size), weight)), ]  # base R
#>   id weight   size
#> 1  1     20  small
#> 3  3     24 medium
#> 4  4     22  large
#> 2  2     27  large

id,weight,size
2,27,large
3,24,medium
4,22,large
1,20,small


Unnamed: 0,id,weight,size
2,2,27,large
3,3,24,medium
4,4,22,large
1,1,20,small


Unnamed: 0,id,weight,size
2,2,27,large
3,3,24,medium
4,4,22,large
1,1,20,small


id,weight,size
2,27,large
4,22,large
3,24,medium
1,20,small


Unnamed: 0,id,weight,size
2,2,27,large
4,4,22,large
3,3,24,medium
1,1,20,small


id,weight,size
1,20,small
3,24,medium
4,22,large
2,27,large


Unnamed: 0,id,weight,size
1,1,20,small
3,3,24,medium
4,4,22,large
2,2,27,large


In [3]:
# Sorting a vector and keeping the results
x <- sample(101:110)  # the integers 101..110 in random order
x

# Ascending copy (sort()'s default direction)
y <- sort(x)
y

# Descending copy
z <- sort(x, decreasing = TRUE)
z


In [6]:
# Sorting the rows of a data frame
library(plyr)  # supplies arrange()

df <- data.frame(
  id = 1:4,
  weight = c(20, 27, 24, 22),
  size = c("small", "large", "medium", "large")
)
df

# Two ways to order the rows by the weight column (same row order):
df[with(df, order(weight)), ]  # base R; original row names are kept
arrange(df, weight)            # plyr; rows are renumbered

id,weight,size
1,20,small
2,27,large
3,24,medium
4,22,large


Unnamed: 0,id,weight,size
1,1,20,small
4,4,22,large
3,3,24,medium
2,2,27,large


id,weight,size
1,20,small
4,22,large
3,24,medium
2,27,large


In [7]:
# Finding and dropping duplicate values in a vector
set.seed(158)  # fixed seed so the random draws are reproducible
x <- round(rnorm(20, mean = 10, sd = 5))
x

duplicated(x)             # TRUE where a value repeats an earlier element
x[duplicated(x)]          # the repeated occurrences themselves
unique(x[duplicated(x)])  # each repeated value listed once

# Two equivalent ways to drop the duplicates
unique(x)
x[!duplicated(x)]

In [8]:
# Finding and dropping duplicate rows in a data frame.
# The table is parsed from inline text; the first line supplies column names.
df <- read.table(header = TRUE, text = "
 label value
     A     4
     B     3
     C     6
     B     3
     B     1
     A     2
     A     4
     A     4
")

duplicated(df)               # TRUE for rows that repeat an earlier row
df[duplicated(df), ]         # the repeated rows themselves
unique(df[duplicated(df), ]) # each repeated row listed once

# Two equivalent ways to drop the duplicate rows
unique(df)
df[!duplicated(df), ]

Unnamed: 0,label,value
4,B,3
7,A,4
8,A,4


Unnamed: 0,label,value
4,B,3
7,A,4


Unnamed: 0,label,value
1,A,4
2,B,3
3,C,6
5,B,1
6,A,2


Unnamed: 0,label,value
1,A,4
2,B,3
3,C,6
5,B,1
6,A,2
