In [1]:
# Basic string handling with base R
fourcities <- c(
  "Toronto",
  "Canberra",
  "New York",
  "London"
)
# Show the city names sorted alphabetically
sort(fourcities, decreasing = FALSE)
# Count the characters of the single string "Toronto"
nchar("Toronto", type = "chars")
# nchar() is vectorized: it returns one count per city
nchar(fourcities, type = "chars")

In [2]:
# This cell relies on the stringr package.
# See: https://cran.r-project.org/web/packages/stringr/vignettes/stringr.html
# stringr is probably already part of your R installation; if it is not,
# uncomment the next line to install it:
# install.packages("stringr", repos='http://cran.us.r-project.org')

# Packages are loaded with library(), see also require()
library(stringr)
str_length(string = fourcities) # Number of characters in each string

x <- c("abcdef", "ghifjk")
str_sub(x, start = 3, end = 3) # Extract the 3rd letter of each string

# From the 2nd character up to the 2nd-to-last character
str_sub(x, start = 2, end = -2)

# str_sub() can also be assigned to: note the insertion of "X"
str_sub(x, start = 3, end = 3) <- "X"
x

# Individual strings can be duplicated with str_dup():

str_dup(x, times = c(2, 3)) # interpret the result of this command

In [3]:
# More on strings: padding and truncating

# str_pad() brings a string to a fixed length by adding
# extra whitespace on the left, right, or both sides.
# Signature: str_pad(string, width, side = c("left", "right", "both"), pad = " ")

x <- c("abc", "defghi")
str_pad(x, width = 10)
str_pad(x, width = 10, side = "both")
str_pad(x, width = 10, side = "right", pad = "0") # Other pad characters work too

x <- c("Short", "This is a long string")

x %>%
  str_trunc(width = 10) %>%
  str_pad(width = 10, side = "right")

# The pipe operator, %>%, passes the value on its left as the first
# argument of the function call on its right.
# It is not a base feature of the language, so it can only be used
# after attaching a package that provides it.
# Exercise: try to get the same result by calling the functions step by step.
# Display the help page of a function with
# ?str_trunc
# (type the letter q to quit the help)
# What about that ellipsis (three dots)?

In [4]:
# More on strings: whitespace, letter case and locale-aware ordering

# str_trim() removes leading and trailing whitespace

x <- c("  a   ", "b   ", "   c")
str_trim(x)
str_trim(x, side = "left") # only the leading whitespace

# Use str_wrap() to modify existing whitespace in order
# to wrap a paragraph of text so that the length of
# each line is as similar as possible.

jabberwocky <- str_c(
  "`Twas brillig, and the slithy toves ",
  "did gyre and gimble in the wabe: ",
  "All mimsy were the borogoves, ",
  "and the mome raths outgrabe. "
)
# writeLines() ends the text with a newline, so whatever is printed next
# starts on a fresh line (a bare cat() would leave the last line open).
writeLines(str_wrap(jabberwocky, width = 20))

# Some str functions are locale sensitive (locale = language code)
# You can see a complete list of available locales by running
# stringi::stri_locale_list().

x <- "I like horses."
str_to_upper(x)
str_to_title(x)
str_to_lower(x)

# Turkish has two sorts of i: with and without the dot
str_to_lower(x, locale = "tr")

# And string ordering and sorting:
x <- c("y", "i", "k")
str_order(x)

str_sort(x)
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")

`Twas brillig, and
the slithy toves did
gyre and gimble in
the wabe: All mimsy
were the borogoves,
and the mome raths
outgrabe.

In [5]:
# Pattern matching with regular expressions

strings <- c(
  "apple",
  "219 733 8965",
  "329-293-8753",
  "Work: 579-499-7527; Home: 543.355.3679"
)
# Three captured digit groups (3, 3 and 4 digits) separated by a dash,
# a space or a dot; the first digit must be between 2 and 9
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

# Which strings contain phone numbers?
# Logical vector: does each string match?
str_detect(strings, pattern = phone)
# Only the elements that match
str_subset(strings, pattern = phone)
# How many phone numbers are in each string
str_count(strings, pattern = phone)
# Position of the first occurrence of the pattern
str_locate(strings, pattern = phone)
# Positions of all occurrences of the pattern
str_locate_all(strings, pattern = phone)

start,end
,
1.0,12.0
1.0,12.0
7.0,18.0


start,end

start,end
1,12

start,end
1,12

start,end
7,18
27,38


In [6]:
# More on pattern matching
# Exercise: try to figure out what each of these functions does
# You can always ask for help with ?function

str_match(strings, pattern = phone)
str_match_all(strings, pattern = phone)
str_replace(strings, pattern = phone, replacement = "XXX-XXX-XXXX")
str_replace_all(strings, pattern = phone, replacement = "XXX-XXX-XXXX")
str_split("a-b-c", pattern = "-")
str_split_fixed("a-b-c", pattern = "-", n = 2)

0,1,2,3
,,,
219 733 8965,219.0,733.0,8965.0
329-293-8753,329.0,293.0,8753.0
579-499-7527,579.0,499.0,7527.0


0,1,2,3
219 733 8965,219,733,8965

0,1,2,3
329-293-8753,329,293,8753

0,1,2,3
579-499-7527,579,499,7527
543.355.3679,543,355,3679


0,1
a,b-c


In [7]:
# Engines to describe patterns
# fixed() vs coll(): bytewise comparison vs collation rules

a1 <- "\u00e1" # a single character
a2 <- "a\u0301" # the letter a followed by an accent
c(a1, a2)
a1 == a2 # Two different representations of á
# fixed() fails since the two strings are bytewise different
str_detect(a1, pattern = fixed(a2))
str_detect(a1, pattern = coll(a2)) # coll() respects human comparison rules

# Collation search
i <- c("I", "İ", "i", "ı")
# Interpret the following results
str_subset(i, pattern = coll("i"))
str_subset(i, pattern = coll("i", ignore_case = TRUE))
str_subset(i, pattern = coll("i", ignore_case = TRUE, locale = "tr"))

In [8]:
# Boundaries: splitting and counting on word/character boundaries

x <- "This is a sentence."
str_split(x, pattern = boundary(type = "word"))
str_count(x, pattern = boundary(type = "word"))
str_extract_all(x, pattern = boundary(type = "word"))

# By convention, the empty pattern "" is treated as boundary("character"):

str_split(x, pattern = "")
str_count(x, pattern = "")

str_split(x, pattern = " ") # split at every space
str_split(x, pattern = "s") # Exercise: figure out the result
str_split(x, pattern = "e") # Exercise: figure out the result