# Data Preparation: Generating N-grams
In the following notebook, I use a large text corpus to generate n-grams on a Spark cluster using the sparklyr package. This is part of my capstone project for the Johns Hopkins University Data Science Specialization on Coursera. The project aims to build a word prediction application.

The code below:
*   Cleans the data of all unwanted characters and breaks lines into sequences
*   Generates n-grams from 1-grams to 5-grams with count. For 2-grams to 5-grams, these are then split into "firstTerms" and "lastTerm".



## Setting up environment

Installing and loading required packages

In [None]:
# Install the two packages this notebook relies on: sparklyr (Spark interface)
# and lexicon (source of the banned-word list used during cleaning).
install.packages(c("sparklyr", "lexicon"))

In [None]:
library(sparklyr)
library(lexicon)

Installing spark and connecting to cluster

In [None]:
# Download and install a local copy of Spark 3.0.1 for sparklyr to use.
spark_install("3.0.1")

In [None]:
# Open a connection to a local Spark instance; `sc` is the handle used by the
# read calls further down.
sc <- spark_connect("local")

## Loading Data

Data available [here](https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip).  
*Note that the data needs to be uploaded to the session's storage first.*

In [None]:
# Read the three English corpus files (blogs, news, twitter) as text, stack
# them into a single Spark DataFrame in that order, and add a sequential "id"
# column.
alltxt <- do.call(
  sdf_bind_rows,
  lapply(
    c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
    function(corpus_file) spark_read_text(sc, path = corpus_file)
  )
) %>%
  sdf_with_sequential_id(id = "id")

## Data Cleaning

In [None]:
# Build a single regular expression matching any entry of the banned-word
# list as a stand-alone word.
#
# * Entries are lower-cased (and de-duplicated) because the cleaning pipeline
#   applies this pattern only after `tolower(line)`; any capitalised entry
#   could otherwise never match.
# * The alternation is wrapped in letter look-arounds so an entry matches only
#   when it is not embedded in a longer word. A bare alternation also matches
#   inside innocuous words, which then receive the "_" placeholder and are
#   discarded when n-grams containing "_" are filtered out.
# * Look-arounds on [a-z] are used instead of a word-boundary escape so the
#   pattern contains no backslashes or quotes that would need escaping on the
#   way to Spark.
# NOTE(review): entries are inserted as raw regex fragments (as before); this
# assumes the list holds no regex metacharacters -- confirm.
profanity_regex <- paste0(
  "(?<![a-z])(?:",
  paste0(unique(tolower(lexicon::profanity_banned)), collapse = "|"),
  ")(?![a-z])"
)

In [None]:
# Clean every line of the corpus. Throughout, "_" is the placeholder for
# anything that is not a word (punctuation, other symbols, removed
# profanities); tokens and n-grams containing "_" are filtered out when the
# n-gram tables are built. The steps are order-dependent.
cleanedtxt <- alltxt %>%
  # Normalize apostrophes: the pattern targets '' and the curly ’ and
  # substitutes a straight apostrophe.
  # NOTE(review): these quote characters pass through SQL string escaping
  # before reaching Spark -- confirm the generated SQL matches the intent.
  mutate(line=regexp_replace(line, "''|’", "'")) %>%
  # Normalize curly double quotes (“ and ”) to a straight double quote.
  mutate(line=regexp_replace(line, '“|”', '"')) %>%
  # Turn literal underscores into "#" so that "_" is free to serve as the
  # placeholder ("#" is itself replaced by "_" further down).
  mutate(line=regexp_replace(line, "_","#")) %>%
  # Replace sequence-breaking punctuation , . ! ? ; : " ( ) with a
  # space-padded placeholder so it becomes a token of its own.
  mutate(line=regexp_replace(line,"[,.!?;:\"()]"," _ ")) %>%
  # Apostrophes and hyphens at the start or end of the line, or not both
  # preceded and followed by a letter, become a space-padded placeholder;
  # only those inside a word survive.
  mutate(line=regexp_replace(line,"^['-]|['-]$|(?<![a-zA-Z])['-]|['-](?![a-zA-Z])"," _ ")) %>%
  # Every remaining character other than a hyphen, an ASCII letter, an
  # apostrophe or a space becomes "_". No padding is added here, so the
  # placeholder stays attached to any neighbouring letters.
  mutate(line=regexp_replace(line,"[^-a-zA-Z' ]","_")) %>%
  # Collapse runs of two or more spaces into a single space.
  mutate(line=regexp_replace(line,"  +"," ")) %>%
  # Lower-case the text.
  mutate(line=tolower(line)) %>%
  # Replace anything matching the profanity pattern with the placeholder.
  mutate(line=regexp_replace(line,profanity_regex, "_")) %>%
  # Keep only the line identifier and the cleaned text.
  select(id, line)

In [None]:
# Write the cleaned corpus as CSV under ./cleaned. The data is first brought
# down to one partition, presumably so that a single part file is written.
spark_write_csv(sdf_repartition(cleanedtxt, 1), "./cleaned")

## Generating n-grams

Tokenization

In [None]:
# Split each cleaned line into an array of word tokens ("tokens" column).
#
# ft_regex_tokenizer() is used with its defaults (split on runs of
# whitespace, drop zero-length tokens, lower-case) rather than ft_tokenizer().
# The plain tokenizer splits on every single whitespace character, so a line
# that begins with a space produces an empty first token. The cleaning step
# creates such lines whenever a line starts with punctuation, because the
# punctuation is replaced by " _ " and only runs of two or more spaces are
# collapsed afterwards. Those empty tokens would otherwise be counted as a
# unigram and could yield n-grams with a missing last term.
toks <- cleanedtxt %>%
  ft_regex_tokenizer(input_col = "line", output_col = "tokens")

Unigrams

In [None]:
# Unigram table: one row per distinct token with its count `n`, most frequent
# first.
unigrams <- toks %>%
  # One row per token.
  mutate(ngrams = explode(tokens)) %>%
  # Drop tokens that carry the "_" placeholder introduced during cleaning.
  filter(!grepl("_", ngrams)) %>%
  group_by(ngrams) %>%
  summarise(n = n()) %>%
  # Discard tokens that occur only once.
  filter(n > 1) %>%
  arrange(desc(n))


In [None]:
# Write the unigram counts as CSV under ./unigrams, from a single partition.
spark_write_csv(sdf_repartition(unigrams, 1), "./unigrams")

Bigrams

In [None]:
# Bigram table: counts per (firstTerms, lastTerm) pair, most frequent first.
bigrams <- toks %>%
  # Build the 2-grams of each line and put each one on its own row.
  ft_ngram(input_col = "tokens", output_col = "words", n = 2) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder.
  filter(!grepl("_", ngrams)) %>%
  # Split the n-gram string on spaces and spread the pieces into columns.
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("firstTerms", "lastTerm")) %>%
  group_by(firstTerms, lastTerm) %>%
  summarise(n = n()) %>%
  # Discard pairs that occur only once.
  filter(n > 1) %>%
  arrange(desc(n))

In [None]:
# Write the bigram counts as CSV under ./bigrams, from a single partition.
spark_write_csv(sdf_repartition(bigrams, 1), "./bigrams")

Trigrams

In [None]:
# Trigram table: counts per (firstTerms, lastTerm), where firstTerms is the
# first two words joined by a space; most frequent first.
trigrams <- toks %>%
  # Build the 3-grams of each line and put each one on its own row.
  ft_ngram(input_col = "tokens", output_col = "words", n = 3) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder.
  filter(!grepl("_", ngrams)) %>%
  # Split the n-gram string on spaces and spread the pieces into columns.
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("word1", "word2", "lastTerm")) %>%
  # Re-join the leading words into the prefix.
  mutate(firstTerms = paste(word1, word2)) %>%
  group_by(firstTerms, lastTerm) %>%
  summarise(n = n()) %>%
  # Discard combinations that occur only once.
  filter(n > 1) %>%
  arrange(desc(n))

In [None]:
# Write the trigram counts as CSV under ./trigrams, from a single partition.
spark_write_csv(sdf_repartition(trigrams, 1), "./trigrams")

Fourgrams

In [None]:
# Four-gram table: counts per (firstTerms, lastTerm), where firstTerms is the
# first three words joined by spaces; most frequent first.
fourgrams <- toks %>%
  # Build the 4-grams of each line and put each one on its own row.
  ft_ngram(input_col = "tokens", output_col = "words", n = 4) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder.
  filter(!grepl("_", ngrams)) %>%
  # Split the n-gram string on spaces and spread the pieces into columns.
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("word1", "word2", "word3", "lastTerm")) %>%
  # Re-join the leading words into the prefix.
  mutate(firstTerms = paste(word1, word2, word3)) %>%
  group_by(firstTerms, lastTerm) %>%
  summarise(n = n()) %>%
  # Discard combinations that occur only once.
  filter(n > 1) %>%
  arrange(desc(n))

In [None]:
# Write the four-gram counts as CSV under ./fourgrams, from a single partition.
spark_write_csv(sdf_repartition(fourgrams, 1), "./fourgrams")

Fivegrams

In [None]:
# Five-gram table: counts per (firstTerms, lastTerm), where firstTerms is the
# first four words joined by spaces; most frequent first.
fivegrams <- toks %>%
  # Build the 5-grams of each line and put each one on its own row.
  ft_ngram(input_col = "tokens", output_col = "words", n = 5) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder.
  filter(!grepl("_", ngrams)) %>%
  # Split the n-gram string on spaces and spread the pieces into columns.
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("word1", "word2", "word3", "word4", "lastTerm")) %>%
  # Re-join the leading words into the prefix.
  mutate(firstTerms = paste(word1, word2, word3, word4)) %>%
  group_by(firstTerms, lastTerm) %>%
  summarise(n = n()) %>%
  # Discard combinations that occur only once.
  filter(n > 1) %>%
  arrange(desc(n))

In [None]:
# Write the five-gram counts as CSV under ./fivegrams, from a single partition.
spark_write_csv(sdf_repartition(fivegrams, 1), "./fivegrams")