# Word Prediction Project
This notebook is part 1 of the capstone project for the Coursera Data Science Specialization. In it, I clean a large corpus of text data and transform it into n-grams on a Spark cluster using the sparklyr package.

## Setting up environment

Installing and loading required packages

In [None]:
# Install the non-tidyverse packages this notebook loads below.
install.packages(c("tidytext", "sparklyr", "lexicon"))

In [None]:
library(tidyverse)
library(tidytext)
library(sparklyr)
library(lexicon)

Installing Spark and connecting to a local cluster

In [None]:
# Download and install a local copy of Spark 3.0.0 for sparklyr to use.
spark_install("3.0.0")

In [None]:
# Open a connection to a local Spark instance; `sc` is the handle used by
# every Spark read in this notebook.
sc <- spark_connect("local")

## Data Cleaning

Loading the data*  
Available here: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip  
*The data files need to be uploaded to the session's storage first.

In [None]:
# Read the three English corpus files as Spark text tables, stack them into
# one table and attach a sequential `id` to every line.
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")

alltxt <- corpus_files %>%
  lapply(function(txt_path) spark_read_text(sc, path = txt_path)) %>%
  sdf_bind_rows() %>%
  sdf_with_sequential_id(id = "id")

Cleaning the data

In [None]:
# Regular expression that matches any whole word containing a banned term.
#
# A bare alternation ("term1|term2|...") also matches *inside* innocent words
# and replaces only the matching part, leaving a fragment of the word behind
# as a bogus token. Wrapping the alternation in `[a-z']*` on both sides makes
# the match cover the entire word, so nothing is left over.
# NOTE(review): words that merely contain a banned term are still masked, as
# they were before; switch to exact-word matching if that is too aggressive.
#
# The terms are lower-cased (and de-duplicated) because the pattern is applied
# to text that has already been passed through tolower().
profanity_regex <- paste0(
  "[a-z']*(?:",
  paste0(unique(tolower(lexicon::profanity_banned)), collapse = "|"),
  ")[a-z']*"
)

In [None]:
# Normalise every raw line into lower-case words separated by single spaces.
# Anything that is removed (non-letters, profanities, stray apostrophes) is
# replaced by the placeholder token "_" so that n-grams spanning a removed
# item can be filtered out later instead of joining unrelated words.
cleanedtxt <- alltxt %>%
  # Normalise curly apostrophes to the ASCII apostrophe
  mutate(line = regexp_replace(line, "’", "'")) %>%
  # Replace every character that is not a letter, apostrophe or space
  mutate(line = regexp_replace(line, "[^a-zA-Z' ]", " _ ")) %>%
  mutate(line = tolower(line)) %>%
  # Mask profanities
  mutate(line = regexp_replace(line, profanity_regex, " _ ")) %>%
  # Mask apostrophes at the beginning or end of a word
  mutate(line = regexp_replace(line, " '|' |^'|'$", " _ ")) %>%
  # Collapse every run of spaces to one space and trim both ends. A fixed
  # two-space pattern leaves double spaces behind for runs of three or more,
  # which later turn into empty tokens when the text is split on spaces.
  mutate(line = trim(regexp_replace(line, " +", " "))) %>%
  select(id, line)

In [None]:
# Save the cleaned text as CSV under ./clean, repartitioned to a single
# partition first (presumably so the output is one part file).
spark_write_csv(sdf_repartition(cleanedtxt, 1), "./clean")

## Creating n-grams

Tokenization

In [None]:
# Split each cleaned line into an array column `tokens`.
toks <- ft_tokenizer(
  cleanedtxt,
  input_col = "line",
  output_col = "tokens"
)

Unigrams

In [None]:
# Count how often each single word occurs.
#
# Output columns: `word1` (the word) and `n` (its frequency), sorted by
# descending frequency; words seen only once are dropped. The column is named
# `word1` to match the `word1`, `word2`, ... layout of the other n-gram tables.
unigrams <- toks %>%
  # One row per token; the tokenizer output column is `tokens`
  mutate(word1 = explode(tokens)) %>%
  # Drop empty tokens and the "_" placeholder created during cleaning
  filter(word1 != "", !grepl("_", word1)) %>%
  # Count per word; without the grouping summarise() returns one total
  group_by(word1) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(n > 1) %>%
  arrange(desc(n))


In [None]:
# Save the unigram counts as CSV under ./unigram from a single partition.
spark_write_csv(sdf_repartition(unigrams, 1), "./unigram")

Bigrams

In [None]:
# Count how often each pair of consecutive words occurs.
#
# Output columns: `word1`, `word2` and `n` (frequency), sorted by descending
# frequency; pairs seen five times or fewer are dropped.
bigrams <- toks %>%
  ft_ngram(input_col = "tokens", output_col = "words", n = 2) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder created during cleaning
  filter(!grepl("_", ngrams)) %>%
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("word1", "word2")) %>%
  # Group by the word columns so that n() counts each distinct bigram;
  # select() followed by summarise() would yield a single overall row count
  group_by(word1, word2) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(n > 5) %>%
  arrange(desc(n))

In [None]:
# Save the bigram counts as CSV under ./bigram from a single partition.
spark_write_csv(sdf_repartition(bigrams, 1), "./bigram")

Trigrams

In [None]:
# Count how often each run of three consecutive words occurs.
#
# Output columns: `word1`, `word2`, `word3` and `n` (frequency), sorted by
# descending frequency; trigrams seen five times or fewer are dropped.
trigrams <- toks %>%
  ft_ngram(input_col = "tokens", output_col = "words", n = 3) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder created during cleaning
  filter(!grepl("_", ngrams)) %>%
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  # The pipe after this step is required: without it the aggregation below is
  # a separate expression and `trigrams` is left un-aggregated
  sdf_separate_column("split", into = c("word1", "word2", "word3")) %>%
  group_by(word1, word2, word3) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(n > 5) %>%
  arrange(desc(n))

In [None]:
# Save the trigram counts as CSV under ./trigrams from a single partition.
spark_write_csv(sdf_repartition(trigrams, 1), "./trigrams")

Fourgrams

In [None]:
# Count how often each run of four consecutive words occurs.
#
# Output columns: `word1` to `word4` and `n` (frequency), sorted by
# descending frequency; four-grams seen five times or fewer are dropped.
fourgrams <- toks %>%
  # n must be 4 here: the result is separated into four word columns below
  ft_ngram(input_col = "tokens", output_col = "words", n = 4) %>%
  mutate(ngrams = explode(words)) %>%
  # Drop n-grams containing the "_" placeholder created during cleaning
  filter(!grepl("_", ngrams)) %>%
  ft_regex_tokenizer(input_col = "ngrams", output_col = "split", pattern = " ") %>%
  sdf_separate_column("split", into = c("word1", "word2", "word3", "word4")) %>%
  group_by(word1, word2, word3, word4) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(n > 5) %>%
  arrange(desc(n))

In [None]:
# Save the four-gram counts as CSV under ./fourgrams from a single partition.
spark_write_csv(sdf_repartition(fourgrams, 1), "./fourgrams")