# Load data

In [None]:
# Location of the survey CSV export.
data_fpath <- "../av_survey_data/bikepgh_av_survey.csv" # fill the path to this file here

# Columns that read.csv() is told to parse as character.
text_col_classes <- c(
  interaction_details = "character",
  positive_av_interaction = "character",
  negative_av_interaction = "character",
  other_av_regulations = "character",
  elaborate_bikepgh_position = "character",
  other_comments = "character"
)

# Empty strings in the file are read in as NA.
survey_data <- read.csv(
  file = data_fpath,
  colClasses = text_col_classes,
  na.strings = ""
)

# Quick checks: number of rows, class of every column, then the data itself.
print(nrow(survey_data))
print(sapply(survey_data, class))
survey_data

# Choose one of the text fields

In [None]:
# Names of the survey columns offered as candidates for the analysis below.
text_colnames <- c(
    'interaction_details',
    'positive_av_interaction',
    'negative_av_interaction',
    'other_av_regulations',
    'elaborate_bikepgh_position',
    'other_comments'
                )
# Placeholder: supply one column name (as a string) on the right-hand side.
# The assignment is incomplete until a value is filled in.
colname <-  # your selected column name goes here

In [None]:
# Get the rows where the selected column is not missing.
# `[[` extracts the column as a plain vector, so is.na() returns one logical
# value per row instead of a one-column logical matrix.
filtered_data <- subset(survey_data, !is.na(survey_data[[colname]]))
nrow(filtered_data)

# Tokenize (split text into words)
This may seem trivial, but you'll want to detach punctuation from words, since "person" and "person," aren't very different. And what about contractions such as "I'm"? Will you want to lowercase everything or is there some distinction between "polish" and "Polish" you'd want to preserve?

You'll also want to think about "stopwords": function words such as "the", "and", "or", and "that". Counts for these words are often distracting to machine learning models, so they're usually removed unless there may be important or meaningful variation in stopword usage.

In [None]:
library(tidytext)
library(dplyr)

In [None]:
# Show the class of every column in the filtered data.
sapply(X = filtered_data, FUN = class)

In [None]:
# Split the selected text column into tokens, one per row, stored in a new
# `word` column. `!!colname` injects the column name held in `colname`.
tokenized_data <- unnest_tokens(
  tbl = filtered_data,
  output = word,
  input = !!colname
)
nrow(tokenized_data)

In [None]:
# Remove stopwords
# Drop every token that appears in the stopword list. The join key is given
# explicitly so the match is made on `word` only, rather than on whatever
# column names the two tables happen to share, and so dplyr does not have to
# guess (and announce) the join columns.
tokenized_data <- anti_join(tokenized_data, get_stopwords(), by = "word")
nrow(tokenized_data)

# Extract features (words to numbers)
One of the simplest ways to get documents into numeric format for machine learning is to simply count each unique word and treat each document as a collection of these counts. For example, "the dog barked loudly at the hat" would become {the: 2, dog: 1, barked: 1, loudly: 1, at: 1, hat: 1}. Each unique word in the vocabulary is usually given an ID. Because order information is lost, this is referred to as the "bag-of-words" model of documents.

In [None]:
# Count the occurrences of each word per participant, largest counts first.
word_counts <- count(tokenized_data, participant_id, word, sort = TRUE)
word_counts

In [None]:
# Cast the word counts into a document-term matrix: participant_id identifies
# the document, word is the term, and n is the cell value.
dtm <- cast_dtm(
  data = word_counts,
  document = participant_id,
  term = word,
  value = n
)
dtm

# Run LDA
Now let's let LDA find topics. Here you'll want to vary the number of topics and compare results in the interpretation later. Start with 5 or 10 and go up to as many as you feel comfortable trying to interpret.

In [None]:
library(topicmodels)

In [None]:
# Fit a 10-topic LDA model on the document-term matrix. The seed is passed
# through `control` so repeated runs start from the same random state.
lda <- LDA(
  x = dtm,
  k = 10,
  control = list(seed = 9)
)
lda

# Interpretation
This is one of the tougher parts. You'll examine the words and documents given the highest probability for each topic and see if they make any sense (they might not). If they don't, go back and change the number of topics, change preprocessing (tokenization, etc), or throw up your hands and tell me how terrible topic modeling is :)

## Top words/topic

In [None]:
# Tidy the fitted model into one row per (topic, term) with its beta value.
lda_topics <- tidy(lda, matrix = "beta")

# Within each topic keep the 5 highest-beta terms (ties are kept), then list
# them topic by topic with the largest beta first.
top_topic_terms <- lda_topics %>%
  group_by(topic) %>%
  top_n(n = 5, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

top_topic_terms

## Top documents/topic

In [None]:
# Tidy the fitted model into one row per (document, topic) with its gamma
# value. Note that this overwrites `lda_topics` from the beta cell above.
lda_topics <- tidy(lda, matrix='gamma')

# Within each topic keep the 5 highest-gamma documents (ties are kept).
top_topic_docs <- lda_topics %>% 
    group_by(topic) %>%
    top_n(5, gamma) %>%
    ungroup() %>%
    arrange(topic, -gamma)

# Attach the original text of each document by matching the document id to
# participant_id, keeping only the id, topic, gamma and selected text column.
top_topic_docs <- merge(top_topic_docs, filtered_data, by.x='document', by.y='participant_id', all.x=TRUE)[,c('document', 'topic', 'gamma', colname)]
# merge() re-sorts the rows by the join key, which discards the ordering set
# by arrange() above. Sort by topic and then by descending gamma so the
# strongest documents are listed first within each topic.
top_topic_docs <- top_topic_docs[order(top_topic_docs$topic, -top_topic_docs$gamma),]
top_topic_docs

## See how distribution of other fields varies across topics
Here, you can "assign" documents to their highest-ranking topic and see how other fields vary across topics.