# An Exploration of Text Modeling with News Text Over Time

##### I'll be exploring several ways to analyze text data, using topic modeling, word/document embeddings, and time-series methods.

##### Historical CNN stories have been sourced from https://cs.nyu.edu/~kcho/DMQA/

### Common code for all methods

In [11]:
library(tidyverse)
library(tidytext)
library(stringr)

In [36]:
# Load the stop-word lexicon used further down to filter out common words
data('stop_words')

In [10]:
# Directory holding the CNN story files, relative to the working directory.
# The trailing slash matters: file paths are built from it with paste0().
data_dir <- 'cnn/stories/'

In [12]:
# Names of every file found in the stories directory
story_files <- dir(path = data_dir)

In [44]:
# Peek at the first file name as a quick check of the directory listing
story_files[1]

### Tokenizing sample documents

In [45]:
# Read a sample of the corpus: one row per story file, where `story` is the
# file path and `text` is a list-column holding that file's lines.
#
# head() is used instead of story_files[1:100] so that a directory with fewer
# than 100 files does not pad the sample with NA entries (which would turn
# into bogus ".../NA" paths), and tibble() replaces the deprecated
# data_frame() constructor.
story_samples <- tibble(story = paste0(data_dir, head(story_files, 100))) %>%
    mutate(text = map(story, read_lines))

In [46]:
# Flatten the list-column so each row is one line of one story, drop empty
# lines, reduce the path to the bare file name, and number the remaining
# (non-empty) lines within each story.
#
# The column to unnest is named explicitly: calling unnest() with no `cols`
# argument is deprecated in tidyr >= 1.0 and emits a warning.
story_samples <- story_samples %>%
    unnest(cols = text) %>%
    filter(text != '') %>%
    mutate(story = basename(story)) %>%
    group_by(story) %>%
    mutate(linenumber = row_number()) %>%
    ungroup()

In [47]:
# Split each line of text into tokens, one row per token in a `word` column
tidy_story_samples <- unnest_tokens(story_samples, word, text)

In [48]:
# Keep only the tokens that do not appear in the stop-word lexicon
tidy_story_samples <- anti_join(tidy_story_samples, stop_words, by = 'word')

In [49]:
# Overall word frequencies across the sampled stories, most frequent first;
# only the top rows are displayed
head(count(tidy_story_samples, word, sort = TRUE))

word,n
highlight,352
people,162
cnn,142
u.s,141
government,98
united,96


### TF-IDF

In [53]:
# Per-story word counts, the input for TF-IDF. This starts again from
# story_samples, so stop words are NOT removed here.
tfidf_story_words <- story_samples %>%
    unnest_tokens(word, text) %>%
    count(story, word, sort = TRUE) %>%
    ungroup()

In [56]:
# Attach tf, idf and tf_idf columns, treating each story as a document and
# `n` as the count of each word within it
tfidf_words <- bind_tf_idf(tfidf_story_words, word, story, n)

In [58]:
# Show the story/word pairs with the highest TF-IDF scores first
arrange(tfidf_words, desc(tf_idf))

story,word,n,tf,idf,tf_idf
00465603227f7f56fcd37e10f4cd44e57d7647d8.story,snapshots,2,0.05555556,4.605170,0.2558428
0044e296ecfe3ba57a351ad2a36d034491e878ce.story,crosby,11,0.04700855,4.605170,0.2164824
00465603227f7f56fcd37e10f4cd44e57d7647d8.story,cnn.com,2,0.05555556,3.506558,0.1948088
00465603227f7f56fcd37e10f4cd44e57d7647d8.story,gallery,2,0.05555556,3.506558,0.1948088
0036c48d80c270465bffced3e233fe39e5950431.story,titanic,13,0.04024768,4.605170,0.1853474
000940f2bb357ac04a236a232156d8b9b18d1667.story,kasem,18,0.03578529,4.605170,0.1647973
0015194573f9b4430319683cde41e4aa17092a9d.story,shark,7,0.03500000,4.605170,0.1611810
000c835555db62e319854d9f8912061cdca1893e.story,cardinals,15,0.03355705,4.605170,0.1545359
002c962834b7886c600a31a35053543e324883bd.story,hinckley,36,0.03351955,4.605170,0.1543632
002c715ea1428373cc432c9508d4a48d2e6069f4.story,app,16,0.03603604,3.912023,0.1409738
