In [1]:
library(NLP)
library(openNLP)
library(textstem)
library(stringr)
library(tm)
library(data.table)

Loading required package: koRpus.lang.en

Loading required package: koRpus

Loading required package: sylly

For information on available language packages for 'koRpus', run

  available.koRpus.lang()

and see ?install.koRpus.lang()



Attaching package: ‘tm’


The following object is masked from ‘package:koRpus’:

    readTagged




## 1. Creating Annotations

In [1]:
# Code Cell 1
data_folder <- "corpus/"
data_nickname <- "VietnamOrientArticles"

In [3]:
path <- data_folder
filenames <- dir(path)

In [4]:
#Code Cell 2
titles <- character()
rawtitles <- filenames
for (i in 1:length(rawtitles)){
    workingfilename <- unlist(strsplit(rawtitles[i], split = "_"))
    workingarticlename <- paste(c(workingfilename[3]), collapse = "")
    remove_txt <- gsub("\\.txt", "", workingarticlename)
    clean_name <- gsub("\\.", " ", remove_txt)
    titles[i] <- clean_name
}

In [5]:
#Code Cell 3
articles <- character()
for(i in 1:length(filenames)){ 
    text_v <- scan(paste(data_folder, filenames[i], sep = ""), what = "character", sep = "\n")
    body_start <- grep("@BODY=", text_v)
    if (length(body_start) > 0) {
      article_v <- paste(text_v[(body_start + 1):length(text_v)], collapse = " ")
      articles[i] <- article_v  
    } else {
      articles[i] <- NA
    }
}

In [6]:
#Code Cell 4
persons <- Maxent_Entity_Annotator(kind = 'person')
locations <- Maxent_Entity_Annotator(kind = 'location')
organizations <- Maxent_Entity_Annotator(kind = 'organization')
sent.token.annotator <- Maxent_Sent_Token_Annotator(language = "en")
word.token.annotator <- Maxent_Word_Token_Annotator(language = "en")
pos.tag.annotator <- Maxent_POS_Tag_Annotator(language = "en")

In [7]:
#Code Cell 5
t1 <- Sys.time() # get start time - from topic modeling code
annotated_articles <- list()
for(i in 1:length(articles)){
    annotations <- annotate(articles[i], list(sent.token.annotator,word.token.annotator,
                             pos.tag.annotator, persons,locations,organizations))
    ann.df <- as.data.frame(annotations)
    ann.df$features <- unlist(as.character(ann.df$features))
    ann.chars<-NULL
    for (r in 1:nrow(ann.df)){
        ann.chars[r]<- substr(articles[i],ann.df[r,3],ann.df[r,4])
    }
    ann.df$words<-ann.chars
    ann.df$article <- titles[i]
    annotated_articles[[i]] <- ann.df
}
annotated_articles_df <- do.call(rbind, annotated_articles) # rbind binds as rows
t2 <- Sys.time()
print(t2 - t1)

Time difference of 21.49891 secs


In [8]:
#Code Cell 6
dim(annotated_articles_df)
head(annotated_articles_df)

# type = words (get them)
# after, filter features to just be eitehr adjective/adverb
# remove dupes unique(df$words)

Unnamed: 0_level_0,id,type,start,end,features,words,article
Unnamed: 0_level_1,<int>,<chr>,<int>,<int>,<chr>,<chr>,<chr>
1,1,sentence,1,194,list(constituents = 48:83),Dulles wanted to make clear that the C.i.A. being a part of the United States government was a tool for the development of policy and not that of subversion against enemies of the United States.,DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING
2,2,sentence,197,369,list(constituents = 84:119),"CIA From OSS In trying to show why the Central Intelligence Agency was founded, Dulles felt it necessary to go back to the days of World War II, and the founding of the OSS.",DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING
3,3,sentence,371,512,list(constituents = 120:144),"At first the Government wanted to stop intelligence work after the war, but three major reasons made high official decide against this action.",DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING
4,4,sentence,515,573,list(constituents = 145:155),The first was the surprise Japanese attack on Pearl Harbor.,DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING
5,5,sentence,575,815,list(constituents = 156:195),"Dulles made it clear that the main reason fro the surprise element was not that the United States did not have information about Japanese intentions, but that the information available was not coordinated and submitted to proper authorities.",DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING
6,6,sentence,817,980,list(constituents = 196:225),"The Army, Navy and the F.B.I. did have a lot of information on the forthcoming Japanese attack, but unfortunately, this information never went beyond these sources.",DULLES REVIEWS CIA HISTORY OUTLINES US INTELLIGENCE NET SAYS BARGHON INCIDENT TYPICAL OF RUSSIAN MISUNDERSTANDING


In [9]:
#Code Cell 7
#Subset for just words - create vector of words
index <- which(annotated_articles_df$type == "word")
article_words <- unique(annotated_articles_df$words[index])
length(article_words)
head(article_words)

#Subset for just words - create a dataframe of words & POS
word_df <- annotated_articles_df[index, ]

In [10]:
#Code Cell 8
#Reduce articles to just NOUNS
#Find NN in word_df
index <- grep("NN", word_df$features)
noun_df <- word_df[index, ]
nouns <- unique(noun_df$words)
length(nouns)
head(nouns)

## 2. Create a document-term-matrix that only has nouns

In [11]:
# Code Cell 9
# Code Cell 2a from Stylometry
numbers <- "no"
periods <- "yes"
commas <- "yes"
question_marks <- "yes"
exclamation_marks <- "yes"#Code Cell 9

#Code Cell 4 from Stylometry
docs <- as.character()
for (i in 1:length(filenames)){
    text <- scan(paste(data_folder, filenames[i], sep = "/"), what = "\n", quote = "")
    docs[i] <- paste(text, collapse = " ")
}
docs <- gsub("[[:cntrl:]]", " ", docs)  # replace control characters with space
if(numbers == "no"){
    docs <- gsub("[[:digit:]]", "", docs) # remove numbers - replaces with blank
}
if(periods == "yes"){
    docs <- gsub("\\.", " prd ", docs) # replace period marks with "prd"
}
if(commas == "yes"){
    docs <- docs <- gsub("\\,", " cma ", docs) # replace commas with "cma"
}
if(question_marks == "yes"){
    docs <- gsub("\\?", " qst ", docs) # replace question marks with "qst"
}
if(exclamation_marks == "yes"){
    docs <- gsub("\\!", " exclm ", docs) # you see the pattern, right?
}

docs <- tolower(docs)

# Code Cell 5 from Stylometry
all_corpus_data <- list()

for (i in 1:length(docs)){
  split_all_text <- unlist(strsplit(docs[i], "\\W")) 
  clean_words = split_all_text[which(split_all_text != "")]
  #word_df <- as.data.frame(table(clean_words))
  word_df <- as.data.frame(100*table(clean_words)/length(clean_words))
  file_word_freq <- cbind(ID=titles[i], word_df)
  all_corpus_data[[i]] <- file_word_freq
}
corpus_data <- do.call(rbind, all_corpus_data)
colnames(corpus_data) = c("ID", "words", "Freq")

#Code Cell 6 from Stylometry
final_df <- dcast(corpus_data, ID~words, fill = 0)
class(final_df)
dim(final_df)
final_df[1:10,1:5]

Using 'Freq' as value column. Use 'value.var' to override

Aggregation function missing: defaulting to length



Unnamed: 0_level_0,ID,a,able,about,act
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,15TH ANNUAL ROTC REVIEW HONORS TWO VIETNAM WAR DEATHS,1,0,0,0
2,A LESSON IN DIPLOMACY AMBASSADOR ROSENNE SPEAKS,1,1,1,0
3,A LETTER FROM VIETNAM,1,0,0,0
4,ADMINISTRATION ASSAILED ON PEACER FEELER REPORTS,1,0,1,0
5,AN EARFUL OF PROPAGANDA,1,0,0,0
6,ANNUAL CAMPUS CHEST WEEKEND WILL AID VIETNAMESE CHILDREN,1,0,0,0
7,ANTI ANTIWAR PARADE A SUBJECTIVE VIEW DEMONSTRATION SEEN AS HISTORICAL PARADOX,1,0,0,0
8,BOWDOIN SDS MARCHES IN NYC,1,0,0,0
9,BOWDOIN SDS PLANS APRIL PROTEST WEEK,1,0,0,0
10,BOWDOIN SDS UNDERGOES CHANGES,1,1,1,0


In [12]:
#Code Cell 10 - Step 2:
#Subset by columns that match a vector of the nouns 
#Modified from Stylometry Code Cell 7
keepers <- tolower(nouns) # this line is the modification
cluster_data <- final_df[, which(colnames(final_df) %in% keepers)]
dim(cluster_data)

In [13]:
cluster_data$ID <- final_df$ID

In [14]:
head(cluster_data)

Unnamed: 0_level_0,a,act,action,advisor,agencies,agency,air,all,allen,american,⋯,playoffs,rumors,stretcher,sunshine,surrenders,sweater,traps,uniform,winner,ID
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,0,1,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,15TH ANNUAL ROTC REVIEW HONORS TWO VIETNAM WAR DEATHS
2,1,0,0,0,0,0,0,1,0,1,⋯,0,0,0,0,0,0,0,0,0,A LESSON IN DIPLOMACY AMBASSADOR ROSENNE SPEAKS
3,1,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,A LETTER FROM VIETNAM
4,1,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,ADMINISTRATION ASSAILED ON PEACER FEELER REPORTS
5,1,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,AN EARFUL OF PROPAGANDA
6,1,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,ANNUAL CAMPUS CHEST WEEKEND WILL AID VIETNAMESE CHILDREN


## Stemming

In [15]:
# Code Cell 11
stemmed_article_words <- stem_words(article_words)
length(stemmed_article_words)
head(stemmed_article_words)

In [16]:
# Code Cell 12
stemmed_articles <- stem_strings(articles, language = "porter") #http://snowball.tartarus.org/
#stemmed_speeches <- stem_strings(speeches, language = "porter2")
class(stemmed_articles)
length(stemmed_articles)
stemmed_articles[1]

## Lemmatization

In [17]:
# Code Cell 13
lemma_dictionary <- make_lemma_dictionary(articles, engine = 'hunspell')
class(lemma_dictionary)
dim(lemma_dictionary)
head(lemma_dictionary)
#lemma_dictionary2 <- make_lemma_dictionary(speeches, engine = 'lexicon')
#lemma_dictionary3 <- make_lemma_dictionary(speeches, engine = 'treetagger')

Unnamed: 0_level_0,token,lemma
Unnamed: 0_level_1,<chr>,<chr>
1,wanted,want
2,being,bee
3,united,unite
4,states,state
5,government,govern
6,development,develop


In [18]:
# Code Cell 14
lem_articles <- lemmatize_strings(articles, dictionary = lemma_dictionary)
class(lem_articles)
length(lem_articles)
lem_articles[1]