# Text Mining Terrorism Dataset

## *Basic Descriptive Statistics*

### 1 - Set environment

In [None]:
# Switch to the project directory; the relative workbook path used below is
# resolved against it.
setwd("~/tum-data-mining-lab")

# Packages attached for this notebook. Attach order is left as-is because it
# determines which package's functions mask the others.
library(wordcloud)
library(tm)
library(dplyr)
library(openxlsx)

# Read the first sheet of the workbook into `mydf`, starting at row 1 and
# taking the column names from that first row.
mydf <- read.xlsx("globalterrorismdb_0616dist.xlsx", sheet = 1, startRow = 1, colNames = TRUE)

### 2 - Goal: analyse the following columns

In [None]:
# Columns to analyse:
#mydf$summary
#mydf$alternative_txt
#mydf$motive
#mydf$weapdetail

### 3 - Utility Functions

#### Frequencies of missing values

In [None]:
# Relative frequency of missing values.
#
# Computes the share of NA entries in `column_to_process`, prints it and
# returns it invisibly so the result can also be captured by the caller.
#
# Args:
#   column_to_process: vector (e.g. one data-frame column) to inspect.
#
# Returns:
#   (invisibly) a single number between 0 and 1; NaN for zero-length input,
#   because 0 / 0 is undefined.
relative_na_frequencies <- function(column_to_process)
{
  # Work on the argument itself. The previous body read mydf$summary
  # regardless of what was passed in, so every column reported the same value.
  # The mean of a logical vector is the proportion of TRUE entries, i.e.
  # count_na / (count_na + count_not_na).
  value <- mean(is.na(column_to_process))
  print(value)
  invisible(value)
}

#### Wordclouds

In [None]:
# Draw a word cloud of the most frequent words in a text column.
#
# The text is stripped of punctuation, lower-cased and cleaned of stopwords
# (the default list returned by stopwords()) before being handed to
# wordcloud(). The plot is drawn on the active graphics device.
#
# Args:
#   column_to_process: vector of text entries (e.g. one data-frame column).
#     Missing entries are ignored.
#   output_filename: currently unused; kept so existing callers that pass it
#     keep working.
#
# Returns:
#   Whatever wordcloud() returns, or invisible NULL (with a warning) when
#   there is no non-missing text to plot.
generate.cloud <- function(column_to_process, output_filename)
{
  # Missing entries carry no text; drop them up front instead of relying on
  # how the corpus and tokenizer would coerce NA.
  column_to_process <- column_to_process[!is.na(column_to_process)]

  # Nothing to plot: report it and stop rather than failing further down.
  if (length(column_to_process) == 0) {
    warning("generate.cloud: no non-missing text to plot", call. = FALSE)
    return(invisible(NULL))
  }

  # Remove punctuation
  column_to_process <- gsub("[[:punct:]]", "", column_to_process)

  # Create corpus
  corpus <- Corpus(VectorSource(column_to_process))

  # Convert to lower-case. Base functions must be wrapped in
  # content_transformer() so tm_map() keeps the documents as corpus documents;
  # this also makes a later PlainTextDocument conversion unnecessary.
  corpus <- tm_map(corpus, content_transformer(tolower))

  # Remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords())

  col <- brewer.pal(6, "Dark2")
  # Logical flags and the max.words argument are spelled out in full:
  # T/F can be reassigned and partial argument matching is fragile.
  wordcloud(corpus, min.freq = 25, scale = c(5, 2), rot.per = 0.25,
            random.color = TRUE, max.words = 45, random.order = FALSE,
            colors = col)
}

### 4 - Frequencies and wordclouds for terrorism dataset

#### 4.1 - Variable summary

In [None]:
# Pass the `summary` column to the NA-frequency helper (which prints its
# result), then to the word-cloud helper; "summary" is supplied as the
# output_filename argument.
relative_na_frequencies(mydf$summary)
generate.cloud(mydf$summary, "summary")

#### 4.2 - Variable alternative_txt

In [None]:
# Pass the `alternative_txt` column to the NA-frequency helper (which prints
# its result), then to the word-cloud helper; "alternative_txt" is supplied as
# the output_filename argument.
relative_na_frequencies(mydf$alternative_txt)
generate.cloud(mydf$alternative_txt, "alternative_txt")

#### 4.3 - Variable motive

In [None]:
# Pass the `motive` column to the NA-frequency helper (which prints its
# result), then to the word-cloud helper; "motive" is supplied as the
# output_filename argument.
relative_na_frequencies(mydf$motive)
generate.cloud(mydf$motive, "motive")

#### 4.4 - Variable weapdetail

In [None]:
# Pass the `weapdetail` column to the NA-frequency helper (which prints its
# result), then to the word-cloud helper; "weapdetail" is supplied as the
# output_filename argument.
relative_na_frequencies(mydf$weapdetail)
generate.cloud(mydf$weapdetail, "weapdetail")