# Text Mining Terrorism Dataset

## *Basic Descriptive Statistics*

### 1 - Set environment

In [None]:
# NOTE(review): setwd() ties this notebook to one machine's directory layout;
# the relative workbook path passed to read.xlsx() below is resolved against it.
setwd("~/tum-data-mining-lab")

# Packages attached for the notebook. In the cells visible here, wordcloud and
# tm are used by generate.cloud() and openxlsx by read.xlsx(); no dplyr call
# appears in this part of the notebook.
library(wordcloud)
library(tm)
library(dplyr)
library(openxlsx)

# Load sheet 1 of the workbook into `mydf`, reading from row 1 and taking the
# column names from the first row read (colNames = TRUE).
mydf <- read.xlsx("globalterrorismdb_0616dist.xlsx", sheet = 1, startRow = 1, colNames = TRUE)

### 2 - Goal: analyse the following columns

In [None]:
# columns to analyse:
#mydf$summary
#mydf$alternative_txt
#mydf$motive
#mydf$weapdetail

### 3 - Utility Functions

#### Frequencies of missing values

In [None]:
# Relative frequency of missing values.
#
# column_to_process: vector (e.g. one data-frame column) to inspect.
#
# Prints the proportion of NA entries (NA count divided by the total number of
# entries) and returns that proportion invisibly, since it is the value of
# print(). An empty input gives NaN (0 / 0).
relative_na_frequencies <- function(column_to_process)
{
  na_flags <- is.na(column_to_process)
  # NA entries over all entries (NA + non-NA)
  print(sum(na_flags) / length(na_flags))
}

#### Wordclouds

In [None]:
# Draw a word cloud of the most frequent terms in a text column.
#
# column_to_process: vector of text (e.g. one data-frame column). Missing (NA)
#                    entries are ignored.
# output_filename:   label for the cloud. It is currently unused (nothing is
#                    written to disk) and is kept so existing calls keep working.
#
# The cloud is drawn on the active graphics device and the value of
# wordcloud() is returned. If the column holds no non-missing text, a warning
# is raised and NULL is returned invisibly instead of failing inside tm.
generate.cloud <- function(column_to_process, output_filename)
{
  # Drop missing entries up front so they contribute no text to the corpus
  # (otherwise they risk being tokenised as the literal word "NA").
  texts <- column_to_process[!is.na(column_to_process)]

  # Remove punctuation
  texts <- gsub("[[:punct:]]", "", texts)

  if (length(texts) == 0L) {
    warning("generate.cloud: no non-missing text to plot", call. = FALSE)
    return(invisible(NULL))
  }

  # Create corpus
  corpus <- Corpus(VectorSource(texts))

  # Convert to lower-case. content_transformer() keeps the documents as tm
  # text documents, so no PlainTextDocument re-wrapping is needed afterwards.
  corpus <- tm_map(corpus, content_transformer(tolower))

  # Remove stopwords (default stopwords() list)
  corpus <- tm_map(corpus, removeWords, stopwords())

  cloud_colors <- brewer.pal(6, "Dark2")
  wordcloud(corpus, min.freq = 25, scale = c(5, 2), rot.per = 0.25,
            random.color = TRUE, max.words = 45, random.order = FALSE,
            colors = cloud_colors)
}

### 4 - Frequencies and wordclouds for terrorism dataset

#### 4.1 - Variable summary

In [None]:
# Print the share of NA entries in mydf$summary
relative_na_frequencies(mydf$summary)
# NA relative frequency noted by the author = 0.4218866

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$summary, "summary")

#### 4.2 - Variable alternative_txt

In [None]:
# Print the share of NA entries in mydf$alternative_txt
relative_na_frequencies(mydf$alternative_txt)
# NA relative frequency noted by the author = 0

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$alternative_txt, "alternative_txt")

#### 4.3 - Variable motive

In [None]:
# Print the share of NA entries in mydf$motive
relative_na_frequencies(mydf$motive)
# NA relative frequency noted by the author = 0.7028041

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$motive, "motive")

#### 4.4 - Variable weapdetail

In [None]:
# Print the share of NA entries in mydf$weapdetail
relative_na_frequencies(mydf$weapdetail)
# NA relative frequency noted by the author = 0.3249177

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$weapdetail, "weapdetail")

#### 4.5 - Variable propcomment

In [None]:
# Print the share of NA entries in mydf$propcomment
relative_na_frequencies(mydf$propcomment)
# NA relative frequency noted by the author = 0.6847524

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$propcomment, "propcomment")

#### 4.6 - Variable ransomnote 

In [None]:
# Print the share of NA entries in mydf$ransomnote
relative_na_frequencies(mydf$ransomnote)
# NA relative frequency noted by the author = 0.9973146

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$ransomnote, "ransomnote")

#### 4.7 - Variable addnotes

In [None]:
# Print the share of NA entries in mydf$addnotes
relative_na_frequencies(mydf$addnotes)
# NA relative frequency noted by the author = 0.8601536

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$addnotes, "addnotes")

#### 4.8 - Variables scite1, scite2, scite3

In [None]:
# Same two steps for each of the three scite columns: print the share of NA
# entries, then draw the word cloud (second argument is only a label).
relative_na_frequencies(mydf$scite1)
# NA relative frequency noted by the author = 0.4230985
generate.cloud(mydf$scite1, "scite1")

relative_na_frequencies(mydf$scite2)
# NA relative frequency noted by the author = 0.6098729
generate.cloud(mydf$scite2, "scite2")

relative_na_frequencies(mydf$scite3)
# NA relative frequency noted by the author = 0.7822826
generate.cloud(mydf$scite3, "scite3")

#### 4.9 - Variable corp2

In [None]:
# Print the share of NA entries in mydf$corp2
relative_na_frequencies(mydf$corp2)
# NA relative frequency noted by the author = 0.9497359

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$corp2, "corp2")

#### 4.10 - Variable target2

In [None]:
# Print the share of NA entries in mydf$target2
relative_na_frequencies(mydf$target2)
# NA relative frequency noted by the author = 0.943976

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$target2, "target2")

#### 4.11 - Variable corp3

In [None]:
# Print the share of NA entries in mydf$corp3
relative_na_frequencies(mydf$corp3)
# NA relative frequency noted by the author = 0.9950119

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$corp3, "corp3")

#### 4.12 - Variable target3

In [None]:
# Print the share of NA entries in mydf$target3
relative_na_frequencies(mydf$target3)
# NA relative frequency noted by the author = 0.9940614

# Word cloud of the column; the second argument is only a label
generate.cloud(mydf$target3, "target3")