In [2]:
## The quanteda package: quantitative analysis of textual data

In [3]:
packageVersion("quanteda")

[1] '1.3.4'

# Basic R objects and commands

## Vectors

In [4]:
# Numeric vector holding four example values.
num_vec <- c(1, 5, 6, 3)

In [5]:
print(num_vec)

[1] 1 5 6 3


In [6]:
char_vec <- c("apple", "banana", "mandarin", "melon")

In [7]:
print(char_vec)

[1] "apple"    "banana"   "mandarin" "melon"   


In [8]:
print(num_vec[1])

[1] 1


In [9]:
print(num_vec[2:4])

[1] 5 6 3


In [10]:
print(char_vec[c(1,3)])

[1] "apple"    "mandarin"


In [11]:
num_vec2 <- num_vec * 2

In [12]:
print(num_vec2)

[1]  2 10 12  6


In [13]:
char_vec2 <- paste(c("red","yellow","orange","green"),char_vec)

In [14]:
print(char_vec2)

[1] "red apple"       "yellow banana"   "orange mandarin" "green melon"    


In [15]:
names(num_vec) <- char_vec

In [16]:
print(num_vec)

   apple   banana mandarin    melon 
       1        5        6        3 


## Dataframes

A data frame combines multiple vectors to construct a dataset.

In [17]:
fruit_df <- data.frame(name = char_vec, count = num_vec)

print(fruit_df)

             name count
apple       apple     1
banana     banana     5
mandarin mandarin     6
melon       melon     3


In [18]:
fruit_df2 <- data.frame(name = char_vec, count = num_vec)

In [19]:
row.names(fruit_df2) <- NULL

In [20]:
print(fruit_df2)

      name count
1    apple     1
2   banana     5
3 mandarin     6
4    melon     3


In [22]:
print(nrow(fruit_df))

[1] 4


In [23]:
print(ncol(fruit_df))

[1] 2


In [24]:
# Keep only the rows of fruit_df whose count is at least 5.
fruit_df3 <- subset(fruit_df, subset = count >= 5)

In [26]:
print(fruit_df3)

             name count
banana     banana     5
mandarin mandarin     6


## Matrices

In [27]:
# 2 x 4 numeric matrix; matrix() fills column by column, so the first
# column is (1, 3), the second (6, 8), and so on.
mat <- matrix(c(1, 3, 6, 8, 3, 5, 2, 7), nrow = 2, ncol = 4)

In [29]:
print(mat)

     [,1] [,2] [,3] [,4]
[1,]    1    6    3    2
[2,]    3    8    5    7


In [30]:
colnames(mat) <- char_vec

In [31]:
print(mat)

     apple banana mandarin melon
[1,]     1      6        3     2
[2,]     3      8        5     7


In [32]:
rownames(mat) <- c("bag 1", "bag 2")

In [33]:
print(mat)

      apple banana mandarin melon
bag 1     1      6        3     2
bag 2     3      8        5     7


In [34]:
print(dim(mat))

[1] 2 4


In [35]:
print(mat["bag 1", "apple"])

[1] 1


In [36]:
print(mat["bag 2", ])

   apple   banana mandarin    melon 
       3        8        5        7 


In [37]:
print(colSums(mat))

   apple   banana mandarin    melon 
       4       14        8        9 


In [38]:
print(rowSums(mat))

bag 1 bag 2 
   12    23 


# Data Import

How to import various types of text data into R

In [39]:
# Attach the packages used by the import and corpus examples. library()
# stops with an error when a package is not installed, whereas require()
# only returns FALSE and lets the notebook fail later on.
library(quanteda)
library(readtext)

Loading required package: quanteda
Package version: 1.3.4
Parallel computing: 2 of 4 threads used.
See https://quanteda.io for tutorials and examples.

Attaching package: 'quanteda'

The following object is masked from 'jupyter:irkernel':

    View

The following object is masked from 'package:utils':

    View

Loading required package: readtext


In [40]:
# Locate the example data directory shipped with the readtext package.
# No trailing slash, so paths built by appending "/..." to data_dir contain
# a single separator; mustWork = TRUE stops here if the directory cannot be
# found instead of silently returning "".
data_dir <- system.file("extdata", package = "readtext", mustWork = TRUE)

In [41]:
data_dir

In [50]:
checking <- system.file("html/", package = "readtext")

In [51]:
checking

In [45]:
# Read the inaugural-speech CSV with base R.
inaug_data1 <- read.csv(file.path(data_dir, "csv", "inaugCorpus.csv"))

In [46]:
class(inaug_data1)

In [47]:
# Read the TSV sample with readtext, taking the document text from the
# "speech" column.
inaug_data <- readtext(
  file.path(data_dir, "tsv", "dailsample.tsv"),
  text_field = "speech"
)

In [48]:
class(inaug_data)

## Multiple Text Files

In [55]:
# Locate the example data directory shipped with the readtext package.
# No trailing slash, so paths built by appending "/..." to data_dir contain
# a single separator; mustWork = TRUE stops here if the directory cannot be
# found instead of silently returning "".
data_dir <- system.file("extdata", package = "readtext", mustWork = TRUE)

In [56]:
data_dir

In [58]:
# Read every file in the UDHR directory; the trailing "*" is a wildcard
# pattern passed on to readtext.
udhr_data <- readtext(file.path(data_dir, "txt", "UDHR", "*"))

In [59]:
class(udhr_data)

In [61]:
# Read all EU manifesto .txt files as ISO-8859-1 text. Document variables
# are taken from the file names, split on "_" into the five fields listed
# in docvarnames.
eu_data <- readtext(
  file.path(data_dir, "txt", "EU_manifestos", "*.txt"),
  docvarsfrom = "filenames",
  docvarnames = c("unit", "context", "year", "language", "party"),
  dvsep = "_",
  encoding = "ISO-8859-1"
)
str(eu_data)

Classes 'readtext' and 'data.frame':	17 obs. of  7 variables:
 $ doc_id  : chr  "EU_euro_2004_de_PSE.txt" "EU_euro_2004_de_V.txt" "EU_euro_2004_en_PSE.txt" "EU_euro_2004_en_V.txt" ...
 $ text    : chr  "PES · PSE · SPE European Parliament rue Wiertz B 1047 Brussels\n\nGEMEINSAM WERDEN WIR STÄRKER Fünf Verpflichtu"| __truncated__ "Gemeinsames Manifest\nGemeinsames Manifest zur Europawahl 2004 Europäischen Föderation Grüner Parteien (EFGP) \"| __truncated__ "PES · PSE · SPE European Parliament rue Wiertz B 1047 Brussels\n\nGROWING STRONGER TOGETHER Five commitments fo"| __truncated__ "Manifesto\nEuropean Elections Manifesto 2004\nCOMMON PREAMBLE\nAs adopted at 15th EFGP Council, Luxembourg, 8th"| __truncated__ ...
 $ unit    : chr  "EU" "EU" "EU" "EU" ...
 $ context : chr  "euro" "euro" "euro" "euro" ...
 $ year    : int  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
 $ language: chr  "de" "de" "en" "en" ...
 $ party   : chr  "PSE" "V" "PSE" "V" ...


In [62]:
## Read the files found under the movie_reviews directory; the trailing
## "*" is a wildcard pattern passed on to readtext.

data_reviews <- readtext(file.path(data_dir, "txt", "movie_reviews", "*"))

In [65]:
## JSON data: read twitter.json with readtext, using source = "twitter".
## NOTE(review): the path is relative, so it is resolved against the current
## working directory -- confirm the file is present there. The recorded run
## of this cell shows a "number of items to replace is not a multiple of
## replacement length" warning; verify the imported columns before use.

twitter_data <- readtext("content/data/twitter.json", source = "twitter")

"number of items to replace is not a multiple of replacement length"

In [67]:
print(head(names(twitter_data)))

[1] "doc_id"         "text"           "retweet_count"  "favorite_count"
[5] "favorited"      "truncated"     


# Basic Operations

Basic operations in **quanteda**

quanteda has three basic types of objects:

<ol>
    <li>Corpus</li>
    <li>Tokens</li>
    <li>Document-feature matrix (DFM)</li>
</ol>

## Corpus

Constructing and modifying a corpus

### Character vector

In [68]:
data_char_ukimmig2010

In [69]:
class(data_char_ukimmig2010)

In [70]:
names(data_char_ukimmig2010)

In [71]:
length(data_char_ukimmig2010)

In [72]:
# Build a corpus from the named character vector, storing each element's
# name in a "party" document variable.
immig_corp <- corpus(
  data_char_ukimmig2010,
  docvars = data.frame(party = names(data_char_ukimmig2010))
)

In [73]:
class(immig_corp)

In [74]:
summary(immig_corp)

Text,Types,Tokens,Sentences,party
BNP,1125,3280,88,BNP
Coalition,142,260,4,Coalition
Conservative,251,499,15,Conservative
Greens,322,679,21,Greens
Labour,298,683,29,Labour
LibDem,251,483,14,LibDem
PC,77,114,5,PC
SNP,88,134,4,SNP
UKIP,346,723,27,UKIP


In [82]:
## Named character vector with one document per element; the names
## (A, B, C) label the individual documents.

character_vec <- c(
  A = "The quick brown fox jumps over the lazy dog.",
  B = "Who are we talking to?",
  C = "What are we really selling?"
)

In [78]:
character_vec

In [79]:
character_corp <- corpus(character_vec,
                        docvars = data.frame(variables = names(character_vec)))

In [81]:
summary(character_corp)

Text,Types,Tokens,Sentences,variables
A,10,10,1,A
B,6,6,1,B
C,6,6,1,C


### Dataframe

In [83]:
# Locate the example data directory shipped with the readtext package.
# No trailing slash, so paths built by appending "/..." to data_dir contain
# a single separator; mustWork = TRUE stops here if the directory cannot be
# found instead of silently returning "".
data_dir <- system.file("extdata", package = "readtext", mustWork = TRUE)

In [84]:
# Read the inaugural-speech CSV with readtext, taking the document text
# from the "texts" column.
inaug_data <- readtext(
  file.path(data_dir, "csv", "inaugCorpus.csv"),
  text_field = "texts"
)

In [86]:
print(names(inaug_data))

[1] "doc_id"    "text"      "Year"      "President" "FirstName"


In [87]:
class(inaug_data)

In [88]:
inaug_corp <- corpus(inaug_data)

In [89]:
summary(inaug_corp, 5)

Text,Types,Tokens,Sentences,Year,President,FirstName
inaugCorpus.csv.1,625,1540,23,1789,Washington,George
inaugCorpus.csv.2,96,147,4,1793,Washington,George
inaugCorpus.csv.3,826,2578,37,1797,Adams,John
inaugCorpus.csv.4,717,1927,41,1801,Jefferson,Thomas
inaugCorpus.csv.5,804,2381,45,1805,Jefferson,Thomas
