-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_terms.Rd
48 lines (40 loc) · 2.05 KB
/
get_terms.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_terms.R
\name{get_terms}
\alias{get_terms}
\title{Word and collocation counts filtered by tf-idf}
\usage{
get_terms(corpus_dates, ntrms_words, st, path.name, ntrms_collocation,
ngrams_number, min_freq, language)
}
\arguments{
\item{corpus_dates}{a character vector indicating the subfolders where the texts are located.}
\item{ntrms_words}{maximum number of words that will be kept after filtering by tf-idf. We rank the words by tf-idf in decreasing order. Then, we select the ntrms_words words with the highest tf-idf.}
\item{st}{set to 0 to stem the words and 1 otherwise.}
\item{path.name}{the path of the folder where the subfolders with the dates are located.}
\item{ntrms_collocation}{maximum number of collocations that will be kept after filtering by tf-idf. We rank the collocations by tf-idf in decreasing order. Then, we select the ntrms_collocation collocations with the highest tf-idf.}
\item{ngrams_number}{integer indicating the size of the collocations. Defaults to 2, which computes bigrams. If set to 3, collocations of both bigrams and trigrams are found.}
\item{min_freq}{integer indicating the minimum number of times a collocation must occur in the data in order to be returned.}
\item{language}{the language of the texts. Default is English.}
}
\value{
a list containing a sparse matrix with the counts of all collocations and words, and another sparse matrix with the counts of the collocations and words that remain after the tf-idf filtering defined by ntrms_words and ntrms_collocation.
}
\description{
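The get_terms function counts the words and collocations found in the texts stored in the given date subfolders, and also returns the counts restricted to the terms with the highest tf-idf.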
}
\examples{
\donttest{
st_year=2017
end_year=2018
path_name=system.file("news",package="TextForecast")
qt=paste0(sort(rep(seq(from=st_year,to=end_year,by=1),12)),
c("m1","m2","m3","m4","m5","m6","m7","m8","m9","m10","m11","m12"))
z_terms=get_terms(corpus_dates=qt[1:23],path.name=path_name,
ntrms_words=500,ngrams_number=3,st=0,ntrms_collocation=500,min_freq=10)
}
path_name=system.file("news",package="TextForecast")
days=c("2019-30-01","2019-31-01")
z_terms=get_terms(corpus_dates=days[1],path.name=path_name,
ntrms_words=500,ngrams_number=3,st=0,ntrms_collocation=500,min_freq=1)
}