Clustering of textual documents within a time window
-
Install cargo (see cargo documentation).
-
Install stories with cargo:
cargo install --git https://github.com/medialab/stories.git
stories vocab my_file.csv --ngrams 2 > my_vocab.csv
WINDOW=`stories window my_file.csv --raw`
stories nn my_vocab.csv my_file.csv -w $WINDOW --ngrams 2 --threshold 0.65 > nn.csv
xsv join --left id my_file.csv id nn.csv | xsv select id,created_at,nearest_neighbor,thread_id,distance > nn_dated.csv
stories eval my_labels.csv nn_dated.csv --datecol created_at