In [20]:
import pandas as pd
import numpy as np
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import (remove_stopwords,
                                          preprocess_string,
                                          strip_numeric,
                                          stem_text,
                                          strip_punctuation)

In [21]:
# Load the poliblogs2008 table; later cells read its `documents`, `rating`
# and `blog` columns.
data = pd.read_csv(filepath_or_buffer='../data/poliblogs2008.csv')

### textProcessor()
`textProcessor()` is the R function that takes a vector of raw texts and performs basic preprocessing operations on them, using the `tm` package. The cells below reproduce these steps in Python with gensim.

The input is laid out like a spreadsheet, with each document in a single cell. The function supports:

- Stemming (via the `SnowballC` package)
- Sparsity and Stopword Removal 
- Empty Document removal 
- Specified Metadata

1. Strips the text and removes all non-alphanumeric characters
2. Builds the corpus
3. Converts to lower case
4. Removes (custom) punctuation
5. Removes (custom) stopwords
6. Removes numbers
7. Stems the words
8. Assigns metadata
9. Creates the output (document-term matrix)

In [22]:
# Selection: keep only rows 3000-3999 (selected by position) as a working subset.
# `.copy()` turns the subset into an independent frame, so the column
# assignments in the following cells operate on real data instead of a slice of
# the loaded table (which would raise SettingWithCopyWarning).
# NOTE: this rebinds `data`, so the cell is not idempotent -- re-run the
# loading cell before re-running this one.
data = data.iloc[3000:4000].copy()

In [23]:
# Stopwords
# gensim's `remove_stopwords` splits the text on whitespace and compares each
# token against a lowercase stopword list, so on raw text tokens such as "The"
# or "and," are not recognised and would end up in the vocabulary.
# Lowercase and strip punctuation first so that every stopword actually matches.
# (Stripping punctuation again in a later cell is harmless: it is idempotent.)
data.documents = (
    data.documents
    .str.lower()
    .apply(strip_punctuation)
    .apply(remove_stopwords)
)

In [24]:
# Remove numbers: run gensim's `strip_numeric` over the text of every document.
data['documents'] = data['documents'].map(strip_numeric)

In [25]:
# Punctuation: run gensim's `strip_punctuation` over the text of every document.
data['documents'] = data['documents'].map(strip_punctuation)

In [26]:
# Stemming: run gensim's `stem_text` over the text of every document.
data['documents'] = data['documents'].map(stem_text)

## metadata: Additional data about the documents
Specifically, a dataframe or matrix object with a number of rows equal to the number of documents and one column per metadata type. The column names are used to label the metadata. The metadata do not affect the text processing, but providing the metadata object ensures that if documents are dropped, the corresponding metadata rows are dropped as well.

In [27]:
# Covariate columns kept alongside the documents:
#   rating - topical content covariate
#   blog   - topical prevalence covariate
metadata = ['rating', 'blog']
meta = data[metadata]

## Document-Term-Matrix
- documents: documents are stored as indexed word counts
- vocab: indexed word vocabulary

**Note**: While the preprocessing output passed to the `stm()` function in R is required to be a document-term matrix (DTM), gensim relies on the bag-of-words (BoW) representation of text. This may be a difference between the R implementation and the Python implementation. It still has to be evaluated whether the DTM representation should be retained or replaced with the BoW representation. If it is replaced, adjustments to the algorithm are required.

In [33]:
from gensim.utils import simple_preprocess
from gensim import corpora

In [29]:
# Tokenise every document, build the gensim dictionary from the token lists,
# and convert each token list into its bag-of-words representation.
doc_tokens = list(map(simple_preprocess, data.documents))
dct = corpora.Dictionary(doc_tokens)
corpus = list(map(dct.doc2bow, doc_tokens))

# Persist the dictionary and the corpus (Matrix Market format) next to the data.
dct.save('../data/dictionary')
corpora.MmCorpus.serialize('../data/corpus.mm', corpus)