In [123]:
import pandas as pd
import numpy as np
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, strip_numeric, stem_text, strip_punctuation


In [30]:
data = pd.read_csv('poliblogs2008.csv')

### textProcessor()
Function that takes a vector of raw texts and performs basic operations. Uses the tm-package for these operations.

Input is designed as a spreadsheet, where each document is in a single cell. 
- Stemming (snowballStemmer:: SnowballC)
- Sparsity and Stopword Removal 
- Empty Document removal 
- Specified Metadata

1. Strips text and removes all non-characters. 
2. Builds corpus
3. Converts to lower case
4. removes (custom) punctuation
5. removes (custom) stopwords
6. removing number 
7. stemming
8. assigns metadata
9. creates output (document term matrix)

In [35]:
# Stopwords
data.documents = data.documents.apply(remove_stopwords)

In [36]:
# Remove numbers
data.documents = data.documents.apply(strip_numeric)

In [43]:
# Punctuation
data.documents = data.documents.apply(strip_punctuation)

In [41]:
# Stemming
data.documents = data.documents.apply(stem_text)

In [44]:
data.documents

0        week fal statements  lies  dismiss apologies  ...
1        honestli don t know party  caucu result plai t...
2        stand aw will troop iraq sacrif themselv natio...
3        page recent said goodbi global warming  ironic...
4        report enemi control inform battlefield falluj...
                               ...                        
13241    check thi new mailer   forward wai dem oper   ...
13242    here  intern discuss insid hillari campaign th...
13243    edward senior advi joe trippi ha theori clinto...
13244    thi interesting  huffington post blockbust sco...
13245    fox new report jame carvil paul begala re ent ...
Name: documents, Length: 13246, dtype: object

## metadata: Additional data about the documents
Specifically a dataframe or matrix object with number of rows equal to the number of documents and one column per meta-data type. The column names are used to label the metadata.  The metadata do not affect the text processing, but providing the metadata object insures that if documents are dropped the corresponding metadata rows are dropped as well.

In [55]:
# topical content covariate: rating
# topical prevalence covariate: blog
metadata = ['rating', 'blog'] 
meta = data.loc[:,metadata]

## Document-Term-Matrix
- documents: documents are stored as indexed word counts
- vocab: indexed word vocabulary

**Note**: While the output for the stm() function in R is required to be a Document-Term-Matrix, gensim relies on the bag-of-words representation of text. This might be a difference between the R implementation and the Python implementation. It has to be evaluated, whether the dtm-representation should be retained or being replaced with the BoW-representation. If so, adjustments to the algorithm are required.

In [156]:
from gensim.utils import tokenize, simple_preprocess
from gensim import corpora

In [157]:
doc_tokens = [simple_preprocess(doc) for doc in data.documents]

In [158]:
dictionary = corpora.Dictionary(doc_tokens)
# Vocabulary
dictionary[0]
vocab = dictionary.id2token
# Documents
documents = [dictionary.doc2bow(doc) for doc in doc_tokens]

To compute the baseline probabilities $m$, the vector of word-counts by document needs to be unlisted and then normalized. Hence $\textit{documents}$, need to be transformed to a vector of indexed counts with length $V$.

In [161]:
flat_documents = [item for sublist in documents for item in sublist]

In [174]:
m = []
total_sum = sum(n for _, n in flat_documents)

for elem in flat_documents: 
    m.append(elem[1] / total_sum)
    
# drop words with less than 2 occurrences

m = np.log(m) - np.log(np.mean(m)) #logit of m