Short text topic modeling and double machine learning for causal inference and NLP

In [8]:
# Import the necessary packages
import numpy as np
import pandas as pd
import random
import scipy.stats
import scipy.special
import nltk
# Fetch the NLTK resources (stop-word list and punkt tokenizer data)
for nltkResource in ('stopwords', 'punkt'):
    nltk.download(nltkResource)

# Load the text library: read the "Library" sheet, skipping its first 23 rows
# NOTE(review): this is an absolute, machine-specific path -- the notebook only
# runs as-is on a machine that has the workbook at this location.
txtLibraryPath = '/Users/Nikki/Dropbox/UNC/Causal NLP/Reback_TxtLibrary/Reback_Project Tech Support Text Message Library_NF.xlsx'
txtLibrary = pd.read_excel(io=txtLibraryPath, sheet_name="Library", skiprows=23)

[nltk_data] Downloading package stopwords to /Users/Nikki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Nikki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Preparation

# Tidy up the text message library 

* Remove the skipped lines
* Rename the columns

In [9]:
# Label the two columns: message ID and message text
txtLibrary.columns = ['txtID', 'txt']

# Keep only the rows in which every column is populated
rowIsComplete = txtLibrary.notna().all(axis=1)
txtLibrary = txtLibrary[rowIsComplete]

# Pre-processing

* Make all words lower case
* Remove numbers
* Remove single character words (e.g., "a", "i", "n")
* Remove stop words
* Stemming

In [10]:
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently turn each cleaning step into a no-op.

# Lower case
txtLibrary.txt = txtLibrary.txt.str.lower()

# Remove single character words
txtLibrary.txt = txtLibrary.txt.str.replace(r'\b[a-z]\b', "", regex=True)

# Remove numbers
txtLibrary.txt = txtLibrary.txt.str.replace(r'[0-9]', "", regex=True)

# Remove punctuation (underscores count as word characters, so they survive this step)
txtLibrary.txt = txtLibrary.txt.str.replace(r'[^\w\s]', "", regex=True)

# Remove the fill-in-the-blank blanks
txtLibrary.txt = txtLibrary.txt.str.replace(r'[_]+', "", regex=True)

# Remove extra white space
txtLibrary.txt = txtLibrary.txt.str.replace(r'[\s]+', " ", regex=True)

# Make a list containing the text message strings
txtMsgStringList = txtLibrary.txt.tolist()

# Tokenize: one list of word tokens per message
txtMsgTokens = [nltk.tokenize.word_tokenize(x) for x in txtMsgStringList]

# Remove stop words (a set gives constant-time membership tests)
stop_words = set(nltk.corpus.stopwords.words('english'))
txtMsgTokens = [[w for w in text if w not in stop_words] for text in txtMsgTokens]

# Stem tokens
porter = nltk.stem.porter.PorterStemmer()
txtMsgTokensStemmed = [[porter.stem(word) for word in text] for text in txtMsgTokens]

## Create the participants and their outcomes

### Parameters for ppt creation

* Set the seed
* R is the number of participants
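* `trueMeansList` is the list of true means for the outcomes (one per topic), stored per message in the `true_mean` column
* `trueSDsList` is the list of true sds for the outcomes (one per topic), stored per message in the `sigma_true` column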

In [11]:
# Set the seed
# The random draws in this notebook (outcomes, folds, topic sampling) are made
# with np.random, which random.seed() does not affect, so NumPy's global
# generator is seeded as well.
random.seed(1001)
np.random.seed(1001)

# Number of participants (one per text message)
R = len(txtLibrary.txtID)

# Extract the true topics: the first character of each message ID
txtLibrary['topic'] = txtLibrary.txtID.astype(str).str[0]

# True means and standard deviations, one entry per topic letter
trueTopicList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'P']
trueMeansList = [-20, -15, -10, -5, 0, 5, 10, 15, 20]
trueSDsList = [1, 1, 1, 1, 1, 1, 1, 1, 1]
trueTopicsAndMeansDict = dict(zip(trueTopicList, trueMeansList))
trueTopicsAndSDsDict = dict(zip(trueTopicList, trueSDsList))

# Attach each message's true mean and sd; topics missing from the dicts map to NaN
txtLibrary['true_mean'] = txtLibrary.topic.map(trueTopicsAndMeansDict)
txtLibrary['sigma_true'] = txtLibrary.topic.map(trueTopicsAndSDsDict)

### Create participant outcomes

In [12]:
# Index the library by message ID (guarded so that re-running the cell does not
# fail once 'txtID' has already become the index)
if 'txtID' in txtLibrary.columns:
    txtLibrary = txtLibrary.set_index('txtID')

# Draw one outcome per message from N(true_mean, sigma_true) in a single
# vectorized call. Assigning the whole float array at once avoids writing
# floats element-by-element into an integer-initialised column.
txtLibrary['Y'] = np.random.normal(loc=np.asarray(txtLibrary['true_mean'], dtype=float),
                                   scale=np.asarray(txtLibrary['sigma_true'], dtype=float))

## Create the folds

There are 660 text messages. For five-fold cross validation, we'll have 132 text messages in each fold

In [13]:
# Number of cross-validation folds
nFolds = 5

# Shuffle all message IDs once, then cut the shuffled IDs into nFolds
# near-equal parts. Every message lands in exactly one fold, whatever the size
# of the library (with 660 messages each fold has 132).
allMsgIDs = np.asarray(txtLibrary.index)
shuffledMsgIDs = allMsgIDs[np.random.permutation(len(allMsgIDs))]

# folds is a list of nFolds 1-D arrays of message IDs
folds = np.array_split(shuffledMsgIDs, nFolds)

fold1IDs, fold2IDs, fold3IDs, fold4IDs, fold5IDs = folds


## Prepare for training the model

### Put the document labels and tokens in a sparse-matrix-like structure

In [14]:
# Message IDs: for each message, its ID repeated once per stemmed token
msgIDs = []
for msgPos in range(0, len(txtLibrary.index)):
    msgIDs.append([txtLibrary.index[msgPos]] * len(txtMsgTokensStemmed[msgPos]))

# Flatten to one ID per token
msgIDsLong = []
for idsForOneMsg in msgIDs:
    msgIDsLong.extend(idsForOneMsg)
msgIDsArray = np.array(msgIDsLong)

# Text Messages: all stemmed tokens, flattened in the same message order
txtTokensLong = []
for tokensForOneMsg in txtMsgTokensStemmed:
    txtTokensLong.extend(tokensForOneMsg)
txtTokensArray = np.array(txtTokensLong)

## Create the training sets

In [15]:
def createTrainingSet(holdOutSetIDs, msgIDsArray, txtTokensArray):
    """Drop the entries that belong to held-out messages.

    Parameters
    ----------
    holdOutSetIDs : array-like
        Message IDs to exclude.
    msgIDsArray : numpy array
        One message ID per token.
    txtTokensArray : numpy array
        Tokens, indexed with the mask computed from ``msgIDsArray``.

    Returns
    -------
    tuple
        ``(trainingTokens, trainingIDsLong)``: the tokens and message IDs at
        the positions whose ID is not in ``holdOutSetIDs``.
    """
    keepMask = np.isin(msgIDsArray, holdOutSetIDs, invert=True)
    return txtTokensArray[keepMask], msgIDsArray[keepMask]

## Topic Modeling

### The Gibbs sampling algorithm

We have the following terms:


* $m_z$ is the number of documents in cluster $z$
* $n_z$ is the number of words in cluster $z$
* $n_z^w$ is the number of occurrences of word w in cluster $z$
* $N_d$ is the number of words in document $d$
* $N_d^w$ is the number of occurrences of word $w$ in document $d$
* $V$ is the number of words in the vocabulary
* $D$ is the number of documents in the corpus
* $m_k = \sum_{d = 1}^D \mathbb{1}(z_d = k)$
* $n_k^w = \sum_{d = 1}^D \mathbb{1}(z_d = k) N_d^w$
* $m_{k, -d} = $ number of documents in cluster $k$ without considering document $d$
* $n_{k, -d^\ast}^w = \sum_{d \ne d^\ast} \mathbb{1}(z_d = k) N_d^w = $ number of occurrences of $w$ in $k$ without considering document $d^\ast$
* $n_{k, -d} = $ number of words in $k$ without considering document $d$

The Gibbs update equation is given by 

\begin{align*}
    P(z_{d^\ast} = \tilde{k} \vert z_{-d^\ast}, w, \alpha, \beta) &\propto \frac{\prod_{v = 1}^V \prod_{i = 1}^{N_{d^\ast}^v} \left( n_{\tilde{k}, -d^\ast}^{v} + \beta_{\tilde{k}, v} + i - 1 \right)}{\prod_{i = 1}^{N_{d^\ast}} \left( n_{\tilde{k}, -d^\ast} + \sum_v \beta_{\tilde{k}, v} + i - 1 \right)} \times \frac{m_{\tilde{k}, -d^\ast} + \alpha_{\tilde{k}}}{D - 1 + \sum_k \alpha_k} \times P(y \vert topic = \tilde{k})
\end{align*}

where

\begin{align*}
    y \vert topic \sim N(\beta_1 + \beta_2 \mathbb{1}(topic = 2) + \cdots + \beta_K \mathbb{1}(topic = K), \sigma^2).
\end{align*}


The parameters and latent variables to keep track of are the $z_d$s and $\beta$s. The update for the supervision can be done via block Gibbs sampling. Chapter 14.2 gives the algorithm for drawing from the posterior for normal regression with the standard noninformative prior distribution.

### Algorithm as described in the paper

In [16]:
# Training set is an array of arrays
# Inner arrays are an array of each document's tokens (after stemming)
uniqueMsgIDs = np.unique(msgIDsArray)
trainingSetIDs = set(uniqueMsgIDs) - set(fold1IDs)
trainingSetIDs = np.array(list(trainingSetIDs))
txtMsgTokensStemmedInArrays = [np.array(x) for x in txtMsgTokensStemmed]
txtMsgTokensStemmedInArrays = np.array(txtMsgTokensStemmedInArrays)
trainingSetLogical = np.isin(uniqueMsgIDs, trainingSetIDs)
trainingSetLogical = trainingSetLogical.tolist()
trainingSet = txtMsgTokensStemmedInArrays[trainingSetLogical]

In [19]:
# Collapsed Gibbs sampler for the cluster label of each training document.
# Only the word term and the cluster-size term of the update are computed
# here; no outcome term P(y | topic) enters the sampling probabilities.

# Algorithm parameters, hyperparameters
K = 8                                     # number of clusters (topics)
uniqueTokens = np.unique(txtTokensArray)  # sorted vocabulary
V = len(uniqueTokens)
D = len(trainingSet)
z = np.zeros(D)                           # current label of each document, in 1..K (stored as floats)
L = 1000 # Number of samples to draw
delta = np.ones(V, dtype=int)             # prior pseudo-count per vocabulary word
alpha = np.ones(K, dtype=int)             # prior pseudo-count per cluster
deltaSum = delta.sum()
alphaSum = alpha.sum()
topicLabels = np.arange(1, K + 1)

# Per-document word statistics, computed once instead of inside the sampler:
#   docWordCols[d]   -> vocabulary column of each distinct word in document d
#   docWordCounts[d] -> number of occurrences of that word in document d (N_d^w)
#   docLengths[d]    -> total number of tokens in document d (N_d)
tokenToColumn = {token: col for col, token in enumerate(uniqueTokens)}
docWordCols = []
docWordCounts = []
docLengths = np.zeros(D, dtype=int)
for d in range(D):
    docTokens = trainingSet[d]
    docLengths[d] = len(docTokens)
    if len(docTokens) == 0:
        # A document without tokens contributes no word counts
        docWordCols.append(np.array([], dtype=int))
        docWordCounts.append(np.array([], dtype=int))
        continue
    distinctWords, occurrences = np.unique(docTokens, return_counts=True)
    docWordCols.append(np.array([tokenToColumn[w] for w in distinctWords], dtype=int))
    docWordCounts.append(occurrences)

# Initial values
m_zs = np.zeros(K)                # m_z: number of documents in each cluster
n_zs = np.zeros(K)                # n_z: number of tokens in each cluster
n_z__ws = np.zeros(shape = (K, V))  # n_z^w: occurrences of each word in each cluster

for d in range(D):
    # Draw an initial topic and store it in z
    current_z = np.random.choice(topicLabels)
    z[d] = current_z

    # Add the document to the counts of its cluster. The columns of a document
    # are distinct, so the fancy-indexed += touches each cell exactly once.
    m_zs[current_z - 1] += 1
    n_zs[current_z - 1] += docLengths[d]
    n_z__ws[current_z - 1, docWordCols[d]] += docWordCounts[d]

# Store a copy: z is updated in place below, and appending z itself would make
# the first chain entry follow every later change.
zChain = [z.copy()]

# Normalizer of the cluster-size term; the same for every cluster
logClusterDenom = np.log(D - 1 + alphaSum)

for l in range(L):

    for d in range(D):
        cols = docWordCols[d]
        counts = docWordCounts[d]
        N_d = docLengths[d]

        # Remove document d from the counts of its current cluster
        k_old = int(z[d]) - 1
        m_zs[k_old] -= 1
        n_zs[k_old] -= N_d
        n_z__ws[k_old, cols] -= counts

        # Word term for all K clusters at once, in log space.
        # prod_{j=1..c} (x + j - 1) = Gamma(x + c) / Gamma(x), so each product
        # of the update equation becomes a difference of log-gammas.
        smoothedWordCounts = n_z__ws[:, cols] + delta[cols]          # shape (K, #distinct words)
        logWordNum = (scipy.special.gammaln(smoothedWordCounts + counts)
                      - scipy.special.gammaln(smoothedWordCounts)).sum(axis=1)
        smoothedTotals = n_zs + deltaSum                              # shape (K,)
        logWordDenom = (scipy.special.gammaln(smoothedTotals + N_d)
                        - scipy.special.gammaln(smoothedTotals))

        # Cluster-size term
        logClusterTerm = np.log(m_zs + alpha) - logClusterDenom

        # Normalize in log space before exponentiating to avoid overflow/underflow
        logProbs = logWordNum - logWordDenom + logClusterTerm
        probs = np.exp(logProbs - scipy.special.logsumexp(logProbs))
        probs = probs / probs.sum()

        # Draw a new topic
        new_z = np.random.choice(topicLabels, p = probs)
        z[d] = new_z

        # Add document d to the counts of its new cluster
        m_zs[new_z - 1] += 1
        n_zs[new_z - 1] += N_d
        n_z__ws[new_z - 1, cols] += counts

    # Record the labels after this full sweep over the documents
    zChain.append(z.copy())
        
        
        
            
        
        
    




In [20]:
# One row per entry of zChain; columns are labelled with trainingSetIDs
zChainDf = pd.DataFrame(zChain, columns=trainingSetIDs)

# Write the chain to disk
zChainDf.to_csv(path_or_buf='zchain.csv')

 ### Use the posterior predictive to predict topics for the test set

## Estimate the causal estimands