In [124]:
# Import the necessary packages
import numpy as np
import pandas as pd
import random
import scipy.stats
import scipy.special
import nltk
import math
# Download the NLTK 'stopwords' and 'punkt' resources (in that order)
for nltkResource in ('stopwords', 'punkt'):
    nltk.download(nltkResource)

# Read the text message library from the "Library" sheet of the workbook,
# skipping the first 23 rows of that sheet
txtLibraryPath = '/Users/Nikki/Dropbox/UNC/Causal NLP/Reback_TxtLibrary/Reback_Project Tech Support Text Message Library_NF.xlsx'
txtLibrary = pd.read_excel(
    txtLibraryPath,
    sheet_name="Library",
    skiprows=23,
)

[nltk_data] Downloading package stopwords to /Users/Nikki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Nikki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Preparation

## Tidy up the text message library 

* Remove the skipped lines
* Rename the columns

In [125]:
# Label the two library columns: message ID and message text
txtLibrary.columns = ['txtID', 'txt']

# Keep only the rows in which no value is missing
txtLibrary = txtLibrary.dropna(axis=0, how='any')

## Pre-processing

* Make all words lower case
* Remove single character words (e.g., "a", "i", "n")
* Remove numbers
* Remove punctuation and the fill-in-the-blank underscores
* Collapse extra white space
* Tokenize
* Remove stop words
* Stemming

In [126]:
# NOTE: every pattern passed to str.replace below is a regular expression, so
# regex=True is given explicitly. pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn each of these steps into a no-op.

# Lower case
txtLibrary['txt'] = txtLibrary['txt'].str.lower()

# Remove single character words
# (runs before punctuation removal, so the "t" in "ain't" is dropped as well)
txtLibrary['txt'] = txtLibrary['txt'].str.replace(r'\b[a-z]\b', "", regex=True)

# Remove numbers
txtLibrary['txt'] = txtLibrary['txt'].str.replace(r'[0-9]', "", regex=True)

# Remove punctuation (anything that is neither a word character nor white space)
txtLibrary['txt'] = txtLibrary['txt'].str.replace(r'[^\w\s]', "", regex=True)

# Remove the fill-in-the-blank blanks
# (underscores count as word characters, so the step above leaves them in place)
txtLibrary['txt'] = txtLibrary['txt'].str.replace(r'[_]+', "", regex=True)

# Remove extra white space
txtLibrary['txt'] = txtLibrary['txt'].str.replace(r'[\s]+', " ", regex=True)

# Make a list containing the text message strings
txtMsgStringList = txtLibrary['txt'].tolist()

# Tokenize: one list of word tokens per text message
txtMsgTokens = [nltk.tokenize.word_tokenize(x) for x in txtMsgStringList]

# Remove stop words (a set gives constant-time membership tests)
stop_words = set(nltk.corpus.stopwords.words('english'))
txtMsgTokens = [[w for w in text if w not in stop_words] for text in txtMsgTokens]

# Stem tokens with the Porter stemmer
porter = nltk.stem.porter.PorterStemmer()
txtMsgTokensStemmed = [[porter.stem(word) for word in text] for text in txtMsgTokens]

## Create the TF-IDF matrix

In [127]:
# Build the TF-IDF matrix: one row per text message, one column per unique
# stemmed token (columns follow the sorted order of uniqueStemmedTokens).

# Flatten the token list
txtMsgTokensStemmed_flat = [token for text in txtMsgTokensStemmed for token in text]

# Convert the flattened token list into an array
# (dtype=str keeps a string dtype even when there are no tokens at all)
txtMsgTokensStemmed_flatArray = np.array(txtMsgTokensStemmed_flat, dtype=str)

# Get the unique stemmed tokens (np.unique returns them sorted)
uniqueStemmedTokens = np.unique(txtMsgTokensStemmed_flatArray)

# Count the total number of unique stemmed tokens
N_stemmedTokens = uniqueStemmedTokens.size

# Create a list of text message arrays
txtMsgTokensStemmed_array = [np.array(text, dtype=str) for text in txtMsgTokensStemmed]

# Number of documents in the corpus
N_textMessages = len(txtMsgTokensStemmed)

# Count up the frequency of each stemmed word in the texts.
# Every token is present in the sorted vocabulary, so searchsorted returns its
# exact column index and bincount tallies the occurrences per column.
countMatrix = np.zeros((N_textMessages, N_stemmedTokens), dtype=int)
for row, tokens in enumerate(txtMsgTokensStemmed_array):
    if tokens.size > 0:
        columnIndex = np.searchsorted(uniqueStemmedTokens, tokens)
        countMatrix[row] = np.bincount(columnIndex, minlength=N_stemmedTokens)
countOfStemmedTokensInTextMsg = countMatrix.tolist()

# Count up the number of stemmed tokens in each text message
N_stemmedTokensInTextMsg = [len(text) for text in txtMsgTokensStemmed]

# Term frequency
# This is tf = (count of token in message) / (number of tokens in message).
# A message left with no tokens after pre-processing gets tf = 0 for every
# token instead of triggering a division by zero.
tokenTotals = np.array(N_stemmedTokensInTextMsg, dtype=float)[:, np.newaxis]
tfMatrix = np.divide(countMatrix, tokenTotals,
                     out=np.zeros(countMatrix.shape, dtype=float),
                     where=tokenTotals > 0)
tf = tfMatrix.tolist()

# Count of the number of occurrences of each word in the corpus (count of number of texts with word)
# This is df = document frequency
occurrenceOfStemmedTokensInCorpus = (countMatrix > 0).sum(axis=0).tolist()

# Add 1 to occurrence counts
occurrenceOfStemmedTokensInCorpus_smoothedArray = np.array(occurrenceOfStemmedTokensInCorpus, dtype=int) + 1

# IDF = log(N/(df + 1))
# With this smoothing, a token that appears in every message gets a slightly
# negative IDF, and one that appears in all but one message gets exactly 0.
idf = np.log(N_textMessages/occurrenceOfStemmedTokensInCorpus_smoothedArray)

# Calculate TF-IDF (idf is broadcast across the rows of the tf matrix)
tf_idf_array = tfMatrix * idf

# Per-message TF-IDF vectors, kept as a list of 1-D arrays
tf_idf = list(tf_idf_array)

## Create the participants and their outcomes

### Parameters for participant creation

* Set the seed
* R is the number of participants
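* `trueMeansList` is the list of true outcome means, one per topic (stored per message in the `true_mean` column)
* `trueSDsList` is the list of true outcome SDs, one per topic (stored per message in the `sigma_true` column)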

In [128]:
# Set the seed
# The outcomes are drawn with np.random.normal, which uses NumPy's global
# generator; seeding only Python's `random` module would leave those draws
# different on every run. Both generators are seeded with the same value.
random.seed(1001)
np.random.seed(1001)

# Number of participants (one per text message in the library)
R = len(txtLibrary.txtID)

# Extract the true topics: the first character of the message ID
# (e.g. 'A1a001' -> 'A', 'Post1' -> 'P')
txtLibrary['topic'] = txtLibrary.txtID.astype(str).str[0]

# True means and standard deviations, listed in the same order as the topics
trueTopicList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'P']
trueMeansList = [-20, -15, -10, -5, 0, 5, 10, 15, 20]
trueSDsList = [1, 1, 1, 1, 1, 1, 1, 1, 1]

# Topic -> parameter lookups
trueTopicsAndMeansDict = dict(zip(trueTopicList, trueMeansList))
trueTopicsAndSDsDict = dict(zip(trueTopicList, trueSDsList))

# Attach each message's true mean and SD; a topic letter that is missing from
# trueTopicList maps to NaN
txtLibrary['true_mean'] = txtLibrary.topic.map(trueTopicsAndMeansDict)
txtLibrary['sigma_true'] = txtLibrary.topic.map(trueTopicsAndSDsDict)

### Create participant outcomes

In [129]:
# Index the library by message ID. The guard keeps the cell re-runnable:
# once 'txtID' has become the index it is no longer a column, and a second
# set_index('txtID') would raise a KeyError.
if 'txtID' in txtLibrary.columns:
    txtLibrary = txtLibrary.set_index('txtID')

# Simulate one outcome per text message: Y ~ Normal(true_mean, sigma_true).
# A single vectorised draw creates Y as a float column from the start
# (writing floats cell-by-cell into an integer column of zeros warns or fails
# on current pandas) and does not depend on the txtID values being unique.
txtLibrary['Y'] = np.random.normal(loc=txtLibrary['true_mean'].values,
                                   scale=txtLibrary['sigma_true'].values)

                                                      txt topic  true_mean  \
txtID                                                                        
A1a001   meth will mess with your hiv be good to yourself     A        -20   
A1a002  some strains are drug resistant you are worth ...     A        -20   
A1a003                        poz partying not good combo     A        -20   
A1a004  poz using not good cocktail you deserve to be ...     A        -20   
A1a005  tina ain your friend if she makes you forget y...     A        -20   
...                                                   ...   ...        ...   
Post1            hi just reminder your followup visit is      P         20   
Post2   hi your followup visit is couple weeks away it...     P         20   
Post3   hi hope all is well your appt is scheduled for...     P         20   
Post4                  your followup appt is see you then     P         20   
Post5   your appt is coming up it set for call us if t...     P 