In [1]:
import csv
import re

from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist

import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
NEG_CONTRACTIONS = [
    (r'aren\'t', 'are not'),
    (r'can\'t', 'can not'),
    (r'couldn\'t', 'could not'),
    (r'daren\'t', 'dare not'),
    (r'didn\'t', 'did not'),
    (r'doesn\'t', 'does not'),
    (r'don\'t', 'do not'),
    (r'isn\'t', 'is not'),
    (r'hasn\'t', 'has not'),
    (r'haven\'t', 'have not'),
    (r'hadn\'t', 'had not'),
    (r'mayn\'t', 'may not'),
    (r'mightn\'t', 'might not'),
    (r'mustn\'t', 'must not'),
    (r'needn\'t', 'need not'),
    (r'oughtn\'t', 'ought not'),
    (r'shan\'t', 'shall not'),
    (r'shouldn\'t', 'should not'),
    (r'wasn\'t', 'was not'),
    (r'weren\'t', 'were not'),
    (r'won\'t', 'will not'),
    (r'wouldn\'t', 'would not'),
    (r'ain\'t', 'am not') # not only but stopword anyway
]

BLACKLIST_STOPWORDS = ['over','only','very','not','no']
ENGLISH_STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS)

OTHER_CONTRACTIONS = {
    "'m": 'am',
    "'ll": 'will',
    "'s": 'has', # or 'is' but both are stopwords
    "'d": 'had'  # or 'would' but both are stopwords
}

In [3]:
def readTrainFile(file):
    with open(file,'r') as tsvin:
        tsvin = csv.reader(tsvin,delimiter ='\t')
        parsedFile = {"label" :[],"statement" :[],"subject" :[],"speaker":[],"speakerJob":[],"stateInfo":[],"partyAffiliation":[],"context":[]}
        for rowNum,row in enumerate(tsvin):
            try:
                parsedFile["label"].append(row[0])
                parsedFile["statement"].append(row[1])
                parsedFile["subject"].append(row[2])
                parsedFile["speaker"].append(row[3])
                parsedFile["speakerJob"].append(row[4])
                parsedFile["stateInfo"].append(row[5])
                parsedFile["partyAffiliation"].append(row[6])
                parsedFile["context"].append(row[7])
            except:
                print("Few inputs are in invalid format")
                #print(rowNum)
                #print(row)

        return parsedFile

In [48]:
def readTestFile(file):
    with open(file,'r') as tsvin:
        tsvin = csv.reader(tsvin,delimiter ='\t')
        parsedFile = {"statement" :[],"subject" :[],"speaker":[],"speakerJob":[],"stateInfo":[],"partyAffiliation":[],"context":[]}
        for rowNum,row in enumerate(tsvin):
            try:
                parsedFile["statement"].append(row[0])
                parsedFile["subject"].append(row[1])
                parsedFile["speaker"].append(row[2])
                parsedFile["speakerJob"].append(row[3])
                parsedFile["stateInfo"].append(row[4])
                parsedFile["partyAffiliation"].append(row[5])
                parsedFile["context"].append(row[6])
            except:
                print("Few inputs are in invalid format")
                print(rowNum)
                print(row)

        return parsedFile

In [60]:
print(len(parsedTest["statement"]))

1283


In [5]:
# The input statement is expected a string.
def preProcessing(text,delimiter=' ',n=1):
    tokenisedOutput = []
    stemmer = PorterStemmer()
    for line in text:
        tokens = []

        ## Convert the line into lower case
        line = line.lower()

        ## Transform negative contractions
        for neg in NEG_CONTRACTIONS:
            line = re.sub(neg[0], neg[1], line)

        ## Tokenising the words
        tokens = word_tokenize(line)

        # transform other contractions (e.g 'll --> will)
        tokens = [OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token)
                  else token for token in tokens]

        # removing punctuations, only retain words, no numbers and punctuation marks.
        r = r'[a-z]+'
        tokens = [word for word in tokens if re.search(r, word)]

        # # remove irrelevant stop words
        # tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]

        # stemming
        #tokens = [stemmer.stem(token) for token in tokens]


        ## Probably not required if using RNN for classification
        if n == 1:
            # return the list of words
            tokenisedOutput.append(tokens)
        else:
            # return the list of ngrams
            tokenisedOutput.append(ngrams(tokens, n))
        ##print(tokens)

    return tokenisedOutput

In [6]:
## Returns the indice of the statment which can be used for embedding lookup
maxSeqLength = 200

def statementIndices(text,dictionary,outputLength):
    tokenListIndices = np.zeros((outputLength,maxSeqLength))
    lineCount = 0
    tokenCount = 0
    
    for line in text:
        tokenCount = 0
        for token in line:
            try:
                tokenListIndices[lineCount][tokenCount] = dictionary[token]
            except:
                tokenListIndices[lineCount][tokenCount] = 399999
            tokenCount = tokenCount + 1
            if(tokenCount >= maxSeqLength):
                break
        lineCount = lineCount + 1

    return tokenListIndices

In [7]:
def labelVectors(labels):
    labelVectors = []
    defaultVectors = {"pants-fire":np.array([1,0,0,0,0,0]),"false":np.array([0,1,0,0,0,0]),"barely-true":np.array([0,0,1,0,0,0]),
                      "half-true":np.array([0,0,0,1,0,0]),"mostly-true":np.array([0,0,0,0,1,0]),"true":np.array([0,0,0,0,0,1])}
    for label in labels:
        labelVectors.append(defaultVectors[label])
    return np.asarray(labelVectors)

In [8]:
def loadGlove(embeddingFile):
    vocab = []
    embedding = []
    dictionary = {}
    reverseDictionary = {}
    count = 0
    file = open(embeddingFile, 'r')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embedding.append(row[1:])
        dictionary[row[0]] = count
        reverseDictionary[count] = row[0]
        count = count + 1
    print('Loaded GloVe!')
    file.close()
    return vocab, embedding,dictionary,reverseDictionary

In [9]:
trainingFile = "/Users/sainikhilmaram/OneDrive/UCSB courses/Winter 2018/Deep Learning/HW2/liar_dataset/train.tsv"
embeddingFile = "/Users/sainikhilmaram/Desktop/glove/glove.6B.300d.txt"
testFile = "/Users/sainikhilmaram/OneDrive/UCSB courses/Winter 2018/Deep Learning/HW2/liar_dataset/test.tsv"

In [10]:
vocab,embedding,dictionary,reverseDictionary = loadGlove(embeddingFile)
vocabSize = len(vocab)
embeddingSize = len(embedding[0]) ## 300
embedding = np.asarray(embedding)
vocab = np.asarray(vocab)

Loaded GloVe!


In [11]:
parsedTraining = readTrainFile(trainingFile)
## Tokenising the statement file
##tokenisedStatement = preProcessing(["I shouldn't,have came here at 3","I'll be the Boss."])
tokenisedStatement = preProcessing(parsedTraining["statement"])

## getting the indices of the word.
tokenisedStatementIndices = statementIndices(tokenisedStatement,dictionary,len(tokenisedStatement))
#print(tokenisedStatementIndices[0])

## Output labels are converted into vectors
outputLabelVectors = labelVectors(parsedTraining["label"])

print(len(outputLabelVectors))

Few inputs are in invalid format
Few inputs are in invalid format
10240


In [47]:
print(len(parsedTraining["statement"]))

10240


In [49]:
parsedTest = readTestFile(testFile)

In [51]:
print(len(parsedTest["statement"]))

1283


In [61]:
parsedTest = readTestFile(testFile)
tokenisedStatementTest = preProcessing(parsedTest["statement"])
## getting the indices of the word.
tokenisedStatementIndicesTest = statementIndices(tokenisedStatementTest,dictionary,len(tokenisedStatementTest))

In [13]:
print(tokenisedStatementIndices[0:10])

[[2.10000e+02 0.00000e+00 2.65469e+05 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [6.10000e+01 1.19000e+02 0.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [4.53900e+03 4.43000e+02 6.14300e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 ...
 [4.10000e+01 9.13000e+02 0.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [2.12000e+02 2.00000e+01 2.47000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [2.10000e+02 3.52100e+03 1.41700e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]


In [14]:
print(outputLabelVectors[0:10])

[[0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 0 1]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]]


In [15]:
print(vocabSize)
print(embeddingSize)
print(embedding[4736][2])

400000
300
-0.08155


In [16]:
def embeddingMatrix(sess,vocabSize,embeddingSize,embedding):
    W = tf.Variable(tf.constant(0.0, shape=[vocabSize, embeddingSize]),
                    trainable=False, name="W")
    embeddingPlaceholder = tf.placeholder(tf.float32, shape=[vocabSize, embeddingSize])
    embeddingInit = W.assign(embeddingPlaceholder)
    sess.run(embeddingInit, feed_dict={embeddingPlaceholder: embedding})
    return W

In [17]:
sess = tf.Session()

In [18]:
## returns the embedding matrix which can be used for look up
embeddingMatrixWeights = embeddingMatrix(sess,vocabSize,embeddingSize,embedding)

In [19]:
wordIndices = tf.placeholder(tf.int32,shape=[None])
embeddedWords = tf.nn.embedding_lookup(embeddingMatrixWeights,wordIndices)

print(sess.run(embeddedWords,feed_dict={wordIndices:[1,2,3,4,5]}))


[[-0.25539  -0.25723   0.13169  ... -0.2329   -0.12226   0.35499 ]
 [-0.12559   0.01363   0.10306  ... -0.34224  -0.022394  0.13684 ]
 [-0.076947 -0.021211  0.21271  ...  0.18351  -0.29183  -0.046533]
 [-0.25756  -0.057132 -0.6719   ... -0.16043   0.046744 -0.070621]
 [ 0.038466 -0.039792  0.082747 ... -0.33427   0.011807  0.059703]]


## Build the neural network

In [20]:
batchSize = 24
lstmUnits = 64
numClasses = 6
iterations = 10

In [21]:
## Place holders for input data and labels
labels = tf.placeholder(tf.float32, [None, numClasses])
input_data = tf.placeholder(tf.int32, [None, maxSeqLength])

In [22]:
#data = tf.Variable(tf.zeros([batchSize, None, embeddingSize]),dtype=tf.float32)
data = tf.nn.embedding_lookup(embeddingMatrixWeights,input_data)

In [23]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [24]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [25]:
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [26]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [52]:
def getTrainBatch(index,batchSize,tokenisedStatementIndices,outputLabelVectors):
    return tokenisedStatementIndices[index : index + batchSize] , np.array(outputLabelVectors[index:index+batchSize])

In [53]:
def getTestBatch(index,batchSize,tokenisedStatementIndicesTest):
    return tokenisedStatementIndicesTest[index : index + batchSize]

In [68]:
def trainModel(sess,tokenisedStatementIndices,outputLabelVectors,iterations=10):
    sess.run(tf.global_variables_initializer())
    trainingDataSize = len(tokenisedStatementIndices)
    for i in range(iterations):
        index = 0
        ## If data present is not exact multiple of batch size
        while index < trainingDataSize:
            if(index + batchSize <= trainingDataSize):
                size = batchSize
            else:
                size = trainingDataSize - index
            inputData,outputData = getTrainBatch(index,size,tokenisedStatementIndices,outputLabelVectors)
            sess.run(optimizer,feed_dict={input_data:inputData,labels:outputData})
            index = index + size

In [None]:
trainModel(sess,tokenisedStatementIndices,outputLabelVectors)

In [54]:
# sess.run(tf.global_variables_initializer())
# trainingDataSize = len(tokenisedStatementIndices)
# for i in range(1):
#     index = 0
#     ## If data present is not exact multiple of batch size
#     while index < trainingDataSize:
#         if(index + batchSize <= trainingDataSize):
#             size = batchSize
#         else:
#             size = trainingDataSize - index
#         inputData,outputData = getTrainBatch(index,size,tokenisedStatementIndices,outputLabelVectors)
#         sess.run(optimizer,feed_dict={input_data:inputData,labels:outputData})
#         index = index + size

In [56]:
correctPrediction = tf.argmax(prediction,1)

In [62]:
outputPrediction = []
testDataSize = len(tokenisedStatementIndicesTest)
for i in range(1):
    index = 0
    while index < testDataSize:
        if(index + batchSize <= testDataSize):
            size = batchSize
        else:
            size = testDataSize - index
        inputData = getTestBatch(index,batchSize,tokenisedStatementIndicesTest)
        outputPrediction.extend(sess.run(correctPrediction,feed_dict={input_data:inputData}))
        index = index + size

In [63]:
print(len(tokenisedStatementIndicesTest))

1283


In [64]:
print(len(outputPrediction))

1283


In [65]:
inputLabels = {1:"pants-fire",2:"false",3:"barely-true",4:"half-true",5:"mostly-true",6:"true"}
def saveFile(outputPrediction,fileName):
    f = open(fileName,'w')
    for i in range(len(outputPrediction)):
        s = inputLabels[outputPrediction[i]]
        s = s +"\n"
        f.write(s)
    

In [66]:
saveFile(outputPrediction,"predictions.txt")