In [4]:
import json
import random
from keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Conv2D
import numpy
from gensim.models.fasttext import load_facebook_model
from sklearn.model_selection import train_test_split
from features import Features

from nltk.stem import WordNetLemmatizer

from nltk.tokenize import TweetTokenizer
from nltk import tokenize



# Shared NLTK helpers; fasttext_Vec uses them to tokenize and lemmatize text.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()


# Placeholder for the Keras model; set_model() stores the compiled model here.
trained_model = None
# Project-local feature extractor (from features.py), used by process_data.
f = Features()
# Self-check of the extractor; the recorded output below shows it printing
# "features setup correctly".
f.test()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


features setup correctly


In [5]:

# Coarse discourse labels. process_data turns a post's 'majority_type' into a
# class id with discourse.index(...), so the position of each name in this
# list is the class id the model is trained on.
discourse = ['other', 'agreement', 'announcement', 'appreciation', 'humor', 'answer', 'elaboration', 'negativereaction',
             'question', 'disagreement']


In [6]:
def load_fasttext(path='wiki-news-300d-1M-subword.bin'):
    """Load a Facebook-format fastText binary into the global ``fasttext_model``.

    Args:
        path: Location of the ``.bin`` model file. Defaults to the file name
            this notebook was written against, so ``load_fasttext()`` keeps
            working unchanged.
    """
    global fasttext_model
    fasttext_model = load_facebook_model(path)

def get_data(filename):
    """Read a JSON-lines file and return its records as a list.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8, so do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as jsonfile:
        for line in jsonfile:
            # Blank lines (for example a trailing newline at the end of the
            # file) carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [20]:
def process_data(load_data):
    """Build one feature matrix and one label per post of every thread.

    For each post the feature list is assembled in this order:
    the list returned by ``f.getStructureFeatures``, the fastText vector of
    the post body, the vector of the parent body, ``f.isSameAuthor``, the
    vector of the author name, a constant vector flagging whether the post's
    author equals the author of the thread's first post, the vector of the
    thread title, the vector of the subreddit name, and finally the entries
    of ``f.thread_info``. Missing text fields are replaced by zero vectors.

    A post is skipped (best effort) when any step raises, e.g. when it has no
    'majority_type'. The number of skipped posts and a breakdown by exception
    type are printed so the reason for the loss is visible. The "No ..."
    counters are incremented while features are built, so they also include
    posts that end up skipped.

    Args:
        load_data: Iterable of thread dicts, each with a 'posts' list
            (the structure returned by ``get_data``).

    Returns:
        Tuple ``(data, labels)`` of numpy arrays. ``labels`` holds one
        ``[class_id]`` row per kept post, where ``class_id`` is the index of
        the post's 'majority_type' in ``discourse``. In the recorded run the
        shapes were (101639, 16, 300) and (101639, 1).
    """
    dim = 300  # width of every per-field vector appended below
    skipped = 0
    skipped_by_type = {}
    count_no_author = 0
    count_no_body = 0
    count_no_title = 0
    process_data_list = []
    process_label_list = []
    for jline in load_data:
        posts = jline.get('posts') or []
        # Author of the thread's first post; None when unknown or when the
        # thread has no posts at all.
        author = posts[0].get('author') if posts else None
        for post in posts:
            try:
                # Structure
                features = f.getStructureFeatures(jline, post['id'])
                # Content
                if 'body' in post:
                    features.append(fasttext_Vec(post['body']))
                else:
                    features.append(numpy.zeros(dim))
                    count_no_body += 1

                # get the vector for the parent body
                features.append(fasttext_Vec(f.getParentBody(jline, post['id'])))
                # Author
                features.append(f.isSameAuthor(jline, post))

                if 'author' in post:
                    features.append(fasttext_Vec(post['author']))
                    same_as_first = 1.0 if author == post['author'] else 0.0
                    features.append(numpy.full(dim, same_as_first))
                else:
                    features.append(numpy.zeros(dim))
                    features.append(numpy.zeros(dim))
                    count_no_author += 1

                if 'title' in jline:
                    features.append(fasttext_Vec(jline['title']))
                else:
                    features.append(numpy.zeros(dim))
                    count_no_title += 1

                # Community
                if 'subreddit' in jline:
                    features.append(fasttext_Vec(jline['subreddit']))
                else:
                    features.append(numpy.zeros(dim))

                # Thread
                features += f.thread_info(jline)
                feature_nparr = numpy.array(features)
                label = discourse.index(post['majority_type'])
                process_label_list.append([label])
                process_data_list.append(feature_nparr)
            except Exception as e:
                # Deliberate best effort: drop the post, but remember why.
                skipped += 1
                kind = type(e).__name__
                skipped_by_type[kind] = skipped_by_type.get(kind, 0) + 1
    print("Total exception count: " + str(skipped))
    print("Exceptions by type: " + str(skipped_by_type))
    print("No authors: " + str(count_no_author))
    print("No bodies: " + str(count_no_body))
    print("No titles: " + str(count_no_title))
    process_data_list = numpy.array(process_data_list)
    process_label_list = numpy.array(process_label_list)
    print(process_data_list.shape)
    print(process_label_list.shape)
    print("done processing")
    return process_data_list, process_label_list

In [21]:

def set_model(data):
    """Build and compile the classifier and store it in ``trained_model``.

    The network is two stacked bidirectional LSTMs (100 units each), a 0.5
    dropout and a 10-way softmax, compiled with Adam and categorical
    cross-entropy. The per-sample input shape is taken from ``data[0]``.

    Args:
        data: Indexable collection of feature arrays; only ``data[0].shape``
            is read here.
    """
    global trained_model
    sample_shape = data[0].shape
    # Of the variants that were tried, this stack gave the best results.
    first_lstm = Bidirectional(
        LSTM(100, input_shape=sample_shape, return_sequences=True)
    )
    second_lstm = Bidirectional(LSTM(100))
    classifier_head = tf.keras.layers.Dense(10, activation='softmax')
    network = tf.keras.Sequential(
        [first_lstm, second_lstm, Dropout(0.5), classifier_head]
    )
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    trained_model = network

def train(data, labels):
    """Build a fresh model with ``set_model`` and fit it on ``data``/``labels``.

    Weights are checkpointed to 'training_1/cp.ckpt' after every epoch.
    Training runs for 10 epochs with batch size 64 and holds out 10% of the
    samples for validation.

    Args:
        data: Feature array passed to ``set_model`` and to ``fit``.
        labels: Integer class ids (indices into ``discourse``), one per sample.
    """
    checkpoint_path = 'training_1/cp.ckpt'
    # set_model assigns the compiled network to the global trained_model.
    set_model(data)
    cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1)
    # Pin the one-hot width to the number of discourse classes. Without it
    # to_categorical infers the width from the largest label present, which
    # no longer matches the 10-unit softmax when the highest ids are absent.
    one_hot_labels = to_categorical(labels, num_classes=len(discourse))
    trained_model.fit(data, one_hot_labels, validation_split=0.1, epochs=10, batch_size=64, callbacks=[cp_callback])

In [9]:
def fasttext_Vec(body):
    """Return the sum of the fastText vectors of the lemmatized tokens in ``body``.

    The text is split with the module-level ``tokenizer``, every token is
    lemmatized with the module-level ``lemmatizer`` and looked up in the
    loaded fastText model. Tokens without a vector contribute nothing.

    Args:
        body: Text to embed.

    Returns:
        A numpy array of length 300; all zeros when ``body`` yields no tokens
        or no token has a vector.

    Raises:
        NameError: If ``load_fasttext()`` has not been run yet.
    """
    # load_facebook_model returns a full FastText model whose word vectors are
    # exposed as ``.wv``; indexing the model object itself is deprecated in
    # gensim 3.x and not supported in 4.x. Fall back to the object itself in
    # case the global already holds keyed vectors.
    vectors = getattr(fasttext_model, 'wv', fasttext_model)
    output = numpy.zeros(300)
    for token in tokenizer.tokenize(body):
        try:
            output = numpy.add(output, vectors[lemmatizer.lemmatize(token)])
        except KeyError:
            # No vector for this token: leave the running sum unchanged.
            continue
    return output


In [10]:
# End-to-end run: load the embeddings, read the dataset, build the per-post
# features and labels, then train. data and labels are plain module-level
# names, so no `global` declaration is needed at cell level.
load_fasttext()
json_data = get_data("coarse_discourse_dump_reddit.jsonlist")
data, labels = process_data(json_data)
train(data, labels)


  if __name__ == '__main__':


Total exception count: 14718
No authors: 13841
No titles: 0
(101639, 16, 300)
(101639, 1)
done processing
Train on 81311 samples, validate on 20328 samples
Epoch 1/5
Epoch 00001: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 2/5
Epoch 00002: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/5
Epoch 00003: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 4/5
Epoch 00004: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 5/5
Epoch 00005: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.


In [None]:
# Same read / featurize / train sequence as the earlier run cell, but without
# load_fasttext(): process_data calls fasttext_Vec, so this only works when
# fasttext_model is already loaded in the kernel.
# NOTE(review): the cell has no execution count (In [None]); it looks like a
# leftover duplicate - confirm and consider removing it.
json_data = get_data("coarse_discourse_dump_reddit.jsonlist")
data, labels = process_data(json_data)
train(data, labels)

  if __name__ == '__main__':


In [22]:
# Retrain on the data/labels already held in the kernel, without recomputing
# the features. train() builds a new model on every call.
train(data, labels)

Train on 91475 samples, validate on 10164 samples
Epoch 1/10
Epoch 00001: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 2/10
Epoch 00002: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/10
Epoch 00003: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 4/10
Epoch 00004: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 5/10
Epoch 00005: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 6/10
Epoch 00006: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 7/10
Epoch 00007: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 8/10
Epoch 00008: saving model to training_1/cp.ckpt

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 9/10
 5120/91475 [>.....

KeyboardInterrupt: 

In [31]:
# NOTE(review): import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bigram counter: ngram_range=(2, 2) keeps 2-grams only.
# process_body fits this shared instance.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))

In [45]:
def process_body(load_data):
    """Fit the shared bigram ``vectorizer`` on all post bodies.

    Collects the 'body' of every post in every thread, fits the module-level
    ``vectorizer`` on them (replacing any earlier fit) and returns the
    resulting document-term matrix. Along the way it prints the number of
    posts without a usable body, the number of bodies collected, the
    vocabulary size and, as a sanity check, the shape of the vector produced
    for the sentence "I love you".

    Args:
        load_data: Iterable of thread dicts, each with a 'posts' list.

    Returns:
        The sparse count matrix returned by ``vectorizer.fit_transform``,
        one row per collected body.
    """
    missing = 0
    bodies = []
    for jline in load_data:
        for post in jline['posts']:
            try:
                bodies.append(post['body'])
            except (KeyError, TypeError):
                # Post without a 'body' key, or a post that is not a mapping.
                missing += 1
    print(missing)
    print(len(bodies))
    print("done processing")
    X = vectorizer.fit_transform(bodies)
    # vocabulary_ maps each n-gram to its column, so its size is the feature
    # count; get_feature_names() is no longer available in current scikit-learn.
    print(len(vectorizer.vocabulary_))
    a = vectorizer.transform(["I love you"])
    print(a.toarray()[0].shape)

    return X

In [46]:
# Fit the bigram vectorizer on all post bodies; the returned sparse matrix is
# displayed as the cell result.
process_body(json_data)

2364
113993
done processing
1423081
(1423081,)


<113993x1423081 sparse matrix of type '<class 'numpy.int64'>'
	with 4480868 stored elements in Compressed Sparse Row format>

In [2]:
# Sanity lookup of a single word vector. load_fasttext() must have run in this
# kernel first; the recorded output is a NameError because it had not.
fasttext_model['hi']

NameError: name 'fasttext_model' is not defined