In [1]:
import pandas as pd
import string
import re
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.models import Sequential, load_model
from keras.preprocessing import sequence
import numpy as np
import random
import io
import json
import pickle


Using TensorFlow backend.


# Functions used

In [2]:
# filters dataframe to only contain tweets from specified usernames
def filter_by_username(depressed, username):
    columns = ['likes', 'replies', 'retweets', 'text', 'timestamp', 'user']
    filtered = pd.DataFrame(columns=columns)
    for index, row in depressed.iterrows():
        temp = {}
        if row['user'] in usernames:
            temp['likes'] = row['likes']
            temp['replies'] = row['replies']
            temp['retweets'] = row['retweets']
            temp['text'] = row['text']
            temp['timestamp'] = row ['timestamp']
            temp['user'] = row ['user']
        
            filtered = filtered.append(temp, ignore_index=True)
    filtered['likes'] = filtered['likes'].astype(int)
    filtered['retweets'] = filtered['retweets'].astype(int)
    filtered['replies'] = filtered['replies'].astype(int)
    
    return filtered

# generates batches of training data and respective labels
def generator(depressed, nondepressed, batch_size=128):
    maxlen=15
    depressed_ct = 0
    nondepressed_ct = 0
    while 1:
        # depressed is 1, nondepressed 0
        class_ = random.randint(0,1)
        if class_ == 0:
            rows = nondepressed[nondepressed_ct:nondepressed_ct + batch_size]
            nondepressed_ct += batch_size
            if(nondepressed_ct >= len(nondepressed)):
                nondepressed_ct = nondepressed_ct - len(nondepressed)
                rows = rows + nondepressed[:nondepressed_ct]
            labels = np.zeros((batch_size,1))
        else:
            rows = depressed[depressed_ct:depressed_ct+batch_size]
            depressed_ct += batch_size
            if(depressed_ct >= len(depressed)):
                depressed_ct = depressed_ct - len(depressed)
                rows = rows + depressed[:depressed_ct]
            labels = np.ones((batch_size,1))
            
        train = sequence.pad_sequences(rows,maxlen=maxlen)
        #print("\ntrain: %i label: %i class: %i" % (len(train), len(labels), class_))
        yield train, labels
        
# extracts indivifual tweets from dataframe, 
# parses them to contain only text,
# and returns them in a list
def create_tweets_lists(dataframe):
    sentences = []
    for index, row in dataframe[['text']].iterrows():
        text = str(row['text'])
        text = re.sub(r'http:\S+', '', text)
        text = re.sub(r'https\S+', '', text)
        text = re.sub(r'www.\S+', '', text)
        text = re.sub(r'pic.twitter\S+', '', text)
        text = re.sub(r'[^\w\s]', '', text).strip().lower()
        if len(text) > 0:
            sentences.append(text)
    return sentences

# takes a dataframe and portions it into 
# training, validation, and testing portions
def create_train_val_and_test(df):
    users = df['user'].unique()
    train = len(users) // 2
    test = len(users) * 3 // 4
    np.random.shuffle(users)
    train_users = users[:train]
    val_users = users[train:test]
    test_users = users[test:]
    train = pd.DataFrame()
    val = pd.DataFrame()
    test = pd.DataFrame()
    for user, df_user in df.groupby('user'):
        if user in train_users:
            train = pd.concat([train, df_user], ignore_index=True)
        elif user in val_users:
            val = pd.concat([val, df_user], ignore_index=True)
        else:
            test = pd.concat([test, df_user], ignore_index=True)
    
    return train, val, test

# takes a dataframe containing test data.
# iterates through dataframe by user and 
# predicts value for every tweet. whichever
# value has most frequency is depression 
# prediction for the user. total depressed
# and non depressed counts are returned.
# (depressed is 1, nondepressed is 0)
def predict(model, df):
    dep = 0
    nondep = 0
    for user, df_user in df.groupby('user'):
        tweets = create_tweets_lists(df_user)
        sequences = tokenizer.texts_to_sequences(tweets)
        test = sequence.pad_sequences(sequences, maxlen=15)
        preds = model.predict_classes(test)
        preds = preds.T[0]
        prediction = np.bincount(preds).argmax()
        if prediction == 1:
            dep += 1
        else:
            nondep += 1
            
    return dep, nondep

# Getting and formatting data

In [3]:
depressed = pd.read_csv("depressedTweets.csv")
non_depressed = pd.read_csv('non_depressed_tweets.csv')

In [4]:
dep_train, dep_val, dep_test = create_train_val_and_test(depressed)
non_train, non_val, non_test = create_train_val_and_test(non_depressed)

In [8]:
dep_train = create_tweets_lists(dep_train)
dep_val = create_tweets_lists(dep_val)
non_train = create_tweets_lists(non_train)
non_val = create_tweets_lists(non_val)

In [13]:
all_tweets = dep_train + non_train + dep_val + non_val
b1 = len(dep_train)
b2 = b1 + len(non_train)
b3 = b2 + len(dep_val)
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(all_tweets)
sequences = tokenizer.texts_to_sequences(all_tweets)
d_train = sequences[:b1]
n_train = sequences[b1:b2]
d_val = sequences[b2:b3]
n_val = sequences[b3:]
np.random.shuffle(d_train)
np.random.shuffle(n_train)
np.random.shuffle(d_val)
np.random.shuffle(n_val)

# Training

In [14]:
train_gen = generator(d_train, n_train)
val_gen = generator(d_val, n_val)

In [15]:
embeds = len(tokenizer.word_index)
model = Sequential()
model.add(layers.Embedding(embeds, 64))
model.add(layers.LSTM(32, return_sequences=True))
model.add(layers.LSTM(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['acc'])

In [16]:
model.fit_generator(
    train_gen,
    steps_per_epoch=200,
    epochs=15,
    validation_data=val_gen,
    validation_steps=50)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0xb39d7bdd8>

# Predicting

In [9]:
model = load_model('95_percent.h5')
dep_test = pd.read_csv('depressedTest.csv')
non_test = pd.read_csv('nondepressedTest.csv')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [10]:
dep_size = len(dep_test.groupby('user'))
non_size = len(non_test.groupby('user'))

dep_count, nondep_count = predict(model, dep_test)
dep_acc = dep_count / dep_size
dep_count, nondep_count = predict(model, non_test)
non_acc = dep_count / non_size

total_size = dep_size + non_size
dep_weight = dep_size / total_size
non_weight = non_size / total_size

weight_acc = (dep_acc * 100 * dep_weight) + (non_acc * 100 * non_weight)
print(dep_acc)
print(non_acc)
print(weight_acc)

1.0
0.95
97.5


In [18]:
model.save('95_percent.h5')

In [26]:
dep_test.to_csv('depressedTest.csv')
non_test.to_csv('nondepressedTest.csv')

In [23]:
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [25]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)