#### CLPsych19 Shared Task
For questions contact Michelle.Morales@ibm.com

#### Load data and libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from preprocess import *
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Optional step, depending on where your scripts and data live.
# os.getcwd()
# Point the working directory at the training-data folder. The basename check
# keeps this cell re-runnable: repeating the relative chdir from inside the
# folder would raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'clpsych19_training_data':
    os.chdir('clpsych19_training_data/')

In [None]:
# Data for Task A: each CSV is read, in order, into the DataFrame named on
# the left-hand side. Paths are relative to the current working directory.
label_data, subreddit_data, text_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'crowd_train.csv',
        'task_A_train.posts.csv',
        'shared_task_posts.csv',
    )
)

In [None]:
# Merge dataframes: left-join the labels onto the Task A rows by user_id,
# then join the post table on the (post_id, user_id) pair using pandas'
# default (inner) join.
sub_label_data = subreddit_data.merge(label_data, how='left', on='user_id')
data = sub_label_data.merge(text_data, on=['post_id', 'user_id'])

#### Preprocessing

In [None]:
# Preprocess
# The text helpers below come from the project's `preprocess` module. Each one
# only needs a single column, so it is mapped with Series.apply rather than a
# row-wise DataFrame.apply(axis=1): same values, far less per-row overhead,
# and well-defined on an empty frame.
data = data.fillna('')  # replace missing values with empty strings
join_title_and_body(data)  # presumably fills data['text'] in place -- TODO confirm
data['text'] = data['text'].apply(to_lower_case)
# data['text'] = data['text'].apply(remove_punc)  # optional step, currently disabled
data['text'] = data['text'].apply(remove_)

# Transform df from post to user level: concatenate every post of a user into
# one space-separated string, then re-attach that user's label columns.
text_by_user = data.groupby(['user_id'])['text'].apply(' '.join).reset_index()
text_df = pd.merge(text_by_user, label_data, how='left', on='user_id')
text_df['tokens'] = text_df['text'].apply(tokenize)  # Tokenize text
text_df['tokens'] = text_df['tokens'].apply(lemmatize)  # Lemmatize tokens
# Rebuild the text column from the processed tokens so both stay in sync.
text_df['text'] = text_df['tokens'].str.join(' ')

##### Load in word vectors

In [None]:
# Load a saved gensim Word2Vec model; the path is relative to the current
# working directory.
wv_model = Word2Vec.load('word2vec_reddit.model')

In [None]:
# Retrofit them using code from: https://github.com/mfaruqui/retrofitting
# The vectors are read from a word2vec-format file into a gensim KeyedVectors
# object; the path is relative to the current working directory.
retro_model = KeyedVectors.load_word2vec_format('retro_word2vec_reddit.txt')

In [None]:
# Sanity check: look up a single word in the retrofitted vectors.
try:
    retro_model['suicide']
except KeyError as err:
    # Print the caught error itself. Printing the bare `KeyError` class only
    # emitted "<class 'KeyError'>" and hid the lookup message.
    print(err)

In [None]:
def tokens2wordvec(tokens, model, default_dim=100):
    """Average the word vectors of ``tokens`` into a single vector.

    Args:
        tokens: Iterable of tokens to look up.
        model: Object supporting ``model[token]`` lookups that raise
            ``KeyError`` for unknown tokens (e.g. gensim ``KeyedVectors``).
            When it exposes a ``wv`` attribute (a full gensim ``Word2Vec``
            model), lookups go through ``model.wv``, because indexing the
            model object directly is no longer supported in gensim 4.
        default_dim: Length of the zero vector returned when no token is
            found and the vectors do not expose ``vector_size``.

    Returns:
        A ``numpy.ndarray`` holding the element-wise mean of the vectors of
        all tokens that were found, or an all-zero array when none were.
    """
    vectors = getattr(model, 'wv', model)
    found = []
    for token in tokens:
        try:
            found.append(vectors[token])
        except KeyError:
            continue  # out-of-vocabulary tokens do not contribute to the mean
    if not found:
        # Size the fallback from the model when possible so it matches the
        # real embeddings; always return an ndarray so both branches agree.
        return np.zeros(getattr(vectors, 'vector_size', default_dim))
    return np.mean(found, axis=0)

In [None]:
# One averaged Word2Vec vector per row, computed from that row's token list.
text_df['word2vec'] = text_df['tokens'].apply(
    lambda toks: tokens2wordvec(toks, wv_model)
)

In [None]:
# Same averaging as the word2vec column, using the retrofitted vectors.
text_df['retro_word2vec'] = text_df['tokens'].apply(
    lambda toks: tokens2wordvec(toks, retro_model)
)

In [None]:
# Preview the first rows. The value is only displayed as notebook cell
# output; the expression has no effect when run as a plain script.
text_df.head()

#### Baseline (SVM) system

In [None]:
# Bag-of-words features: raw counts, then TF-IDF weighting.
# System 1 (disabled) used unigrams only; System 2 adds bigrams and drops
# terms appearing in fewer than 10% or more than 80% of the documents.
# NOTE(review): the vocabulary and IDF weights are fit on every row of text_df
# before any train/test split, so held-out rows influence the features --
# confirm this is acceptable for the reported scores.
# count_vect = CountVectorizer(stop_words='english', analyzer='word') # System 1
count_vect = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=.1, max_df=.8) # System 2
X_train_counts = count_vect.fit_transform(text_df['text'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # Already scaled between 0-1 no need to scale for SVM

In [None]:
# Features are the TF-IDF matrix; targets are the 'raw_label' column of text_df.
X = X_train_tfidf
y =  text_df['raw_label']

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, then fit a linear-kernel
# SVM and predict the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=1000,
)
clf = svm.SVC(kernel='linear').fit(X_train, y_train)  # fit() returns the estimator
predicted = clf.predict(X_test)

In [None]:
# Report SVM performance on the held-out split: the per-class report, overall
# accuracy, and macro-averaged precision/recall/F-score.
print(metrics.classification_report(y_test, predicted))
print(metrics.accuracy_score(y_test, predicted))
print(metrics.precision_recall_fscore_support(y_test, predicted, average='macro'))

#### Neural network system (dense feed-forward; named `cnn_model` in the code)

In [None]:
# Inputs for the neural network: the same TF-IDF features, with the labels
# one-hot encoded (one column per distinct 'raw_label' value).
X = X_train_tfidf
one_hot = pd.get_dummies(text_df['raw_label'])
target_labels = one_hot.columns  # column order defines the class indices
# `.values` replaces DataFrame.as_matrix(), which was deprecated and then
# removed from pandas; both return the frame as a NumPy array.
target = one_hot.values
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.20, random_state=1000, stratify=target)

In [None]:
input_dim = X_train.shape[1]  # Number of features
num_classes = y_train.shape[1]  # One output unit per one-hot label column
# NOTE: despite the `cnn_` prefix this is a small fully connected network
# (a single hidden Dense layer); it has no convolutional layers.
cnn_model = Sequential()
cnn_model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
# Softmax turns the outputs into a probability distribution over the one-hot
# classes, which is the form categorical_crossentropy is defined for;
# independent sigmoids do not sum to one.
cnn_model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
# Configure training (Adam optimizer, categorical cross-entropy loss, accuracy
# tracked as a metric) and print the layer summary.
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.summary()

In [None]:
# Train for 30 epochs in batches of 10 with progress output disabled. The
# returned History object keeps the per-epoch metrics.
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation curves and the final test scores come from the same rows.
history = cnn_model.fit(X_train, y_train,
                    epochs=30,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

In [None]:
# Score the fitted model on the training split, then on the held-out split,
# printing the accuracy of each. `loss` and `accuracy` end up holding the
# held-out values.
for template, features, labels in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = cnn_model.evaluate(features, labels, verbose=False)
    print(template.format(accuracy))

In [None]:
# Evaluate F1-score
y_pred = cnn_model.predict_classes(X_test)
transformed_y_test = [l.tolist().index(1) for l in y_test]
metrics.f1_score(y_pred, transformed_y_test, average='macro')
metrics.precision_recall_fscore_support(y_pred, transformed_y_test, average='macro')

#### Plot loss and accuracy to help with optimization

In [None]:
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss, one point per epoch.

    Args:
        history: Object whose ``history`` attribute is a dict of per-epoch
            metric lists (such as the value returned by Keras ``fit``). It
            must contain ``loss`` and ``val_loss`` plus the accuracy pair
            under either ``acc``/``val_acc`` or ``accuracy``/``val_accuracy``.

    Raises:
        KeyError: If a required metric is missing from ``history.history``.
    """
    logs = history.history
    # Keras logs the accuracy metric as 'acc' or 'accuracy' depending on the
    # version; accept either so the plot works across versions.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    x = range(1, len(acc) + 1)  # epochs are numbered from 1

    plt.figure(figsize=(12, 5))
    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [None]:
# Draw the accuracy and loss curves for the training run stored in `history`.
plot_history(history)