In [4]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import re


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,  Conv1D, GlobalMaxPooling1D, Activation
from keras.layers.embeddings import Embedding


In [5]:
#Validation set manually annotated 
Validationdata = pd.read_csv('data/ds.tsv',delimiter="\t")

#Set of posts samples (positive/control) filtered by their topical similarity with a depression taxonomy (Ht)
HT_data = pd.read_csv('data/ts_ht.tsv',delimiter="\t")

In [15]:

X_train = HT_data['pp_text']
X_test = Validationdata['pp_text']
y_train = HT_data['label']
y_test = Validationdata['label']
y_test

0      1
1      1
2      1
3      1
4      1
      ..
143    0
144    0
145    0
146    0
147    0
Name: label, Length: 148, dtype: int64

In [17]:
# defining function to clean text and retrive closs-validation datasets
def cleantxt(txt):
    """
    Cleans the string passed. Cleaning Includes-
    1. remove special characters/symbols
    2. convert text to lower-case
    3. retain only alphabets
    4. remove words less than 3 characters
    5. remove stop-words
    """  
    # collecting english stop words from nltk-library
    stpw = stopwords.words('english')
    
    # Adding custom stop-words
    stpw.extend(['www','http','utc'])
    stpw = set(stpw)
    
    # using regex to clean the text
    txt = re.sub("[\<\[].*?[\>\]]", " ", txt)
    txt = txt.lower()
    txt = re.sub(r"[^a-z ]", " ", txt)
    txt = " ".join([x for x in txt.split() if x not in stpw])
    return txt



for text in X_train:
    cleantxt(text)
    
for text in X_test:
    cleantxt(text)

In [18]:
   
tokenizer = Tokenizer(num_words=6948)

tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)

tokenizer.fit_on_texts(X_test)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)


maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)



X_val = X_t[:3000]
X_trainn = X_t[3000:]

y_val = y_train[:3000]
y_trainn = y_train[3000:]


In [9]:
max_features = 100000
batch_size = 128
epochs = 5
num_classes=5
embed_size = 128
num_unique_word = 6948

In [10]:

modelCNN = Sequential()
modelCNN.add(Embedding(num_unique_word, embed_size))
modelCNN.add(Conv1D(64,kernel_size=2,padding='same',activation='relu',strides=1))
modelCNN.add(GlobalMaxPooling1D())
modelCNN.add(Dense(1, activation="relu"))


modelCNN.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

modelCNN.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         889344    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 64)          16448     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 905,857
Trainable params: 905,857
Non-trainable params: 0
_________________________________________________________________


In [11]:
hist = modelCNN.fit(X_trainn,y_trainn, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
results = modelCNN.evaluate(X_test,Validationdata['label'])



In [25]:

predictedsvm = modelCNN.predict_classes(X_test)
predictedsvm = predictedsvm[:, 0]
accuracy_score_sig = metrics.accuracy_score(predictedsvm, y_test)
accuracy_score_auc = metrics.roc_auc_score(predictedsvm, y_test)
accuracy_score_priv = metrics.precision_score(predictedsvm, y_test)
accuracy_score_recall = metrics.recall_score(predictedsvm, y_test)

print(accuracy_score_priv)
print(accuracy_score_recall)
print(accuracy_score_auc)
accuracy_score_sig

0.8727272727272727
0.6075949367088608
0.753072830673271


0.7432432432432432