#### CLPsych19 Shared Task
For questions, contact Michelle.Morales@ibm.com

##### Import libraries

In [None]:
from preprocess import *
import pandas as pd
import os

##### Load data & preprocess

In [None]:
# Optional: run these if the notebook is not started from the folder that holds the scripts and data
# os.getcwd()
# os.chdir('clpsych19_training_data/') # point directory to training data folder

In [None]:
# Data for Task A: read the label table, the Task A post listing and the
# shared post table (in that order) from the current working directory
label_data, subreddit_data, text_data = (
    pd.read_csv(csv_name)
    for csv_name in ('crowd_train.csv', 'task_A_train.posts.csv', 'shared_task_posts.csv')
)

In [None]:
# Merge dataframes: left-join the label table onto the Task A post listing by
# user_id (rows without a matching label are kept), then join the shared post
# table on (post_id, user_id) using pandas' default inner join
sub_label_data = subreddit_data.merge(label_data, on='user_id', how='left')
data = sub_label_data.merge(text_data, on=['post_id', 'user_id'])

In [None]:
# Summarise the merged table: its dimensions, its column names, the
# distribution of row counts per user, and the number of rows per raw label
print(data.shape)
print(data.columns)
rows_per_user = data['user_id'].value_counts()
print(rows_per_user.describe())
rows_per_label = data['raw_label'].value_counts()
print(rows_per_label)

##### Baseline system: preprocessing data

In [None]:
# Preprocess
data = data.fillna('')  # replace every missing value with an empty string
join_title_and_body(data)  # helper from preprocess; presumably fills data['text'] in place -- TODO confirm
# Clean the text column one helper at a time. Series.apply passes each string
# straight to the helper, which avoids building a Series per row as
# DataFrame.apply(axis=1) does, and it still returns a Series on an empty frame.
for clean_step in (to_lower_case, remove_punc, remove_):
    data['text'] = data['text'].apply(clean_step)

# Transform df from post to user level: concatenate each user's posts with
# single spaces, then attach that user's row from the label table
text_by_user = data.groupby(['user_id'])['text'].apply(' '.join).reset_index()
text_df = pd.merge(text_by_user, label_data, how='left', on='user_id')
text_df['tokens'] = text_df['text'].apply(tokenize)  # Tokenize text
text_df['tokens'] = text_df['tokens'].apply(lemmatize)  # Lemmatize tokens
text_df['text'] = text_df['tokens'].str.join(' ')  # rebuild the text from the lemmatized tokens

##### Split data, build model, and evaluate

In [None]:
# Set up stratified 5 fold cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn import metrics

# NOTE(review): the vectorizer and the IDF weights are fit on all users before
# the folds are split, so every test fold contributes to the vocabulary and IDF
# statistics. Left unchanged so the baseline numbers stay comparable.
# count_vect = CountVectorizer(stop_words='english', analyzer='word') # System 1
count_vect = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=.1, max_df=.8) # System 2
X_train_counts = count_vect.fit_transform(text_df['text'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # Already scaled between 0-1 no need to scale for SVM

X = X_train_tfidf
y = text_df['raw_label']
# shuffle=False gives deterministic, order-based folds. random_state is left
# out on purpose: it has no effect without shuffling, and scikit-learn 0.24+
# raises a ValueError when it is combined with shuffle=False.
skf = StratifiedKFold(n_splits=5, shuffle=False)

# One macro-averaged score per fold
precision_scores = []
recall_scores = []
f1_scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices, so select the labels by position
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    # Returns (precision, recall, f-score, support); only the first three are kept
    scores = metrics.precision_recall_fscore_support(y_test, predicted, average='macro')
    precision_scores.append(scores[0])
    recall_scores.append(scores[1])
    f1_scores.append(scores[2])
    print(metrics.classification_report(y_test, predicted))

In [None]:
# np.mean is used below, but numpy is not imported explicitly in the cells
# above (it could only have leaked in through the preprocess star import)
import numpy as np

# Average each per-fold macro score over the cross-validation folds
print("Precision = {}".format(np.mean(precision_scores)))
print("Recall = {}".format(np.mean(recall_scores)))
print("F1-score = {}".format(np.mean(f1_scores)))

In [None]:
# Number of features kept by the fitted vectorizer. vocabulary_ maps each
# retained term to its column index, so its length is the feature count.
# get_feature_names() is avoided because it was removed in scikit-learn 1.2.
len(count_vect.vocabulary_)