##### Load datasets into Pandas.

In [1]:
import pandas as pd

anorexiaSubreddits = pd.read_csv("data/subreddits_anorexia.csv", encoding='ISO-8859-1')
obesitySubreddits = pd.read_csv("data/subreddits_obesity.csv", encoding='ISO-8859-1')
bothSubreddits = pd.read_csv("data/subreddits_both.csv", encoding='ISO-8859-1')

##### Extract authors for each class (use hashes instead of usernames to protect privacy).

In [2]:
import hashlib

anorexia_authors = anorexiaSubreddits.drop_duplicates(subset="author")['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest()).to_frame()
obesity_authors = obesitySubreddits.drop_duplicates(subset="author")['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest()).to_frame()
both_authors = bothSubreddits.drop_duplicates(subset="author")['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest()).to_frame()


In [3]:
len(anorexia_authors)

1300

In [4]:
from tqdm import tqdm

csv_filename = '../../data_full_preprocessed.csv'
chunksize = 10000
count = 0
obesity_author_data_frames = []
anorexia_author_data_frames = []
neither_author_data_frames = []

anorexia_record_count = 0
obesity_record_count = 0
neither_record_count = 0

for chunk in tqdm(pd.read_csv(csv_filename, chunksize=chunksize)):
    chunk['author'] = chunk['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest())
    anorexia_df = anorexia_authors.join(chunk.set_index('author'), on='author', how='inner', lsuffix='_left', rsuffix='_right')
    if anorexia_record_count < 10000 and not anorexia_df.empty:
        anorexia_author_data_frames.append(anorexia_df)
        anorexia_record_count += len(anorexia_df)
        
    obesity_df = obesity_authors.join(chunk.set_index('author'), on='author', how='inner', lsuffix='_left', rsuffix='_right')
    if obesity_record_count < 10000 and not obesity_df.empty:
        obesity_author_data_frames.append(obesity_df)
        obesity_record_count += len(obesity_df)
        
    # Use an outer join to get comments from users who have not posted about anorexia/obesity.
    neither_df = chunk.join(both_authors, on='author', how='outer', lsuffix='_left', rsuffix='_right')
    neither_df = neither_df[neither_df['author_right'].isnull()]
    if neither_record_count < 10000 and not neither_df.empty:
        neither_author_data_frames.append(neither_df)
        neither_record_count += len(neither_df)
        
    count += 1
    if anorexia_record_count > 10000 and obesity_record_count > 10000 and neither_record_count > 10000:
        break
print('Total # chunks processed: {}.'.format(count))

pd.concat(anorexia_author_data_frames).to_csv('data/anorexia_author_data.csv', index=False)
pd.concat(obesity_author_data_frames).to_csv('data/obesity_author_data.csv', index=False)
pd.concat(neither_author_data_frames).to_csv('data/neither_author_data.csv', index=False)
    

3636it [05:42, 11.82it/s]


Total # chunks processed: 3636.


In [5]:
neither_author_data_frames

[                                author                       author_left  \
 0     49d264a69d92ec57c908cdb64cb30931  49d264a69d92ec57c908cdb64cb30931   
 1     897ed4b79ba31c80057c87183b1cdb6e  897ed4b79ba31c80057c87183b1cdb6e   
 2668  897ed4b79ba31c80057c87183b1cdb6e  897ed4b79ba31c80057c87183b1cdb6e   
 2     f0d692685eeb47e5a22c164823c62295  f0d692685eeb47e5a22c164823c62295   
 6971  f0d692685eeb47e5a22c164823c62295  f0d692685eeb47e5a22c164823c62295   
 3     5e76a511987b00def6195687a3277fce  5e76a511987b00def6195687a3277fce   
 4     0878fce284db4a1bbe9ffc633ec2c4fa  0878fce284db4a1bbe9ffc633ec2c4fa   
 5     a4b6d677c71075a8a02908f2da08960e  a4b6d677c71075a8a02908f2da08960e   
 6     e0676b7efcc36f86969db914b333f2be  e0676b7efcc36f86969db914b333f2be   
 2330  e0676b7efcc36f86969db914b333f2be  e0676b7efcc36f86969db914b333f2be   
 8845  e0676b7efcc36f86969db914b333f2be  e0676b7efcc36f86969db914b333f2be   
 7     e14b0b4146749b77015bc7424f9d8c20  e14b0b4146749b77015bc7424f9d8c20   

##### Sample the data.

In [3]:
import pandas as pd

anorexia_author_data = pd.read_csv('data/anorexia_author_data.csv', encoding='ISO-8859-1') 
obesity_author_data = pd.read_csv('data/obesity_author_data.csv', encoding='ISO-8859-1')
neither_author_data = pd.read_csv('data/neither_author_data.csv', encoding='ISO-8859-1')

anorexia_author_data.insert(len(anorexia_author_data.columns), 'category', 'anorexia')
obesity_author_data.insert(len(obesity_author_data.columns), 'category', 'obesity')
neither_author_data.insert(len(neither_author_data.columns), 'category', 'neither')

# Each dataset has ~10K rows so split into training and test sets of 5000 rows each.
anorexia_author_data_train = anorexia_author_data.head(5000)
anorexia_author_data_test = anorexia_author_data.head(5000)
obesity_author_data_train = obesity_author_data.head(5000)
obesity_author_data_test = obesity_author_data.tail(5000)
neither_author_data_train = neither_author_data.head(5000)
neither_author_data_test = neither_author_data.tail(5000)

train_data = pd.concat([anorexia_author_data_train, obesity_author_data_train, neither_author_data_train])
test_data = pd.concat([anorexia_author_data_test, obesity_author_data_test, neither_author_data_test])

##### Feature extraction/Model selection pipeline

Based heavily on:
*   http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
*   http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html

In [4]:
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [5]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(train_data['body'].values.tolist(), 
                    train_data['category'].values.tolist())
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters_a = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters_a[param_name]))
    
    y_true, y_pred = test_data['category'].values.tolist(), grid_search.predict(test_data['body'].values.tolist()) 
    
    print(classification_report(y_true, y_pred))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.0min finished


done in 188.983s

Best score: 0.423
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)
             precision    recall  f1-score   support

   anorexia       0.63      0.92      0.75      5000
    neither       0.62      0.51      0.56      5000
    obesity       0.52      0.38      0.44      5000

avg / total       0.59      0.60      0.58     15000

