##### Load datasets into Pandas.

In [1]:
import pandas as pd

# Read the three per-topic subreddit exports, in the same order as before.
# The files are decoded as ISO-8859-1 rather than pandas' default encoding.
anorexiaSubreddits, obesitySubreddits, bothSubreddits = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        "data/subreddits_anorexia.csv",
        "data/subreddits_obesity.csv",
        "data/subreddits_both.csv",
    )
)

##### Extract authors for each class (use hashes instead of usernames to protect privacy).

In [17]:
import hashlib

def _hashed_unique_authors(posts):
    """Return a one-column frame of MD5 hex digests, one per distinct author.

    Rows of ``posts`` are first de-duplicated on the raw ``author`` value;
    each remaining username is then replaced by the hex MD5 digest of its
    encoded form, and the resulting Series is wrapped in a DataFrame.
    """
    distinct_authors = posts.drop_duplicates(subset="author")['author']
    digests = distinct_authors.apply(lambda name: hashlib.md5(name.encode()).hexdigest())
    return digests.to_frame()


anorexia_authors = _hashed_unique_authors(anorexiaSubreddits)
obesity_authors = _hashed_unique_authors(obesitySubreddits)
both_authors = _hashed_unique_authors(bothSubreddits)


In [37]:
# Preview only the first rows of the hashed-author frame rather than
# rendering the whole frame.
anorexia_authors.head(10)

Unnamed: 0,author
0,2e3cea450d14a67fac90de804c3984e0
1,9c02696b2e66a443afca156e917e03eb
2,87774818e97b0deba1982e3cf1d2d2e7
3,4391f26dc3b679280b2d091960f1f73c
4,ce035158b46aed8af92168dd6fa32ffa
5,8581029be6405d1b3ecef3632692e62c
6,e9ba1031abaf154477fc0cc7398f41ae
7,587403827f5e9e1f4bd6d5b0557bdf84
8,9cfc99d6833b4ed13124ebb620ad427a
9,f367ade7a1ac204ff49fb16f00738e5b


In [None]:
from tqdm import tqdm

csv_filename = '../../data_full_preprocessed.csv'
chunksize = 10000  # rows per chunk read from the CSV
count = 0  # number of chunks processed so far
obesity_author_data_frames = []
anorexia_author_data_frames = []
both_author_data_frames = []

# Pair every class's hashed-author frame with the list that collects its
# matches, so all three classes go through exactly the same code path.
# (Previously the "both" branch tested the obesity result for emptiness,
# which dropped "both" matches whenever the obesity join was empty.)
author_groups = (
    (anorexia_authors, anorexia_author_data_frames),
    (obesity_authors, obesity_author_data_frames),
    (both_authors, both_author_data_frames),
)

# Stream the CSV chunk by chunk instead of reading it in a single call.
for chunk in tqdm(pd.read_csv(csv_filename, chunksize=chunksize)):
    # Replace usernames with their MD5 hex digest so they can be joined
    # against the hashed 'author' column of the per-class frames.
    chunk['author'] = chunk['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest())
    # Index the chunk by hashed author once; every join below reuses it.
    chunk_by_author = chunk.set_index('author')

    for class_authors, matched_frames in author_groups:
        matched = class_authors.join(chunk_by_author, on='author', how='inner',
                                     lsuffix='_left', rsuffix='_right')
        # Only keep chunks that actually contain rows for this class.
        if not matched.empty:
            matched_frames.append(matched)
    count += 1
print('Total # chunks processed: {}.'.format(count))

# Write one CSV per class. Note: pd.concat raises ValueError when given an
# empty list, i.e. if a class matched no rows in any chunk.
pd.concat(anorexia_author_data_frames).to_csv('data/anorexia_author_data.csv', index=False)
pd.concat(obesity_author_data_frames).to_csv('data/obesity_author_data.csv', index=False)
pd.concat(both_author_data_frames).to_csv('data/both_author_data.csv', index=False)
    


0it [00:00, ?it/s][A
1it [00:00,  1.91it/s][A
2it [00:00,  2.34it/s][A
3it [00:00,  2.78it/s][A
4it [00:01,  3.12it/s][A
5it [00:01,  3.26it/s][A
3636it [07:29, 11.03it/s]


Total # chunks processed: 3636.


##### Load the extracted per-author data and preview a sample.

In [7]:
import pandas as pd

# Reload the per-class author data written to disk by the extraction step,
# reading the three files in the same order as before.
anorexia_author_data, obesity_author_data, both_author_data = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        'data/anorexia_author_data.csv',
        'data/obesity_author_data.csv',
        'data/both_author_data.csv',
    )
)

In [40]:
# Preview only the first rows instead of rendering the whole frame.
anorexia_author_data.head(10)

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,2e3cea450d14a67fac90de804c3984e0,Chapter 83 is not considered canon anymore. Mi...,Berserk,t5_2rru6,1
1,2e3cea450d14a67fac90de804c3984e0,"""Anorexia survivor"". How many people do actual...",TumblrInAction,t5_2vizz,2
2,9cfc99d6833b4ed13124ebb620ad427a,"oh no! I love her, I hope it's just rumors :(",MakeupAddiction,t5_2rww2,2
3,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,mildlyinfuriating,t5_2ubgg,2
4,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,PuertoRico,t5_2qjyb,1
5,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,travel,t5_2qh41,1
6,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,Opeth,t5_2qoi2,1
7,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,rangers,t5_2rnpb,1
8,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,coolguides,t5_310rm,1
9,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,recipes,t5_2qh56,1


In [41]:
# Preview only the first rows instead of rendering the whole frame.
obesity_author_data.head(10)

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,d9ccb6eaa68d1b3ea3dd432e48c6bfff,I'M pissed that the dancing girl fat girl got ...,RagenChastain,t5_323a3,4
1,24654918653efa65253028b1a8474c61,Well..when someone is obese its obvious when y...,TumblrInAction,t5_2vizz,1
2,f259124ebfbfa451037cfe9639ca73c6,"For the last 100,000 years of humanity obesity...",sex,t5_2qh3p,5
3,e4ed7d00769cb2ecc997d94c60d5dcd3,The EU courts now says that obesity is a disab...,videos,t5_2qh1e,1
4,87ad772f3a32b632f54c6739f29b6ac8,"Holy shit, which coach of ours invented that?!",CFB,t5_2qm9d,1
5,87ad772f3a32b632f54c6739f29b6ac8,"I mean, I've grown up on the mediocre Gophers ...",CFB,t5_2qm9d,1
6,4dd9c2e57c6358437b3a9bdd57b0d6e7,"Depth is very questionable, you maybe hit para...",Fitness,t5_2qhx4,3
7,09cb3c204828ede2196452cf1fe87c59,"Combined with my username? No doubt. ""Reddit s...",fatlogic,t5_2wyxm,3
8,09cb3c204828ede2196452cf1fe87c59,That house. I can't get past the mess they liv...,fatlogic,t5_2wyxm,9
9,ce67b36bd91730391e0a322660c8cc55,So do I... I don't want a 400 pound FA.,fatlogic,t5_2wyxm,4


In [42]:
# Preview only the first rows instead of rendering the whole frame.
both_author_data.head(10)

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,24654918653efa65253028b1a8474c61,Well..when someone is obese its obvious when y...,TumblrInAction,t5_2vizz,1
1,8b0d6fbd30e0beeab6189e26bdd67e45,"&gt;Clean eating means not overly processed, t...",Fitness,t5_2qhx4,7
2,0db37b1e34902f5f93c5499c0fe8b9a8,"""You're gaining weight!""\rBecause you were a G...",raisedbynarcissists,t5_2we9n,9
3,f259124ebfbfa451037cfe9639ca73c6,"For the last 100,000 years of humanity obesity...",sex,t5_2qh3p,5
4,e4ed7d00769cb2ecc997d94c60d5dcd3,The EU courts now says that obesity is a disab...,videos,t5_2qh1e,1
5,b700a5c2d5e6b6c184d9bcff0ea24246,"OOR: Hope you don't mind, but Cecil is gonna j...",XMenRP,t5_31uz5,1
6,c20b290a557c5c2494ee8181a174a82a,Just because it's not a strong reaction doesn'...,science,t5_mouw,6
7,29e738a4ec427c605f17ae21af885f73,"I read a great comment recently, the message o...",worldnews,t5_2qh13,1
8,29e738a4ec427c605f17ae21af885f73,Not always. They might ( through 3'rd parties ...,worldnews,t5_2qh13,2
9,29e738a4ec427c605f17ae21af885f73,"LOL. ""Yeah I burned my passport, joined an ins...",worldnews,t5_2qh13,9


##### Feature extraction/Model selection pipeline

Based heavily on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

In [8]:
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Emit log records at INFO level and above, each line prefixed with a
# timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [10]:
from sklearn import preprocessing
# Collect each distinct subreddit name seen across the three author datasets
# (first occurrence wins, so the order follows the concatenated frames).
combined_author_data = pd.concat([anorexia_author_data, obesity_author_data, both_author_data])
categories = list(combined_author_data.drop_duplicates(subset='subreddit')['subreddit'].values)

# Fit an encoder that maps subreddit names to integer labels.
le = preprocessing.LabelEncoder().fit(categories)

# Spot-check the integer label assigned to one subreddit name.
print(le.transform(['Economics']))
# To inspect every encoded class: list(le.classes_)

[20]


In [12]:
# Group the anorexia-author rows by subreddit and keep only the subreddits
# that have at least three rows (SQL analogue: HAVING COUNT(*) > 2).
a = anorexia_author_data.groupby('subreddit')
anorexia_author_data_filtered = a.filter(lambda posts: len(posts) >= 3)

In [22]:
# Text-classification pipeline: raw token counts -> TF-IDF weighting ->
# linear classifier trained with stochastic gradient descent.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Hyper-parameter grid, keyed as '<step name>__<parameter>'. Every listed
# value multiplies the number of candidates
# (3 * 4 * 2 * 2 * 2 * 2 * 2 * 3 = 1152), so trim the grid to cut runtime.
# NOTE(review): 'clf__n_iter' looks specific to older scikit-learn releases;
# newer ones appear to use 'max_iter' instead - confirm against the
# installed version before re-running.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (1e-5, 1e-6),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # The search runs with n_jobs=-1 (parallel workers); keeping it under a
    # __main__ guard follows the scikit-learn example this cell is based on.

    # Search feature-extraction and classifier parameters jointly.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    step_names = [step[0] for step in pipeline.steps]
    print("Performing grid search...")
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    t0 = time()
    # Inputs are the post bodies; targets are the subreddit each was posted in.
    documents = anorexia_author_data_filtered['body'].values.tolist()
    labels = anorexia_author_data_filtered['subreddit'].values.tolist()
    grid_search.fit(documents, labels)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search, alphabetically.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__n_iter': (10, 50, 80),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:    3.8s


done in 18.267s

Best score: 0.529
Best parameters set:
	clf__alpha: 1e-06
	clf__n_iter: 80
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__max_features: None
	vect__ngram_range: (1, 1)


[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed:   18.1s finished
