##### Load the per-class subreddit datasets into pandas DataFrames.

In [1]:
import pandas as pd

# Read the three subreddit exports (anorexia, obesity, both) in one pass.
# All three files are decoded as ISO-8859-1.
anorexiaSubreddits, obesitySubreddits, bothSubreddits = [
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        "data/subreddits_anorexia.csv",
        "data/subreddits_obesity.csv",
        "data/subreddits_both.csv",
    )
]

##### Extract the unique authors for each class (store MD5 hashes instead of usernames to protect privacy).

In [17]:
import hashlib

def _hashed_unique_authors(posts):
    """Return a one-column frame of MD5-hashed author names, one row per distinct author.

    Rows of ``posts`` are de-duplicated on the ``author`` column, then each
    remaining username is replaced by the hex MD5 digest of its encoded bytes.
    The result is a DataFrame with a single ``author`` column.
    """
    distinct_names = posts.drop_duplicates(subset="author")['author']
    digests = distinct_names.apply(lambda name: hashlib.md5(name.encode()).hexdigest())
    return digests.to_frame()


anorexia_authors = _hashed_unique_authors(anorexiaSubreddits)
obesity_authors = _hashed_unique_authors(obesitySubreddits)
both_authors = _hashed_unique_authors(bothSubreddits)


In [37]:
# Preview only the first ten hashed authors instead of rendering the whole frame.
anorexia_authors.head(10)

Unnamed: 0,author
0,2e3cea450d14a67fac90de804c3984e0
1,9c02696b2e66a443afca156e917e03eb
2,87774818e97b0deba1982e3cf1d2d2e7
3,4391f26dc3b679280b2d091960f1f73c
4,ce035158b46aed8af92168dd6fa32ffa
5,8581029be6405d1b3ecef3632692e62c
6,e9ba1031abaf154477fc0cc7398f41ae
7,587403827f5e9e1f4bd6d5b0557bdf84
8,9cfc99d6833b4ed13124ebb620ad427a
9,f367ade7a1ac204ff49fb16f00738e5b


In [None]:
from tqdm import tqdm

# Stream the full preprocessed comment dump in chunks and, for each class,
# keep only the rows written by an author in that class's (hashed) author frame.
csv_filename = '../../data_full_preprocessed.csv'
chunksize = 10000
count = 0

# Per-class accumulators: one DataFrame per chunk that produced at least one match.
obesity_author_data_frames = []
anorexia_author_data_frames = []
both_author_data_frames = []

for chunk in tqdm(pd.read_csv(csv_filename, chunksize=chunksize)):
    # Hash usernames with the same MD5 scheme used for the class author frames,
    # so the join below compares hash to hash.
    chunk['author'] = chunk['author'].apply(lambda a : hashlib.md5(a.encode()).hexdigest())

    # Index the chunk by hashed author once; all three joins reuse it.
    chunk_by_author = chunk.set_index('author')

    # Inner join: only rows whose author appears in the class's author frame survive.
    anorexia_df = anorexia_authors.join(chunk_by_author, on='author', how='inner', lsuffix='_left', rsuffix='_right')
    if not anorexia_df.empty:
        anorexia_author_data_frames.append(anorexia_df)

    obesity_df = obesity_authors.join(chunk_by_author, on='author', how='inner', lsuffix='_left', rsuffix='_right')
    if not obesity_df.empty:
        obesity_author_data_frames.append(obesity_df)

    both_df = both_authors.join(chunk_by_author, on='author', how='inner', lsuffix='_left', rsuffix='_right')
    # Check both_df itself: testing obesity_df here would drop "both" matches
    # whenever the chunk happened to contain no obesity-author rows.
    if not both_df.empty:
        both_author_data_frames.append(both_df)

    count += 1
print('Total # chunks processed: {}.'.format(count))

# Write one CSV per class; the joined frames' index is not exported.
pd.concat(anorexia_author_data_frames).to_csv('data/anorexia_author_data.csv', index=False)
pd.concat(obesity_author_data_frames).to_csv('data/obesity_author_data.csv', index=False)
pd.concat(both_author_data_frames).to_csv('data/both_author_data.csv', index=False)
    


0it [00:00, ?it/s][A
1it [00:00,  1.91it/s][A
2it [00:00,  2.34it/s][A
3it [00:00,  2.78it/s][A
4it [00:01,  3.12it/s][A
5it [00:01,  3.26it/s][A
3636it [07:29, 11.03it/s]


Total # chunks processed: 3636.


In [2]:
import pandas as pd

# Reload the per-class author data exported by this notebook.
# The files were written with DataFrame.to_csv and no explicit encoding, which
# produces UTF-8, so they must be decoded as UTF-8; reading them as ISO-8859-1
# would garble any non-ASCII characters in the comment bodies.
anorexia_author_data = pd.read_csv('data/anorexia_author_data.csv', encoding='utf-8')
obesity_author_data = pd.read_csv('data/obesity_author_data.csv', encoding='utf-8')
both_author_data = pd.read_csv('data/both_author_data.csv', encoding='utf-8')

In [9]:
# Preview the first rows of the comments matched to anorexia-class authors.
anorexia_author_data.head()

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,2e3cea450d14a67fac90de804c3984e0,Chapter 83 is not considered canon anymore. Mi...,Berserk,t5_2rru6,1
1,2e3cea450d14a67fac90de804c3984e0,"""Anorexia survivor"". How many people do actual...",TumblrInAction,t5_2vizz,2
2,9cfc99d6833b4ed13124ebb620ad427a,"oh no! I love her, I hope it's just rumors :(",MakeupAddiction,t5_2rww2,2
3,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,mildlyinfuriating,t5_2ubgg,2
4,8f7b54aad11cd635bc5ebb2c4e6cbcac,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,PuertoRico,t5_2qjyb,1


In [10]:
# Preview the first rows of the comments matched to obesity-class authors.
obesity_author_data.head()

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,d9ccb6eaa68d1b3ea3dd432e48c6bfff,I'M pissed that the dancing girl fat girl got ...,RagenChastain,t5_323a3,4
1,24654918653efa65253028b1a8474c61,Well..when someone is obese its obvious when y...,TumblrInAction,t5_2vizz,1
2,f259124ebfbfa451037cfe9639ca73c6,"For the last 100,000 years of humanity obesity...",sex,t5_2qh3p,5
3,e4ed7d00769cb2ecc997d94c60d5dcd3,The EU courts now says that obesity is a disab...,videos,t5_2qh1e,1
4,87ad772f3a32b632f54c6739f29b6ac8,"Holy shit, which coach of ours invented that?!",CFB,t5_2qm9d,1


In [11]:
# Preview the first rows of the comments matched to authors in the "both" class.
both_author_data.head()

Unnamed: 0,author,body,subreddit,subreddit_id,score
0,24654918653efa65253028b1a8474c61,Well..when someone is obese its obvious when y...,TumblrInAction,t5_2vizz,1
1,8b0d6fbd30e0beeab6189e26bdd67e45,"&gt;Clean eating means not overly processed, t...",Fitness,t5_2qhx4,7
2,0db37b1e34902f5f93c5499c0fe8b9a8,"""You're gaining weight!""\rBecause you were a G...",raisedbynarcissists,t5_2we9n,9
3,f259124ebfbfa451037cfe9639ca73c6,"For the last 100,000 years of humanity obesity...",sex,t5_2qh3p,5
4,e4ed7d00769cb2ecc997d94c60d5dcd3,The EU courts now says that obesity is a disab...,videos,t5_2qh1e,1
