# Imports

In [7]:
from collections import defaultdict
from datetime import datetime
import json
import os
import pickle
import random

# import jsonlines
import pandas as pd 

import spacy
# nlp = spacy.load('en_core_web_sm')

# Directory paths

In [8]:
# data_directory_path   = '/Volumes/Passport-1/data/birth-control'
# output_directory_path = '/Volumes/Passport-1/output/birth-control'

data_directory_path   = '/Users/maria/Documents/data/birth-control'
output_directory_path = '/Users/maria/Documents/output/birth-control'

# Load datasets

In [9]:
reddit_posts_df = pd.read_csv(data_directory_path + '/final-data/reddit_posts.csv')
reddit_comments_df = pd.read_csv(data_directory_path + '/final-data/reddit_comments.csv')
webmd_df = pd.read_csv(data_directory_path + '/final-data/webmd.csv')
twitter_posts_df = pd.read_csv(data_directory_path + '/final-data/twitter_posts.csv')
twitter_replies_df = pd.read_csv(data_directory_path + '/final-data/twitter_replies.csv')

dataframes = [reddit_posts_df, reddit_comments_df, twitter_posts_df, twitter_replies_df, webmd_df]

In [10]:
combined_df = pd.concat(dataframes)
len(combined_df)

1063672

In [11]:
combined_df['source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

In [12]:
combined_df.sample(3)

Unnamed: 0.2,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,...,date,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens,Unnamed: 0.1,Unnamed: 0.1.1,name
161105,209644,1040284630857990144,,girl I got that nexplanon thing im good for 3...,,2018,9.0,,,girl got nexplanon thing good NUMyrs lol,...,2018-09-13T17:02:38.000Z,1.040284e+18,0.0,1.0,0.0,0.0,7.0,,,
339264,424149,537776188643487744,,Contraceptive Pill Associated With Changes In ...,,2014,11.0,,,contraceptive pill associated changes brain st...,...,2014-11-27T01:13:29.000Z,5.377762e+17,0.0,0.0,0.0,0.0,10.0,,,
46566,228,d9bqwhs,1477678000.0,I read something like it can be an indicator f...,,2016,10.0,,,read something like indicator osteoporosis sta...,...,,,,,,,,,,


In [14]:
for i, r in combined_df[combined_df['source'] == 'reddit-posts'].sample(5).iterrows():
    print(' '.join(r['text'].split()))
    print()

Hi all, I've been on the 35mg Cilique combined pill for 3 years. Today I went to the doctors to renew my prescription. She told me there are new guidelines which have been recommended to avoid pregnancy as the 7 day break increases your risk of pregnancy. So I should only take a 4 day break. She also said 35mg is a lot of hormones so she wants to put me on the 20mg pill instead. She told me I have 2 options: 1. Take the pill back to back for 3 months then take a 4 day break. 2. Take the pill forever until I get breakthrough bleeding then take a 4 day break. I am planning to do option 1 as I prefer to be prepared for bleeding. However, I was wondering what's the reasoning behind this or if anyone has experience with this? As I was perfectly happy with my last prescription.

I've run out of my health insurance and am planning to go to Planned Parenthood so I can take birth control. I've tried a few different birth control pills and have been close to giving up on them all together. I tri

# Sample data

- same number of texts per source
- same number of texts per method
- same number of sentences per text

In [8]:
sampled_dataframes = []
for _df in dataframes:
    sampled_dataframes.append(_df.groupby('text_type').sample(n=400, random_state=1))
    
sampled_df = pd.concat(sampled_dataframes)

In [9]:
sampled_df['source'].value_counts()

reddit-posts       1200
reddit-comments    1200
twitter-posts      1200
twitter-replies    1200
webmd-reviews      1200
Name: source, dtype: int64

In [10]:
sampled_df['text_type'].value_counts()

implant    2000
iud        2000
pill       2000
Name: text_type, dtype: int64

# Prepare for Prodigy

In [11]:
def get_data_dicts(df):
    data_dicts = []
    for i, r in df.iterrows():

        _full_text = r['text']
        _sentences = [s.text.strip() for s in nlp(r['text']).sents]

        if 'title' in r and not pd.isnull(r['title']):
            _sentences += [s.text for s in nlp(r['title']).sents]
            _full_text = '[TITLE: ' + r['title'].strip() + '] \n\n' + r['text']

        _sentences = [s for s in _sentences if len(s.split()) >= 3]

        if len(_sentences) >= 1:
            for _sentence in random.sample(_sentences, 1):
                data_dicts.append({'text': _sentence,
                                   'meta': {'ID': r['id'],
                                            'Source': r['source'],
                                            'Method': r['text_type'],
                                            'Full Text': _full_text}})
    return data_dicts

In [13]:
for _source in sampled_df['source'].unique():
    print(_source)
    _data_dicts = get_data_dicts(sampled_df[sampled_df['source'] == _source])
    random.shuffle(_data_dicts)
    with jsonlines.open(data_directory_path + '/labeling/label-discourse/sampled-sentences.prodigy.' + _source + '.jsonl', 'w') as writer:
        writer.write_all(_data_dicts)

reddit-posts


# Repeat to create test data

In [None]:
for _df in dataframes:
    print(_df['text_type'].value_counts())

pill       36921
iud        24657
implant     7380
Name: text_type, dtype: int64
iud        117631
pill       117283
implant     29998
Name: text_type, dtype: int64
pill       226762
iud        217728
implant     55306
Name: text_type, dtype: int64
iud        147680
pill        39039
implant     25177
Name: text_type, dtype: int64
pill       14873
iud         2354
implant      883
Name: text_type, dtype: int64


In [None]:
sampled_dataframes = []
for _df in dataframes:
    sampled_dataframes.append(_df.groupby('text_type').sample(n=800, random_state=1))
    
sampled_df = pd.concat(sampled_dataframes)

In [None]:
sampled_df['source'].value_counts()

reddit-posts       2400
reddit-comments    2400
twitter-posts      2400
twitter-replies    2400
webmd-reviews      2400
Name: source, dtype: int64

In [None]:
test_dicts = []
for _source in sampled_df['source'].unique():
    print(_source)
    test_dicts += get_data_dicts(sampled_df[sampled_df['source'] == _source])
test_df = pd.DataFrame(test_dicts)

reddit-posts
reddit-comments
twitter-posts
twitter-replies
webmd-reviews


In [None]:
len(test_df.index)

11993

In [None]:
test_df.to_csv(data_directory_path + '/labeling/label-discourse/sampled-sentences.test.csv')