# Imports

In [17]:
from collections import defaultdict
from datetime import datetime
import json
import os
import pickle
import random

import jsonlines
import pandas as pd 

import spacy
# Load the spaCy English pipeline once; get_data_dicts uses it to split texts and titles into sentences.
nlp = spacy.load('en_core_web_sm')

# Directory paths

In [18]:
# Root folders for input data and generated output.
# The defaults are the original external-volume locations; set the environment
# variables BIRTH_CONTROL_DATA_DIR / BIRTH_CONTROL_OUTPUT_DIR to run the notebook
# on another machine without editing this cell.
data_directory_path = os.environ.get('BIRTH_CONTROL_DATA_DIR',
                                     '/Volumes/Passport-1/data/birth-control')
output_directory_path = os.environ.get('BIRTH_CONTROL_OUTPUT_DIR',
                                       '/Volumes/Passport-1/output/birth-control')

# Load datasets

In [19]:
# Read the five per-source CSV exports from the final-data folder.
reddit_posts_df = pd.read_csv(f'{data_directory_path}/final-data/reddit_posts.csv')
reddit_comments_df = pd.read_csv(f'{data_directory_path}/final-data/reddit_comments.csv')
webmd_df = pd.read_csv(f'{data_directory_path}/final-data/webmd.csv')
twitter_posts_df = pd.read_csv(f'{data_directory_path}/final-data/twitter_posts.csv')
twitter_replies_df = pd.read_csv(f'{data_directory_path}/final-data/twitter_replies.csv')

# The order of this list is the order in which later cells sample and concatenate the sources.
dataframes = [
    reddit_posts_df,
    reddit_comments_df,
    twitter_posts_df,
    twitter_replies_df,
    webmd_df,
]

In [20]:
# Stack every source into one frame; the cell displays the total number of rows.
combined_df = pd.concat(dataframes, axis=0)
combined_df.shape[0]

1063672

In [21]:
# Number of rows contributed by each source to the combined frame.
combined_df['source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

# Sample data

- same number of texts per source
- same number of texts per method
- same number of sentences per text

In [8]:
# From every source, draw 400 rows per text_type with a fixed random_state,
# then stack the per-source samples into a single frame.
sampled_dataframes = [
    _df.groupby('text_type').sample(n=400, random_state=1)
    for _df in dataframes
]
sampled_df = pd.concat(sampled_dataframes)

In [9]:
# Check that every source contributes the same number of sampled rows.
sampled_df['source'].value_counts()

reddit-posts       1200
reddit-comments    1200
twitter-posts      1200
twitter-replies    1200
webmd-reviews      1200
Name: source, dtype: int64

In [10]:
# Check that every method (text_type) is equally represented in the sample.
sampled_df['text_type'].value_counts()

implant    2000
iud        2000
pill       2000
Name: text_type, dtype: int64

# Prepare for Prodigy

In [13]:
def get_data_dicts(df, rng=None):
    """Build one Prodigy task per row of ``df`` by picking a random sentence.

    For each row, the text (and the title, when the row has a non-null one)
    is split into sentences with the module-level spaCy pipeline ``nlp``.
    Sentences with fewer than three whitespace-separated tokens are dropped
    and one of the remaining sentences is chosen at random. Rows that leave
    no candidate sentence produce no task.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``source``, ``text_type`` and
        ``text``; a ``title`` column is optional.
    rng : random.Random, optional
        Source of randomness for the sentence choice. Defaults to the global
        ``random`` module; pass ``random.Random(seed)`` for a repeatable draw.

    Returns
    -------
    list of dict
        Records of the form
        ``{'text': sentence, 'meta': {'ID', 'Source', 'Method', 'Full Text'}}``
        in row order.
    """
    if rng is None:
        rng = random

    data_dicts = []
    for _, r in df.iterrows():

        # A missing text is treated as empty so it neither crashes the
        # pipeline nor breaks the string concatenation below.
        _text = '' if pd.isnull(r['text']) else r['text']
        _full_text = _text
        _sentences = [s.text.strip() for s in nlp(_text).sents]

        if 'title' in r and not pd.isnull(r['title']):
            # Title sentences are stripped as well, matching the body sentences.
            _sentences += [s.text.strip() for s in nlp(r['title']).sents]
            _full_text = '[TITLE: ' + r['title'].strip() + '] \n\n' + _text

        # Keep only sentences with at least three whitespace-separated tokens.
        _sentences = [s for s in _sentences if len(s.split()) >= 3]

        if _sentences:
            data_dicts.append({'text': rng.choice(_sentences),
                               'meta': {'ID': r['id'],
                                        'Source': r['source'],
                                        'Method': r['text_type'],
                                        'Full Text': _full_text}})
    return data_dicts

In [14]:
# Write one Prodigy JSONL file per source, with its sentence records shuffled.
for _source in sampled_df['source'].unique():
    print(_source)

    _source_rows = sampled_df[sampled_df['source'] == _source]
    _data_dicts = get_data_dicts(_source_rows)
    random.shuffle(_data_dicts)  # shuffles in place before writing

    _output_path = f'{data_directory_path}/labeling/label-sentences/sampled-sentences.prodigy.{_source}.jsonl'
    with jsonlines.open(_output_path, 'w') as writer:
        writer.write_all(_data_dicts)

reddit-posts
reddit-comments
twitter-posts
twitter-replies
webmd-reviews


# Repeat to create test data

In [23]:
# Print how many rows of each text_type every source has, in the order of `dataframes`.
for _df in dataframes:
    print(_df['text_type'].value_counts())

pill       36921
iud        24657
implant     7380
Name: text_type, dtype: int64
iud        117631
pill       117283
implant     29998
Name: text_type, dtype: int64
pill       226762
iud        217728
implant     55306
Name: text_type, dtype: int64
iud        147680
pill        39039
implant     25177
Name: text_type, dtype: int64
pill       14873
iud         2354
implant      883
Name: text_type, dtype: int64


In [24]:
# From every source, draw 800 rows per text_type, then stack the per-source samples.
# This rebinds `sampled_df`, replacing the 400-per-text_type sample built earlier.
# NOTE(review): random_state=1 is the same seed used for the 400-row draw, so this
# sample likely contains the rows already exported for labeling — TODO confirm that
# overlap is acceptable for test data.
sampled_dataframes = [
    _df.groupby('text_type').sample(n=800, random_state=1)
    for _df in dataframes
]
sampled_df = pd.concat(sampled_dataframes)

In [25]:
# Check that every source contributes the same number of rows to the test sample.
sampled_df['source'].value_counts()

reddit-posts       2400
reddit-comments    2400
twitter-posts      2400
twitter-replies    2400
webmd-reviews      2400
Name: source, dtype: int64

In [26]:
# Build the sentence records source by source and collect them in one frame.
test_dicts = []
for _source in sampled_df['source'].unique():
    print(_source)
    _source_mask = sampled_df['source'] == _source
    test_dicts.extend(get_data_dicts(sampled_df[_source_mask]))
test_df = pd.DataFrame(test_dicts)

reddit-posts
reddit-comments
twitter-posts
twitter-replies
webmd-reviews


In [27]:
# Number of test sentences. Rows without any sentence of at least three words
# yield no record, so this can be lower than the number of sampled rows.
len(test_df.index)

11993

In [29]:
# Save the test sentences next to the Prodigy files.
# NOTE(review): the 'meta' column holds dicts, which a CSV can only store as text —
# confirm downstream readers parse it as intended.
test_df.to_csv(data_directory_path + '/labeling/label-sentences/sampled-sentences.test.csv')