In [1]:
from collections import defaultdict
import json
import random
import re

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import torch

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'
output_directory_path = '/Volumes/Passport-1/output/birth-control'

<br><br>

# Load datasets (Reddit, WebMD, Twitter)

## Reddit

In [86]:
reddit_posts_df = pd.read_csv(data_directory_path + '/final-data/reddit_posts.csv')
reddit_comments_df = pd.read_csv(data_directory_path + '/final-data/reddit_comments.csv')

In [87]:
len(reddit_posts_df.index), len(reddit_comments_df.index)

(72731, 238568)

In [88]:
reddit_comments_df.sample(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,parent_id,created_utc,text,year,month,source,tokens_text,text_type
110121,110121,3445,dyxfbpz,t1_dyxeuhs,1526247000.0,"Yeah, honestly I would call the doctor today.\...",2018,5,reddit-comments,yeah honestly would call doctor today feeling ...,iud
44017,44017,2248,db6hxqj,t3_5i4dy8,1481711000.0,"I have never had a child and I got the Mirena,...",2016,12,reddit-comments,never child got mirena m also rather small acc...,iud
236674,236674,7823,g4u0p79,t3_iqsyga,1599839000.0,They’ll test you for pregnancy. You should be ...,2020,9,reddit-comments,test pregnancy fine tbh,iud


In [89]:
reddit_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,source,tokens_text,text_type
27141,667,8bae4p,1523387378,"I had my pill free week, and I missed the firs...",Pregnancy risk?,2018,4,https://www.reddit.com/r/birthcontrol/comments...,Mistake or Risk?,reddit-posts,pregnancy risk pill free week missed first pil...,pill
52251,465,eu02rj,1580002426,So I have a doctor appointment on the 29th (4 ...,"Which birth control is better? IUD, pills or a...",2020,1,https://www.reddit.com/r/birthcontrol/comments...,Which Method?,reddit-posts,birth control better iud pills contraceptive i...,implant
12581,329,4mnlkt,1465136766,Hi there ladies!\n\nI'm 19 years old and have ...,"IUD's, depression, and weight gain?",2016,6,https://www.reddit.com/r/birthcontrol/comments...,Side Effects!?,reddit-posts,iud depression weight gain hi ladies m NUM yea...,iud


## WebMD

In [90]:
webmd_df = pd.read_csv(data_directory_path + '/final-data/webmd.csv')

In [91]:
len(webmd_df.index)

18110

In [92]:
webmd_df.sample(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
1246,1246,1293,1293,w1322,2012-05-30,2012,"I started Yaz, and had a few side effects, nau...",beyaz,,webmd-reviews,pill,283,started yaz side effects nausea dizzy spells u...
13917,13917,15773,15773,w16167,2017-08-01,2017,"I got my Skyla inserted in early May, so I am ...",skyla-device,,webmd-reviews,iud,94,got skyla inserted early may NUM month mark de...
2678,2678,3642,3642,w3731,2008-11-08,2008,I went on Femcon FE after being off another bc...,femcon-fe-tablet-chewable,,webmd-reviews,pill,107,went femcon fe another bc pill almost year exp...


## Twitter

In [93]:
twitter_posts_df = pd.read_csv(data_directory_path + '/final-data/twitter_posts.csv')
twitter_replies_df = pd.read_csv(data_directory_path + '/final-data/twitter_replies.csv')

In [94]:
len(twitter_posts_df.index), len(twitter_replies_df.index)

(513017, 244140)

In [95]:
twitter_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,text_type,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens
210317,239297,twitter-posts,My new IUD | Scarleteen http://fb.me/NiJnf4rB,new iud scarleteen http //fb me/nijnfNUMrb,iud,2011-07-18T15:03:55.000Z,2011,7,92972876164116480,92972876164116480,1,0,0,0,6
68370,73144,twitter-posts,I'm not buying into the IUD hype,buying iud hype,iud,2017-09-11T15:38:04.000Z,2017,9,907267004360544257,907267004360544257,0,0,0,0,3
306256,350526,twitter-posts,sex drive and birth control pill https://t.co/...,sex drive birth control pill https //t co/iNUM...,pill,2017-08-22T14:03:34.000Z,2017,8,899995465156100096,899995465156100096,0,0,0,0,8


In [96]:
twitter_replies_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,text_type,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens
159036,170224,twitter-replies,"Fine, IUDs work great. Plenty of my friends u...",fine iuds work great plenty friends use never ...,iud,2020-08-04T21:34:11.000Z,2020,8,1290762983329726466,1290747229091688449,0,3,0,0,26
221990,244841,twitter-replies,"I can imagine dear, visit the hospital and g...",imagine dear visit hospital get recommendation...,pill,2018-09-01T10:40:34.000Z,2018,9,1035839829211336704,1035555295634890752,1,0,1,0,27
107799,112917,twitter-replies,not necessarily completely natural but the iu...,necessarily completely natural iud non hormonal,iud,2016-05-13T17:32:40.000Z,2016,5,731175342228938752,730945245857251330,0,0,1,0,6


<br><br><br><br>

# Combine into one dataframe

In [97]:
combined_df = pd.concat([reddit_posts_df, reddit_comments_df, twitter_posts_df, twitter_replies_df, webmd_df])
len(combined_df)

1086566

In [98]:
combined_df['source'].value_counts()

twitter-posts      513017
twitter-replies    244140
reddit-comments    238568
reddit-posts        72731
webmd-reviews       18110
Name: source, dtype: int64

<br><br><br><br>

# **Load the labeled data**

In [52]:
labeled_df = pd.read_csv(data_directory_path + '/labeling/final_labeled_df.csv')
len(labeled_df.index)

1227

In [57]:
labeled_df.sample(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Label,Text,Source,Decision,Type,Tokens
787,140,140,964277082179166200,DISCOURSE,Nexplanon is such a cock block man,twitter-posts,two or more,Implant,nexplanon cock block man
726,71,71,398908986256146400,PROVIDING INFORMATIONAL SUPPORT,Oral contraceptive pills and hormone replaceme...,twitter-posts,two or more,Pill,oral contraceptive pills hormone replacement t...
1211,321,321,1328420887687696400,PROVIDING EXPERIENCES,"@sheeeiidaa I’ve had nexplanon for 4 years, th...",twitter-replies,two or more,Implant,


In [56]:
for _source in labeled_df['Source'].unique():
    _df = labeled_df[labeled_df['Source'] == _source]
    print(_source)
    print(_df['Label'].value_counts())
    print()

reddit-posts
SEEKING INFORMATIONAL SUPPORT      159
SEEKING EXPERIENCES                 94
SEEKING EMOTIONAL SUPPORT           59
PROVIDING EXPERIENCES               29
PROVIDING INFORMATIONAL SUPPORT      4
DISCOURSE                            2
PROVIDING EMOTIONAL SUPPORT          2
Name: Label, dtype: int64

reddit-comments
PROVIDING EXPERIENCES              133
PROVIDING INFORMATIONAL SUPPORT    127
PROVIDING EMOTIONAL SUPPORT         32
DISCOURSE                            8
SEEKING INFORMATIONAL SUPPORT        6
SEEKING EMOTIONAL SUPPORT            4
SEEKING EXPERIENCES                  3
Name: Label, dtype: int64

twitter-posts
PROVIDING INFORMATIONAL SUPPORT    96
DISCOURSE                          94
PROVIDING EXPERIENCES              49
SEEKING EXPERIENCES                10
SEEKING INFORMATIONAL SUPPORT      10
SEEKING EMOTIONAL SUPPORT           9
PROVIDING EMOTIONAL SUPPORT         1
Name: Label, dtype: int64

twitter-replies
PROVIDING EXPERIENCES              134
DISCOURSE

# Filter and balance for a single label

In [79]:
def get_train_and_test(df, target_label):

    df_positive = df[df['Label'] == target_label]
    df_positive['Target Label'] = df_positive['Label']

    df_negative = df[df['Label'] != target_label]
    df_negative = df_negative[~df_negative['ID'].isin(df_positive['ID'].tolist())]
    df_negative['Target Label'] = ['NOT ' + target_label]*len(df_negative.index)

    if len(df_negative.index) > len(df_positive.index):
        df_negative = df_negative.sample(len(df_positive.index))
    df_combined = pd.concat([df_negative, df_positive])
    df_combined = df_combined.sample(frac=1)

    all_texts = df_combined['Text'].tolist()
    all_labels = df_combined['Target Label'].tolist()

    num_training = int(len(all_texts)*0.8)

    train_texts = all_texts[:num_training]
    train_labels = all_labels[:num_training]

    test_texts = all_texts[num_training:]
    test_labels = all_labels[num_training:]

    return train_texts, train_labels, test_texts, test_labels

In [105]:
def process_string(text, 
                   lowercase=True, 
                   remove_punctuation=True, 
                   numbers='replace'):
    if lowercase:
        text = text.lower()
    if numbers == 'replace':
        text = re.sub('[0-9]+', 'NUM', text)
    elif numbers == 'remove':
        text = re.sub('[0-9]+', ' ', text)
    if remove_punctuation:
        text = re.sub(r'[^\sA-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ/]', ' ', text)
    text = ' '.join(text.split())
    return text

In [107]:
social_goal_id_label_dict = defaultdict(lambda: defaultdict(str))

for _source in labeled_df['Source'].unique():

    _df = labeled_df[labeled_df['Source'] == _source]

    for _label, _count in _df['Label'].value_counts().iteritems():

        if _count >= 50:

            train_texts, train_labels, test_texts, test_labels = get_train_and_test(_df, _label)
            train_texts = [process_string(t) for t in train_texts]
            test_texts = [process_string(t) for t in test_texts] 
            print(_source, _label, len(train_texts), len(test_texts))

            vectorizer = TfidfVectorizer()
            X_train = vectorizer.fit_transform(train_texts)
            X_test = vectorizer.transform(test_texts)

            model = LogisticRegression(C=0.8).fit(X_train, train_labels)
            predictions = model.predict(X_test)

            print(classification_report(test_labels, predictions))

            vectorizer = TfidfVectorizer()
            X_train = vectorizer.fit_transform(train_texts + test_texts)
            y_train = train_labels + test_labels
            
            ids_to_label = []
            texts_to_label = []
            for i, r in combined_df[combined_df['source'] == _source].iterrows():
                ids_to_label.append(str(r['id']))
                texts_to_label.append(str(r['text']))
                if not pd.isnull(r['title']):
                    texts_to_label[-1] += ' ' + str(r['title'])
            texts_to_label = [process_string(t) for t in texts_to_label]
            X_pred = vectorizer.transform(texts_to_label)

            model = LogisticRegression(C=0.8).fit(X_train, y_train)
            predictions = model.predict(X_pred)

            for _pred, _id in zip(predictions, ids_to_label):
                social_goal_id_label_dict[_label][_id] = _pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


reddit-posts SEEKING INFORMATIONAL SUPPORT 220 56
                                   precision    recall  f1-score   support

NOT SEEKING INFORMATIONAL SUPPORT       0.86      0.24      0.38        25
    SEEKING INFORMATIONAL SUPPORT       0.61      0.97      0.75        31

                         accuracy                           0.64        56
                        macro avg       0.73      0.60      0.56        56
                     weighted avg       0.72      0.64      0.58        56



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


reddit-posts SEEKING EXPERIENCES 150 38
                         precision    recall  f1-score   support

NOT SEEKING EXPERIENCES       0.59      0.76      0.67        17
    SEEKING EXPERIENCES       0.75      0.57      0.65        21

               accuracy                           0.66        38
              macro avg       0.67      0.67      0.66        38
           weighted avg       0.68      0.66      0.66        38



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


reddit-posts SEEKING EMOTIONAL SUPPORT 94 24
                               precision    recall  f1-score   support

NOT SEEKING EMOTIONAL SUPPORT       0.78      0.54      0.64        13
    SEEKING EMOTIONAL SUPPORT       0.60      0.82      0.69        11

                     accuracy                           0.67        24
                    macro avg       0.69      0.68      0.66        24
                 weighted avg       0.70      0.67      0.66        24



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


reddit-comments PROVIDING EXPERIENCES 211 53
                           precision    recall  f1-score   support

NOT PROVIDING EXPERIENCES       0.90      0.93      0.92        29
    PROVIDING EXPERIENCES       0.91      0.88      0.89        24

                 accuracy                           0.91        53
                macro avg       0.91      0.90      0.90        53
             weighted avg       0.91      0.91      0.91        53



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


reddit-comments PROVIDING INFORMATIONAL SUPPORT 203 51
                                     precision    recall  f1-score   support

NOT PROVIDING INFORMATIONAL SUPPORT       0.85      0.79      0.81        28
    PROVIDING INFORMATIONAL SUPPORT       0.76      0.83      0.79        23

                           accuracy                           0.80        51
                          macro avg       0.80      0.81      0.80        51
                       weighted avg       0.81      0.80      0.80        51



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


twitter-posts PROVIDING INFORMATIONAL SUPPORT 153 39
                                     precision    recall  f1-score   support

NOT PROVIDING INFORMATIONAL SUPPORT       0.90      0.86      0.88        21
    PROVIDING INFORMATIONAL SUPPORT       0.84      0.89      0.86        18

                           accuracy                           0.87        39
                          macro avg       0.87      0.87      0.87        39
                       weighted avg       0.87      0.87      0.87        39



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


twitter-posts DISCOURSE 150 38
               precision    recall  f1-score   support

    DISCOURSE       0.94      0.68      0.79        22
NOT DISCOURSE       0.68      0.94      0.79        16

     accuracy                           0.79        38
    macro avg       0.81      0.81      0.79        38
 weighted avg       0.83      0.79      0.79        38



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


twitter-replies PROVIDING EXPERIENCES 208 52
                           precision    recall  f1-score   support

NOT PROVIDING EXPERIENCES       0.81      0.68      0.74        25
    PROVIDING EXPERIENCES       0.74      0.85      0.79        27

                 accuracy                           0.77        52
                macro avg       0.78      0.77      0.77        52
             weighted avg       0.77      0.77      0.77        52



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


twitter-replies DISCOURSE 126 32
               precision    recall  f1-score   support

    DISCOURSE       0.81      0.93      0.87        14
NOT DISCOURSE       0.94      0.83      0.88        18

     accuracy                           0.88        32
    macro avg       0.88      0.88      0.87        32
 weighted avg       0.88      0.88      0.88        32



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


twitter-replies PROVIDING INFORMATIONAL SUPPORT 92 24
                                     precision    recall  f1-score   support

NOT PROVIDING INFORMATIONAL SUPPORT       0.64      0.82      0.72        11
    PROVIDING INFORMATIONAL SUPPORT       0.80      0.62      0.70        13

                           accuracy                           0.71        24
                          macro avg       0.72      0.72      0.71        24
                       weighted avg       0.73      0.71      0.71        24



In [108]:
import dill

dill.dump(social_goal_id_label_dict, open(output_directory_path + '/social_goal_id_label_dict.dill', 'wb'))