# Imports

In [1]:
from collections import defaultdict
import os
import random
import re

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; later cells use `nlp` to split
# texts into tokens before they are vectorized.
nlp = spacy.load('en_core_web_sm')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look for any figure drawn below: 'ticks' style, fonts scaled by 1.2.
sns.set(style='ticks', font_scale=1.2)

# Directory paths

In [2]:
# Root folder that every data file in this notebook is read from / written to.
# Set the BIRTH_CONTROL_DATA_DIR environment variable to run the notebook on
# another machine; the original hard-coded location stays as the fallback so
# existing runs are unaffected.
data_directory_path = os.environ.get(
    'BIRTH_CONTROL_DATA_DIR',
    '/Users/maria/Documents/data/birth-control',
)

# Load data

In [19]:
# Read final.majority_labels.csv into label_df, then display how many rows it has.
label_df = pd.read_csv(
    ''.join((data_directory_path, '/labeling/label-discourse/final.majority_labels.csv'))
)
label_df.shape[0]

750

In [20]:
# Read sampled-sentences.test.csv into test_df, then display how many rows it has.
test_df = pd.read_csv(
    ''.join((data_directory_path, '/labeling/label-discourse/sampled-sentences.test.csv'))
)
test_df.shape[0]

11993

In [21]:
# Eyeball three randomly chosen rows of the labelled data.
label_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,ID,Label,Text,Source,Decision
334,334,1202753785929568300,SHARING INFORMATION,Birth control pill shrinks part of brain that ...,twitter-posts,two or more
292,292,972067472571498500,SHARING-DESCRIBING ADDITIONAL RESEARCH,Woman has contraceptive implant 'lost' in her ...,twitter-posts,two or more
384,384,559796711871483900,META DISCUSSION,Big debate: Is 10 years old too young to be gi...,twitter-posts,two or more


In [22]:
# Eyeball three randomly chosen rows of the unlabelled test sentences.
test_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,text,meta
10661,10661,"I had absolutly no pain, and only minor cramps...","{'ID': 'w10716', 'Source': 'webmd-reviews', 'M..."
5740,5740,Seachem Paraguard 100ml Freshwater Saltwater A...,"{'ID': 414120642154217472, 'Source': 'twitter-..."
857,857,Pregant with an IUD,"{'ID': 'hkbes5', 'Source': 'reddit-posts', 'Me..."


In [28]:
# Read the full combined corpus and display its row count.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning fragment captured in this cell's output looks like
# pandas' mixed-dtype DtypeWarning, which this option avoids - confirm on re-run.
combined_df = pd.read_csv(data_directory_path + '/combined.csv', low_memory=False)
len(combined_df.index)

  exec(code_obj, self.user_global_ns, self.user_ns)


1063672

In [29]:
# Eyeball three randomly chosen rows of the combined corpus.
combined_df.sample(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,created_utc,text,title,year,month,url,link_flair_text,...,headache,hair loss,fatigue,nausea,discharge,heart attack,infection,blood pressure,skin,bleeding
916308,82642,106625,804097623003107328,,"yasss giirl, let them rip the iud out.",,2016,11.0,,,...,0,0,0,0,0,0,0,0,0,0
120177,51219,434,dbp1oc0,1482880000.0,The one week wait is probably due to both prev...,,2016,12.0,,,...,0,0,0,0,0,0,1,0,0,0
430945,97075,115786,770125479839985664,,Catholic Hospital Refuses to Remove IUD From B...,,2016,8.0,,,...,0,0,0,0,0,0,0,0,0,1


# Try training a simple model

In [23]:
def binarize_label(label, target_label):
    """Map a label to 1 when it equals ``target_label`` and to 0 otherwise."""
    return 1 if label == target_label else 0

In [32]:
from sklearn.metrics import f1_score

In [34]:
# One-vs-rest experiment. For every discourse label: build a class-balanced
# binary data set, train a TF-IDF + logistic-regression classifier, print its
# held-out classification report and, when the positive-class F1 is high enough,
# write that model's predictions for the whole corpus into a new combined_df column.

_MIN_BALANCED_ROWS = 30   # a label needs MORE than this many balanced rows to be modelled
_MIN_F1_TO_APPLY = 0.65   # held-out F1 required before labelling combined_df


def _tokenize_lowercase(texts):
    """Return every text as its lower-cased spaCy tokens joined by single spaces.

    Only the token text is used, so the texts are sent through the tokenizer
    alone (batched via ``nlp.tokenizer.pipe``) rather than the full pipeline;
    tagging, parsing and NER contribute nothing to the output.
    NOTE(review): assumes no component of the loaded pipeline re-tokenizes
    documents (true for the stock en_core_web_sm components) - confirm.
    """
    return [' '.join(token.text.lower() for token in doc) for doc in nlp.tokenizer.pipe(texts)]


# Tokenized corpus texts. They do not depend on the label, so they are built
# once - lazily, the first time a model qualifies - and reused afterwards
# instead of re-tokenizing every row for each qualifying label.
_combined_texts = None

for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(binarize_label, target_label=_target_label)

    # Drop the negative rows of any ID that also has a positive row, so the same
    # ID never appears on both sides.
    _positive_ids = _binarized_df.loc[_binarized_df['Label'] == 1, 'ID']
    _conflicting = _binarized_df['ID'].isin(_positive_ids) & (_binarized_df['Label'] == 0)
    _binarized_df = _binarized_df[~_conflicting]

    # Balance the classes by down-sampling both to the size of the smaller one.
    # Sampling "number of positives" rows from each class raised a ValueError
    # whenever fewer negatives than positives were left; when negatives are the
    # majority the result is the same sample as before.
    _class_counts = _binarized_df['Label'].value_counts()
    if len(_class_counts) < 2:
        # Only one class present: nothing to train a binary classifier on.
        continue
    _binarized_df = _binarized_df.groupby('Label').sample(n=int(_class_counts.min()), random_state=1)

    if len(_binarized_df.index) <= _MIN_BALANCED_ROWS:
        continue

    _train_df, _test_df = train_test_split(_binarized_df, test_size=0.33, random_state=42)

    # (A disabled experiment that prefixed each text with its 'Source' value
    # used to live here; it is intentionally left out.)
    _train_texts = _tokenize_lowercase(_train_df['Text'])
    _test_texts = _tokenize_lowercase(_test_df['Text'])

    _train_labels = _train_df['Label']
    _test_labels = _test_df['Label']

    # Fit the vocabulary / IDF weights on the training split only.
    _vectorizer = TfidfVectorizer()
    _X_train = _vectorizer.fit_transform(_train_texts)
    _X_test = _vectorizer.transform(_test_texts)

    _model = LogisticRegression(C=1).fit(_X_train, _train_labels)
    _predictions = _model.predict(_X_test)

    print(_target_label)
    print(classification_report(_test_labels, _predictions))

    if f1_score(_test_labels, _predictions) >= _MIN_F1_TO_APPLY:

        if _combined_texts is None:
            # Missing texts are treated as empty strings so the tokenizer
            # never receives a NaN.
            _combined_texts = _tokenize_lowercase(combined_df['text'].fillna('').astype(str))

        # Separate names keep the held-out _X_test / _predictions above intact.
        _X_combined = _vectorizer.transform(_combined_texts)
        combined_df[_target_label] = _model.predict(_X_combined)

SHARING PERSONAL EXPERIENCES
              precision    recall  f1-score   support

           0       0.84      0.72      0.78       104
           1       0.73      0.85      0.79        93

    accuracy                           0.78       197
   macro avg       0.79      0.79      0.78       197
weighted avg       0.79      0.78      0.78       197



KeyboardInterrupt: 

In [None]:
# Persist the corpus together with the predicted-label columns.
# index=False keeps the row index out of the file: combined.csv already carries
# several "Unnamed: 0*" columns, which look like indexes written by earlier
# saves, and writing the index again would add one more.
combined_df.to_csv(data_directory_path + '/combined.predicted_labels.csv', index=False)

In [12]:
# def process_string(text):
#     text = re.sub('[0-9]+', 'NUM', text)
#     text = ' '.join(text.split())
#     return text

In [13]:
# Qualitative check. For every discourse label with enough data: train a
# TF-IDF + logistic-regression classifier on ALL balanced labelled rows, predict
# the unlabelled test sentences and print a few sentences predicted positive.

_MIN_BALANCED_ROWS_FOR_PREVIEW = 50   # a label needs MORE than this many balanced rows
_N_PREVIEW_EXAMPLES = 10              # at most this many positive examples are printed

# The test sentences are the same for every label, so tokenize them once here
# rather than once per label. Only token text is needed, hence tokenizer-only.
# NOTE(review): assumes no component of the loaded pipeline re-tokenizes
# documents (true for the stock en_core_web_sm components) - confirm.
_test_texts = test_df['text']
_test_texts_processed = [' '.join(token.text.lower() for token in doc)
                         for doc in nlp.tokenizer.pipe(_test_texts)]

for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(binarize_label, target_label=_target_label)

    # Drop the negative rows of any ID that also has a positive row.
    _positive_ids = _binarized_df.loc[_binarized_df['Label'] == 1, 'ID']
    _conflicting = _binarized_df['ID'].isin(_positive_ids) & (_binarized_df['Label'] == 0)
    _binarized_df = _binarized_df[~_conflicting]

    # Down-sample both classes to the size of the smaller one; asking each
    # class for "number of positives" rows raised a ValueError when fewer
    # negatives than positives were left.
    _class_counts = _binarized_df['Label'].value_counts()
    if len(_class_counts) < 2:
        # Only one class present: nothing to train a binary classifier on.
        continue
    _binarized_df = _binarized_df.groupby('Label').sample(n=int(_class_counts.min()), random_state=1)

    if len(_binarized_df.index) <= _MIN_BALANCED_ROWS_FOR_PREVIEW:
        continue

    _train_texts = _binarized_df['Text']
    _train_labels = _binarized_df['Label']

    _train_texts_processed = [' '.join(token.text.lower() for token in doc)
                              for doc in nlp.tokenizer.pipe(_train_texts)]

    _vectorizer = TfidfVectorizer()
    _X_train = _vectorizer.fit_transform(_train_texts_processed)
    _X_test = _vectorizer.transform(_test_texts_processed)

    _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
    _predictions = _model.predict(_X_test)

    print('---------------------------------')
    print(_target_label)
    print('---------------------------------')
    print()

    # Show the original (untokenized) sentences that were predicted positive.
    _positive_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 1]

    print('POSITIVE')
    # Cap the sample size: random.sample raises when asked for more items than exist.
    for _text in random.sample(_positive_texts, min(_N_PREVIEW_EXAMPLES, len(_positive_texts))):
        print(' '.join(_text.split()))

    print()


---------------------------------
SHARING PERSONAL EXPERIENCES
---------------------------------

POSITIVE
i AM HORRIBLE at remembering to take pills, and I had a way worse experience with the depo shot (weight gain and horrible hormones!)
Every office/clinic may be different.
Hope this helps I had it for two years before giving up with it and have heard other people having the same problem with it.
The period is longer, which can be irksome, but much lighter.
Tmi, I change a super plus almost hourly.
I've been on this for almost a year and have no complaints!
but all was fine.
it makes me a little nervous because i keep feeling as if i could be pregnant because i take this drug different times of the day..
I have the Mirena IUD, and I haven't noticed it make any impact one way or another for acne, unfortunately.
I have noticed that since using this product, I have less periods and when I do get them, they aren't very heavy.

---------------------------------
SHARING INFORMATION
------