# Imports

In [1]:
from collections import defaultdict
import random
import re

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)

# Directory paths

In [2]:
data_directory_path   = '/Users/maria/Documents/data/birth-control'

# Load data

In [3]:
label_df = pd.read_csv(data_directory_path + '/labeling/label-discourse/final.majority_labels.csv')
len(label_df.index)

750

In [4]:
test_df = pd.read_csv(data_directory_path + '/labeling/label-discourse/sampled-sentences.test.csv')
len(test_df)

11993

In [5]:
label_df.sample(3)

Unnamed: 0.1,Unnamed: 0,ID,Label,Text,Source,Decision
379,379,578001012885041200,SHARING-DESCRIBING ADDITIONAL RESEARCH,Colorado Politicians Are Sporting IUD Shaped E...,twitter-posts,two or more
1,1,inocqw,SEEKING EXPERIENCES,Has anyone taken Cytotec before getting an IUD...,reddit-posts,two or more
12,12,emqnn9,SEEKING INFORMATION,"If not, just ignore this, but here is my quest...",reddit-posts,two or more


In [6]:
test_df.sample(3)

Unnamed: 0.1,Unnamed: 0,text,meta
5258,5258,Mommy Fan Question: im just wondering if anyon...,"{'ID': 345567629320286210, 'Source': 'twitter-..."
6278,6278,Now I’m racing to get an IUD.,"{'ID': 810923179074064384, 'Source': 'twitter-..."
3049,3049,i don’t know why i’m so anxious about it!,"{'ID': 'ejep66m', 'Source': 'reddit-comments',..."


In [7]:
combined_df = pd.read_csv(data_directory_path + '/combined.csv')
len(combined_df.index)

  exec(code_obj, self.user_global_ns, self.user_ns)


1063672

In [8]:
combined_df.sample(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,created_utc,text,title,year,month,url,link_flair_text,...,headache,hair loss,fatigue,nausea,discharge,heart attack,infection,blood pressure,skin,bleeding
282940,213982,2483,ga0avem,1603601000.0,I’m so sorry! I was on Trinessa for a few mont...,,2020,10.0,,,...,0,0,0,0,0,0,0,0,0,0
580170,246300,309289,1094026783987548161,,Thank Ima get this Nexplanon Removed,,2019,2.0,,,...,0,0,0,0,0,0,0,0,0,0
430396,96526,115181,771774574220500992,,I'm celebrating Labor Day weekend by getting a...,,2016,9.0,,,...,0,0,0,0,0,0,0,0,0,0


In [10]:
combined_df['divide_by_me'] = combined_df['source'] + '_' + combined_df['text_type']

combined_df['divide_by_me'].value_counts()

twitter-posts_pill         226762
twitter-posts_iud          217728
twitter-replies_iud        147680
reddit-comments_iud        117631
reddit-comments_pill       117283
twitter-posts_implant       55306
twitter-replies_pill        39039
reddit-posts_pill           36921
reddit-comments_implant     29998
twitter-replies_implant     25177
reddit-posts_iud            24657
webmd-reviews_pill          14873
reddit-posts_implant         7380
webmd-reviews_iud            2354
webmd-reviews_implant         883
Name: divide_by_me, dtype: int64

In [13]:
df_to_classify =  combined_df.groupby('divide_by_me').apply(lambda x: x.sample(n=800)).reset_index(drop = True)
len(df_to_classify)

12000

In [14]:
df_to_classify['divide_by_me'].value_counts()

reddit-comments_implant    800
reddit-comments_iud        800
reddit-comments_pill       800
reddit-posts_implant       800
reddit-posts_iud           800
reddit-posts_pill          800
twitter-posts_implant      800
twitter-posts_iud          800
twitter-posts_pill         800
twitter-replies_implant    800
twitter-replies_iud        800
twitter-replies_pill       800
webmd-reviews_implant      800
webmd-reviews_iud          800
webmd-reviews_pill         800
Name: divide_by_me, dtype: int64

# Try training a simple model

In [15]:
def binarize_label(label, target_label):
    if label == target_label:
        return 1
    return 0

In [16]:
from sklearn.metrics import f1_score

In [17]:
# TODO: predict for a smaller set that is balanced across side effects

In [18]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 30:

        _train_df, _test_df = train_test_split(_binarized_df, test_size=0.33, random_state=42)

        _train_texts = _train_df['Text']
        _test_texts = _test_df['Text']

        _train_sources = _train_df['Source'].tolist()
        _test_sources = _test_df['Source'].tolist()

        # _train_texts = [_train_sources[i] + ' ' + ' '.join([w.text.lower() for w in nlp(t)]) for i, t in enumerate(_train_texts)]
        # _test_texts = [_train_sources[i] + ' ' + ' '.join([w.text.lower() for w in nlp(t)]) for i, t in enumerate(_test_texts)]

        _train_texts = [' '.join([w.text.lower() for w in nlp(t)]) for i, t in enumerate(_train_texts)]
        _test_texts = [' '.join([w.text.lower() for w in nlp(t)]) for i, t in enumerate(_test_texts)]

        _train_labels = _train_df['Label']
        _test_labels = _test_df['Label']

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts)
        _X_test = _vectorizer.transform(_test_texts)

        _model = LogisticRegression(C=1).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print(_target_label)
        print(classification_report(_test_labels, _predictions))

        if f1_score(_test_labels, _predictions) >= 0.65:

            _test_texts = [' '.join([w.text.lower() for w in nlp(r['text'])]) for i, r in df_to_classify.iterrows()]
            _X_test = _vectorizer.transform(_test_texts)
            _predictions = _model.predict(_X_test)
            df_to_classify[_target_label] = _predictions

SHARING PERSONAL EXPERIENCES
              precision    recall  f1-score   support

           0       0.84      0.72      0.78       104
           1       0.73      0.85      0.79        93

    accuracy                           0.78       197
   macro avg       0.79      0.79      0.78       197
weighted avg       0.79      0.78      0.78       197

SEEKING EXPERIENCES
              precision    recall  f1-score   support

           0       0.78      1.00      0.88         7
           1       1.00      0.75      0.86         8

    accuracy                           0.87        15
   macro avg       0.89      0.88      0.87        15
weighted avg       0.90      0.87      0.87        15

SHARING PERSONAL BACKGROUND
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         8
           1       0.56      0.56      0.56         9

    accuracy                           0.53        17
   macro avg       0.53      0.53      0.53        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SHARING FUTURE PLANS
              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       1.00      0.67      0.80         9

    accuracy                           0.81        16
   macro avg       0.85      0.83      0.81        16
weighted avg       0.87      0.81      0.81        16

SEEKING INFORMATION
              precision    recall  f1-score   support

           0       0.70      0.64      0.67        11
           1       0.67      0.73      0.70        11

    accuracy                           0.68        22
   macro avg       0.68      0.68      0.68        22
weighted avg       0.68      0.68      0.68        22

SHARING OPINIONS AND PREFERENCES
              precision    recall  f1-score   support

           0       0.91      0.45      0.61        22
           1       0.60      0.95      0.73        19

    accuracy                           0.68        41
   macro avg       0.75      0.70      0.67        41


In [None]:
combined_df.to_csv(data_directory_path + '/combined.balanced.predicted_labels.csv')

In [12]:
# def process_string(text):
#     text = re.sub('[0-9]+', 'NUM', text)
#     text = ' '.join(text.split())
#     return text

In [13]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 50:

        _train_texts = _binarized_df['Text']
        _train_labels = _binarized_df['Label']

        _test_texts = test_df['text']

        _train_texts_processed = [' '.join([w.text.lower() for w in nlp(t)]) for t in _train_texts ]
        _test_texts_processed = [' '.join([w.text.lower() for w in nlp(t)]) for t in _test_texts ]

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts_processed)
        _X_test = _vectorizer.transform(_test_texts_processed)

        _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print('---------------------------------')
        print(_target_label)
        print('---------------------------------')
        print()

        _positive_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 1]
        _negative_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 0]

        print('POSITIVE')
        for _text in random.sample(_positive_texts, 10):
            print(' '.join(_text.split()))
        
        print()


---------------------------------
SHARING PERSONAL EXPERIENCES
---------------------------------

POSITIVE
i AM HORRIBLE at remembering to take pills, and I had a way worse experience with the depo shot (weight gain and horrible hormones!)
Every office/clinic may be different.
Hope this helps I had it for two years before giving up with it and have heard other people having the same problem with it.
The period is longer, which can be irksome, but much lighter.
Tmi, I change a super plus almost hourly.
I've been on this for almost a year and have no complaints!
but all was fine.
it makes me a little nervous because i keep feeling as if i could be pregnant because i take this drug different times of the day..
I have the Mirena IUD, and I haven't noticed it make any impact one way or another for acne, unfortunately.
I have noticed that since using this product, I have less periods and when I do get them, they aren't very heavy.

---------------------------------
SHARING INFORMATION
------