# Imports

In [8]:
from collections import defaultdict
import random
import re

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)

# Directory paths

In [5]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'

# Load data

In [6]:
label_df = pd.read_csv(data_directory_path + '/labeling/label-sentences/labeled_by_maria.all.csv')
len(label_df.index)

895

In [7]:
test_df = pd.read_csv(data_directory_path + '/labeling/label-sentences/sampled-sentences.test.csv')
len(test_df)

11993

In [None]:
label_df.sample(3)

Unnamed: 0,Source,ID,Label,Text
516,twitter-posts,300336102865248260,narrating personal experiences,getting the contraceptive implant in possibly ...
96,reddit-posts,gjld30,narrating personal experiences,Although sometimes I feel like I am gonna have...
39,reddit-posts,c2gb30,seeking information (advice),Can I take these steri-strips (butterfly stitc...


In [None]:
test_df.sample(3)

Unnamed: 0.1,Unnamed: 0,text,meta
3859,3859,The only thing that works that late is the cop...,"{'ID': 'eyx0ilp', 'Source': 'reddit-comments',..."
11745,11745,i have been on this for 7 months and i bleed f...,"{'ID': 'w11392', 'Source': 'webmd-reviews', 'M..."
10011,10011,The entire time was awful.,"{'ID': 'w12188', 'Source': 'webmd-reviews', 'M..."


# Try training a simple model

In [9]:
def binarize_label(label, target_label):
    if label == target_label:
        return 1
    return 0

In [12]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 50:

        _train_df, _test_df = train_test_split(_binarized_df, test_size=0.33, random_state=42)

        _train_texts = _train_df['Text']
        _test_texts = _test_df['Text']

        _train_texts = [' '.join([w.text.lower() for w in nlp(t)]) for t in _train_texts ]
        _test_texts = [' '.join([w.text.lower() for w in nlp(t)]) for t in _test_texts ]

        _train_labels = _train_df['Label']
        _test_labels = _test_df['Label']

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts)
        _X_test = _vectorizer.transform(_test_texts)

        _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print(_target_label)
        print(classification_report(_test_labels, _predictions))

negative self-disclosure
              precision    recall  f1-score   support

           0       0.73      0.47      0.57        17
           1       0.53      0.77      0.62        13

    accuracy                           0.60        30
   macro avg       0.63      0.62      0.60        30
weighted avg       0.64      0.60      0.59        30

narrating personal experiences
              precision    recall  f1-score   support

           0       0.82      0.83      0.82        88
           1       0.84      0.83      0.83        94

    accuracy                           0.83       182
   macro avg       0.83      0.83      0.83       182
weighted avg       0.83      0.83      0.83       182

seeking experiences
              precision    recall  f1-score   support

           0       0.85      0.73      0.79        15
           1       0.73      0.85      0.79        13

    accuracy                           0.79        28
   macro avg       0.79      0.79      0.79        2

In [13]:
# def process_string(text):
#     text = re.sub('[0-9]+', 'NUM', text)
#     text = ' '.join(text.split())
#     return text

In [14]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 50:

        _train_texts = _binarized_df['Text']
        _train_labels = _binarized_df['Label']

        _test_texts = test_df['text']

        _train_texts_processed = [' '.join([w.text.lower() for w in nlp(t)]) for t in _train_texts ]
        _test_texts_processed = [' '.join([w.text.lower() for w in nlp(t)]) for t in _test_texts ]

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts_processed)
        _X_test = _vectorizer.transform(_test_texts_processed)

        _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print('---------------------------------')
        print(_target_label)
        print('---------------------------------')
        print()

        _positive_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 1]
        _negative_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 0]

        print('POSITIVE')
        for _text in random.sample(_positive_texts, 10):
            print(' '.join(_text.split()))
        
        print()


---------------------------------
negative self-disclosure
---------------------------------

POSITIVE
I've been on Nexplanon for about 10 months now, the first few weeks I noticed headaches, mood swings, and I started spotting and up until now I am still spotting, I had my first real period this month that lasted maybe 4-5 days and after it went back to spotting.
I've read enough to know that this is not the norm so I can't shake the feeling that the worst is yet to come so I figured I would ask...
Just had the IUD taken out wonder how fast I can get pregnant??
Like I said this is still kind of new to me
Thx SO much!!!
Sprintec has made me gain weight and have acne!
FEMINIST IDEAS: encouraged social policy makers to clean up their act.
Turns out getting testosterone shots changes your mood, who'd a thunk it?)
When the rings and patches came along I just didn't trust them much and of course everyone's heard the IUD/coil horror stories.
She referred me to a psych too, and reminded me to