In [15]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nlp_pipeline import extract_bow_from_raw_text
from nltk.stem.wordnet import WordNetLemmatizer


from nltk.util import ngrams
from nltk import pos_tag

from nltk import RegexpParser

import os  

import unicodedata
import sys

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import pandas as pd
from math import log
import string
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
good_qual = pd.read_csv('good_qual_drugs.csv')
poor_qual = pd.read_csv('poor_qual_drugs.csv')

joined = pd.concat([good_qual, poor_qual])
joined['score'] = joined['rating'].apply(lambda x: 1 if x > 5 else 0)


In [31]:
good_corpus =[good_qual['review'][i] for i in range(len(good_qual))]
good_corpus = ','.join(good_corpus)

bad_corpus = [poor_qual['review'][i] for i in range(len(poor_qual))]
bad_corpus = ','.join(bad_corpus)

In [29]:
joined_corpus = good_corpus + bad_corpus

In [32]:
joined_corpus = ','.join(joined_corpus)

In [19]:
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

def vectorize(documents):
    '''Input: tokenized documents
    Compute Bag-of-Word, TF, TFIDF using sklearn
    Return: bow,tf,tfidf'''
    
    corpus = [' '.join(row) for row in documents]

    cv = CountVectorizer(ngram_range=(1,1))
    tf = cv.fit_transform(corpus).todense()
    bow = cv.vocabulary_
    
    tv = TfidfVectorizer()
    tfidf = tv.fit_transform(corpus).todense()

    return bow,tf,tfidf


In [None]:
rev_tokens = sent_tokenize(remove_accents(joined_corpus)) #tokenize each line of the review column by removing accents into sentences


rev_tokens = [rev_tokens for rev_tokens in map(word_tokenize, rev_tokens)] #tokenize each sentence into its words

rev_tokens_lower = [[word.lower() for word in sent]
                 for sent in rev_tokens] # make all words lower case


stopwords_ = set(stopwords.words('english'))
punctuation_ = set(string.punctuation)
punctuation_.update(['...', '```'])

rev_tokens = list(map(filter_tokens, rev_tokens_lower)) 

wordnet = WordNetLemmatizer()

docs_lemma = [[wordnet.lemmatize(word) for word in sent] for sent in rev_tokens]





In [20]:
# good_rev_tokens = sent_tokenize(remove_accents(good_corpus))

# poor_rev_tokens = sent_tokenize(remove_accents(bad_corpus))

# good_tokens = [good_rev_tokens for good_rev_tokens in map(word_tokenize, good_rev_tokens)]

# list(enumerate(good_tokens))

# poor_tokens = [poor_rev_tokens for poor_rev_tokens in map(word_tokenize, poor_rev_tokens)]

# list(enumerate(poor_tokens))



# good_tokens_lower = [[word.lower() for word in sent]
#                  for sent in good_rev_tokens]

# poor_tokens_lower = [[word.lower() for word in sent]
#                  for sent in poor_rev_tokens]


# good_tokens_filtered = list(map(filter_tokens, good_tokens_lower))


# poor_tokens_filtered = list(map(filter_tokens, poor_tokens_lower))

In [41]:
docs = [i for i in joined['review']]



['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\rWe have tried many different medications and so far this is the most effective."',
 '"This is my first time using any form of birth control. I\'m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subs

In [None]:
bows = [extract_bow_from_raw_text(i) for i in docs]
bows

[['b', 'side', 'effect', 'combin', 'bystol', 'mg', 'fish', 'oil'],
 ['b',
  'son',
  'fourth',
  'week',
  'intuniv',
  'concern',
  'last',
  'week',
  'highest',
  'dose',
  'day',
  'bed',
  'cranki',
  'hour',
  'drive',
  'home',
  'school',
  'vacat',
  'unusu',
  'doctor',
  'monday',
  'morn',
  'few',
  'day',
  'school',
  'morn',
  'last',
  'day',
  'problem',
  'free',
  'agreeabl',
  'emot',
  'good',
  'thing',
  'less',
  'cranki',
  'thing',
  'overal',
  'behavior',
  '\\r\\rwe',
  'mani',
  'differ',
  'medic',
  'effect'],
 ['b',
  'first',
  'time',
  'form',
  'birth',
  'control',
  'i\\',
  'glad',
  'patch',
  'month',
  'first',
  'libido',
  'onli',
  'downsid',
  'period',
  '5-6',
  'day',
  'exact',
  'period',
  '3-4',
  'day',
  'max',
  'cramp',
  'first',
  'day',
  'period',
  'cramp',
  'birth',
  'control',
  'other',
  'happi',
  'patch'],
 ['b',
  'suboxon',
  'life',
  'i\\',
  'job',
  'money',
  'pocket',
  'save',
  'none',
  'suboxon',
  'spe

In [81]:
joined.to_csv('poor_good.csv')

In [80]:
y_target = joined['score']