In [1]:
  
import numpy as np
import pandas as pd
import pickle
import string 
import html
import ast
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
stpwrds = stopwords.words("english")
from nltk.stem import WordNetLemmatizer

def encode_reviews(review):
    return html.unescape(review)

def rm_stopword(r):
    r_n = " ".join([i for i in r if i not in stpwrds])
    return r_n
    
def lem(tokens):
    l = WordNetLemmatizer()
    out = [l.lemmatize(word) for word in tokens]
    return out

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\makul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
 data = pd.read_csv('rawdata.csv')

In [3]:
data['review'] = data['review'].apply(encode_reviews)

In [4]:
data['review']

0        "I've tried a few antidepressants over the yea...
1        "My son has Crohn's disease and has done very ...
2                            "Quick reduction of symptoms"
3        "Contrave combines drugs that were used for al...
4        "I have been on this birth control for one cyc...
5        "4 days in on first 2 weeks.  Using on arms an...
6        "I've had the copper coil for about 3 months n...
7        "This has been great for me. I've been on it f...
8        "Ive been on Methadone for over ten years and ...
9        "I was on this pill for almost two years. It d...
10       "Holy Hell is exactly how I feel. I had been t...
11       "Honestly its day one on the 3 day treatment. ...
12       "This is a waste of money.  Did not curb my ap...
13                      "No problems, watch what you eat."
14       "Ditto on rebound sleepless when discontinued....
15       "A doctor in the ER prescribed me 200 mg of Pr...
16       "I smoked for 50+ years.  Took it for one week.

In [5]:
reviews = data['review'].str.replace("[^a-zA-Z]", " ")

In [6]:
reviews = reviews.apply(lambda r: " ".join([w for w in r.split() if len(w)>2]))

In [7]:
reviews

0        tried few antidepressants over the years cital...
1        son has Crohn disease and has done very well t...
2                                 Quick reduction symptoms
3        Contrave combines drugs that were used for alc...
4        have been this birth control for one cycle Aft...
5        days first weeks Using arms and face Put vasel...
6        had the copper coil for about months now was r...
7        This has been great for been for weeks and the...
8        Ive been Methadone for over ten years and curr...
9        was this pill for almost two years does work f...
10       Holy Hell exactly how feel had been taking Bri...
11       Honestly its day one the day treatment Yes bur...
12       This waste money Did not curb appetite nor did...
13                             problems watch what you eat
14       Ditto rebound sleepless when discontinued have...
15       doctor the prescribed Provigil when was first ...
16       smoked for years Took for one week and that wa.

In [8]:
reviews = [rm_stopword(r.split()) for r in reviews]

In [9]:
reviews = [r.lower() for r in reviews]

In [10]:
reviews

['tried antidepressants years citalopram fluoxetine amitriptyline none helped depression insomnia anxiety doctor suggested changed onto mirtazapine medicine saved life thankfully side effects especially common weight gain actually lost alot weight still suicidal thoughts mirtazapine saved',
 'son crohn disease done well asacol complaints shows side effects taken many nine tablets per day one time happy results reducing bouts diarrhea drastically',
 'quick reduction symptoms',
 'contrave combines drugs used alcohol smoking opioid cessation people lose weight also helps control eating doubt obesity caused sugar carb addiction powerful drug taking five days good news seems work immediately feel hungry want food really care eat fill stomach since days know lost weight scale clothes feel little looser maybe pound two hoping months medication develop healthier habits continue without aid contrave',
 'birth control one cycle after reading reviews type similar birth controls bit apprehensive s

In [11]:
reviews = pd.Series(reviews)

In [12]:
reviews

0        tried antidepressants years citalopram fluoxet...
1        son crohn disease done well asacol complaints ...
2                                 quick reduction symptoms
3        contrave combines drugs used alcohol smoking o...
4        birth control one cycle after reading reviews ...
5        days first weeks using arms face put vaseline ...
6        copper coil months really excited thought taki...
7        this great weeks last week headaches went away...
8        ive methadone ten years currently trying get d...
9        pill almost two years work far getting pregnan...
10       holy hell exactly feel taking brisdelle years ...
11       honestly day one day treatment yes burns bit l...
12       this waste money did curb appetite make feel full
13                                      problems watch eat
14       ditto rebound sleepless discontinued done stra...
15       doctor prescribed provigil first diagnosed nar...
16       smoked years took one week think possible quit.

In [13]:
tokenized = reviews.apply(lambda r: r.split())

In [14]:
tokenized

0        [tried, antidepressants, years, citalopram, fl...
1        [son, crohn, disease, done, well, asacol, comp...
2                             [quick, reduction, symptoms]
3        [contrave, combines, drugs, used, alcohol, smo...
4        [birth, control, one, cycle, after, reading, r...
5        [days, first, weeks, using, arms, face, put, v...
6        [copper, coil, months, really, excited, though...
7        [this, great, weeks, last, week, headaches, we...
8        [ive, methadone, ten, years, currently, trying...
9        [pill, almost, two, years, work, far, getting,...
10       [holy, hell, exactly, feel, taking, brisdelle,...
11       [honestly, day, one, day, treatment, yes, burn...
12       [this, waste, money, did, curb, appetite, make...
13                                  [problems, watch, eat]
14       [ditto, rebound, sleepless, discontinued, done...
15       [doctor, prescribed, provigil, first, diagnose...
16       [smoked, years, took, one, week, think, possib.

In [15]:
reviews

0        tried antidepressants years citalopram fluoxet...
1        son crohn disease done well asacol complaints ...
2                                 quick reduction symptoms
3        contrave combines drugs used alcohol smoking o...
4        birth control one cycle after reading reviews ...
5        days first weeks using arms face put vaseline ...
6        copper coil months really excited thought taki...
7        this great weeks last week headaches went away...
8        ive methadone ten years currently trying get d...
9        pill almost two years work far getting pregnan...
10       holy hell exactly feel taking brisdelle years ...
11       honestly day one day treatment yes burns bit l...
12       this waste money did curb appetite make feel full
13                                      problems watch eat
14       ditto rebound sleepless discontinued done stra...
15       doctor prescribed provigil first diagnosed nar...
16       smoked years took one week think possible quit.

In [16]:
tokenized

0        [tried, antidepressants, years, citalopram, fl...
1        [son, crohn, disease, done, well, asacol, comp...
2                             [quick, reduction, symptoms]
3        [contrave, combines, drugs, used, alcohol, smo...
4        [birth, control, one, cycle, after, reading, r...
5        [days, first, weeks, using, arms, face, put, v...
6        [copper, coil, months, really, excited, though...
7        [this, great, weeks, last, week, headaches, we...
8        [ive, methadone, ten, years, currently, trying...
9        [pill, almost, two, years, work, far, getting,...
10       [holy, hell, exactly, feel, taking, brisdelle,...
11       [honestly, day, one, day, treatment, yes, burn...
12       [this, waste, money, did, curb, appetite, make...
13                                  [problems, watch, eat]
14       [ditto, rebound, sleepless, discontinued, done...
15       [doctor, prescribed, provigil, first, diagnose...
16       [smoked, years, took, one, week, think, possib.

In [17]:
reviews = tokenized.apply(lem)

In [18]:
reviews[24]

['this',
 'drug',
 'pretty',
 'amazing',
 'hyperhydrosis',
 'palm',
 'sol',
 'underarms',
 'since',
 'hitting',
 'puberty',
 'skeptic',
 'since',
 'previously',
 'tried',
 'botox',
 'iontophoresis',
 'drysol',
 'limited',
 'result',
 'however',
 'three',
 'day',
 'taking',
 'oxybutynin',
 'day',
 'sweat',
 'free',
 'negative',
 'thing',
 'say',
 'drug',
 'side',
 'effect',
 'gotten',
 'fairly',
 'dizzy',
 'spaced',
 'time',
 'experience',
 'dry',
 'mouth',
 'pretty',
 'often',
 'however',
 'bad',
 'comparison',
 'acne',
 'gotten',
 'used',
 'get',
 'pimple',
 'around',
 'month',
 'however',
 'one',
 'lasting',
 'wks',
 'varying',
 'size',
 'colour',
 'constantly',
 'group',
 'acne',
 'face',
 'doe',
 'anyone',
 'remedy',
 'tried',
 'acne',
 'mask',
 'cleanser',
 'tea',
 'tree',
 'oil']

In [19]:
vocabulary = {}

In [20]:
for i,r in enumerate(reviews, start=0):
        for j,w in enumerate(r , start=0):
            if w not in vocabulary:
                vocabulary[w] = [1,{i:[j]}]
            else:
                if i not in vocabulary[w][1]:
                    vocabulary[w][0] += 1
                    vocabulary[w][1][i] = [j]
                else:
                    if j not in vocabulary[w][1][i]:
                        vocabulary[w][1][i].append(j)

N = np.float64(data.shape[0])                    

In [21]:
for w in vocabulary.keys():
    pl = {}
    for i in vocabulary[w][1].keys():
        tf = (len(vocabulary[w][1][i])/len(reviews[i]))
        weight_i = (1 + np.log10(tf)) * np.log10(N/vocabulary[w][0])
        pl[i] = weight_i
    vocabulary[w].append(pl)
p = open('drugVocab.pickle',"wb")
pickle.dump(vocabulary,p)

In [22]:
tokenized = pd.Series(tokenized)

In [23]:
data['vector']=tokenized

In [24]:
data


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,vector
0,163740,Mirtazapine,Depression,"""I've tried a few antidepressants over the yea...",10,28-Feb-12,22,"[tried, antidepressants, years, citalopram, fl..."
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn's disease and has done very ...",8,17-May-09,17,"[son, crohn, disease, done, well, asacol, comp..."
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3,"[quick, reduction, symptoms]"
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35,"[contrave, combines, drugs, used, alcohol, smo..."
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4,"[birth, control, one, cycle, after, reading, r..."
5,208087,Zyclara,Keratosis,"""4 days in on first 2 weeks. Using on arms an...",4,3-Jul-14,13,"[days, first, weeks, using, arms, face, put, v..."
6,215892,Copper,Birth Control,"""I've had the copper coil for about 3 months n...",6,6-Jun-16,1,"[copper, coil, months, really, excited, though..."
7,169852,Amitriptyline,Migraine Prevention,"""This has been great for me. I've been on it f...",9,21-Apr-09,32,"[this, great, weeks, last, week, headaches, we..."
8,23295,Methadone,Opiate Withdrawal,"""Ive been on Methadone for over ten years and ...",7,18-Oct-16,21,"[ive, methadone, ten, years, currently, trying..."
9,71428,Levora,Birth Control,"""I was on this pill for almost two years. It d...",2,16-Apr-11,3,"[pill, almost, two, years, work, far, getting,..."


In [25]:
data.to_csv("fulldata.csv")

In [53]:
def topk(query):
    data = pd.read_csv('fulldata.csv', index_col='Unnamed: 0')
    p = open('drugVocab.pickle',"rb")
    vocabulary = pickle.load(p)

    q = query.replace("[^a-zA-Z]", " ").lower()
    q_vec = rm_stopword(q.split())
    q_vect = lem(q_vec.split())
    
    srtdpl = {}
    qw = {}
    for w in q_vect:
        if w in vocabulary.keys():
            if w not in srtdpl.keys():
                srtdpl[w] = sorted(vocabulary[w][2].items(), key=lambda x:x[1], reverse=True)[:10]
        if w not in qw:
            qw[w] = [1,(1/len(q_vect))]
        elif w in qw:
            qw[w][0] += 1
            qw[w][1] = (qw[w][0]/len(q_vect))
    if srtdpl == {}:
        return "No results found"
    
    topk = []
    N = data.shape[0]
    for i in range(N):
        count = 0
        sd = 0
        for w in q_vect:
            for (di,wt) in srtdpl[w]:
                if di == i: count += 1
        if count > 0 and count == len(q_vect):
            for w in q_vect:
                l = [x for x in srtdpl[w] if x[0] == i]
                sd += l[0][1] * qw[w][1]
            topk.append((i,sd))
        elif count > 0 and count < len(q_vec):
            for w in q_vect:
                l = srtdpl[w][9]
                sd += l[1] * qw[w][1]
            topk.append((i,sd))    
            
    out = [x for x in sorted(topk, key=lambda i:i[1], reverse=True)]        
    
 
 
    return out

In [54]:
query = "I have a headache"

In [55]:
res = topk(query)

In [74]:
  
print(res,"\n")
out = []
for (ind,s) in res:
    out.append( [data.loc[data.index[ind], 'drugName'], data.loc[data.index[ind], 'usefulCount'], data.loc[data.index[ind], 'condition'], data.loc[data.index[ind], 'rating'], data.loc[data.index[ind], 'review'], s*100])
#pd.set_option('display.max_columns', -1)  
#pd.set_option('display.expand_frame_repr', False)
#pd.set_option('max_colwidth', -1)
s = pd.Series(out) 
#show =  pd.DataFrame(s, columns=['Drug Name','Useful count','Condition','Rating(/10)','Review','Similarity%'])
print(s)

[(2690, 0.7694920812931749), (8903, 0.7694920812931749), (44137, 0.7694920812931749), (4591, 0.5756342210306166), (24579, 0.5756342210306166), (31922, 0.5756342210306166), (37816, 0.5756342210306166), (51167, 0.5756342210306166), (448, 0.43808988025709883), (10820, 0.43808988025709883)] 

0    [Tizanidine, 10, Cluster Headaches, 5, "It works for headaches", 76.9492081293175]                                                                                           
1    [Lactulose, 12, nan, 10, "Headache, flatulence", 76.9492081293175]                                                                                                           
2    [Ibuprofen, 4, Headache, 3, "didn't do much for my headaches", 76.9492081293175]                                                                                             
3    [Gabapentin, 51, ibromyalgia, 4, "Side effect for me is headaches.", 57.563422103061654]                                                                                

In [75]:
pd.DataFrame(out, columns=['Drug Name','Useful count','Condition','Rating(/10)','Review','Similarity%'])

IndexError: index 6 is out of bounds for axis 0 with size 6

KeyError: 0

In [63]:
out =  pd.DataFrame(out, columns=['Drug Name','Useful count','Condition','Rating(/10)','Review','Similarity%'])

In [64]:
out

IndexError: index 6 is out of bounds for axis 0 with size 6

KeyError: 0