In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
## mod

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nihal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
train = pd.read_csv('drugsComTrain_raw.csv')


In [21]:
train.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [22]:
def data_preprocessing(data):

  # Creating a new column for initial sentiment classification
  data.loc[(data['rating'] >= 6), 'Review_Sentiment'] = 1
  data.loc[(data['rating'] < 6), 'Review_Sentiment'] = 0

  # converting the date into datetime format
  data['date'] = pd.to_datetime(data['date'], errors = 'coerce')

  # now extracting year, month and day from date
  data['Year'] = data['date'].dt.year
  data['month'] = data['date'].dt.month
  data['day'] = data['date'].dt.day

  # we will delete the rows so that the data does not overfits
  data = data.dropna(axis = 0)
  data = data.reset_index()

  # removing the conditions with <span> in it
  all_list = set(data.index)
  span_list = []
  for i,j in enumerate(data['condition']):
      if '</span>' in j:
          span_list.append(i)
  new_idx = all_list.difference(set(span_list))
  data = data.iloc[list(new_idx)].reset_index()
  del data['index']

  return data
    


In [23]:
df_train = data_preprocessing(train)

In [24]:

stops = set(stopwords.words('english'))

not_stop = ["aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","isn't","mightn't",
            "mustn't","needn't","no","nor","not","shan't","shouldn't","wasn't","weren't","wouldn't"]
for i in not_stop:
    stops.remove(i)

In [25]:
# Filter df_condition_train_1 based on the drugName
df_condition_train = train.groupby(['condition'])['drugName'].nunique().sort_values(ascending=False)
df_condition_train = pd.DataFrame(df_condition_train).reset_index()

df_condition_train_1 = df_condition_train[df_condition_train['drugName'] == 1].reset_index()

# Create a set of all indices in df_train
all_indices = set(df_train.index)

# Find indices where the condition matches those in df_condition_train_1
condition_indices = set()
for condition in df_condition_train_1['condition']:
    condition_indices.update(df_train[df_train['condition'] == condition].index)

# Identify indices to keep in the new DataFrame
new_indices = all_indices.difference(condition_indices)

# Create a new DataFrame with the selected indices
train_proc = df_train.iloc[list(new_indices)].reset_index(drop=True)


In [26]:
stemmer = SnowballStemmer('english')

def review_to_words(raw_review):
    # 1. Delete HTML
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Make a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. lower letters
    words = letters_only.lower().split()
    # 5. Stopwords
    meaningful_words = [w for w in words if not w in stops]
    # 6. Stemming
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # 7. space join words
    return( ' '.join(stemming_words))

In [27]:
%time df_train['review_clean'] = df_train['review'].apply(review_to_words)



CPU times: total: 42.3 s
Wall time: 1min 39s


In [28]:
count_df1 = df_train[['condition','review']].groupby('condition').aggregate({'review':'count'}).reset_index().sort_values('review',ascending=False)
target_conditions1 = count_df1[count_df1['review']>1000]['condition'].values

In [29]:
def condition_parser1(x):
    if x in target_conditions1:
        return x
    else:
        return "OTHER"

df_train['condition'] = df_train['condition'].apply(lambda x: condition_parser1(x))

In [30]:
df_train.head()

Unnamed: 0,level_0,uniqueID,drugName,condition,review,rating,date,usefulCount,Review_Sentiment,Year,month,day,review_clean
0,0,206461,Valsartan,OTHER,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,1.0,2012,5,20,no side effect take combin bystol mg fish oil
1,1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,1.0,2010,4,27,son halfway fourth week intuniv becam concern ...
2,2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,0.0,2009,12,14,use take anoth oral contracept pill cycl happi...
3,3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,2015-11-03,10,1.0,2015,11,3,first time use form birth control glad went pa...
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,1.0,2016,11,27,suboxon complet turn life around feel healthie...
