In [79]:
import pandas as pd
# preparing dataframes out of train and test csv
df_train = pd.read_csv("../dataset/train.csv", parse_dates=["drug_approved_by_UIC"])
df_test = pd.read_csv("../dataset/test.csv", parse_dates=["drug_approved_by_UIC"])

In [80]:
# cleaning review text 

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


#list of stop words in 
stops = set(stopwords.words('english'))
#removing words which are not stop words from "stops" (key elements in emotional analysis)
not_stop = ["aren't","couldn't","didn't","doesn't","don't",
            "hadn't","hasn't","haven't","isn't","mightn't",
            "mustn't","needn't","no","nor","not","shan't",
            "shouldn't","wasn't","weren't","wouldn't"]

for i in not_stop:
    stops.remove(i)

import re

stemmer = SnowballStemmer('english')
def review_to_words(raw_review):
    '''
    method that returns cleaned review after excluding all stop words.
    '''
    # 1. Delete HTML 
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Make a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. lower letters
    words = letters_only.lower().split()
    # 5. Stopwords 
    meaningful_words = [w for w in words if not w in stops]
    # 6. Stemming
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # 7. space join words
    return( ' '.join(stemming_words))

# stemmed review added to dataframe
%time df_train['review_clean'] = df_train['review_by_patient'].apply(review_to_words)
%time df_test['review_clean'] = df_test['review_by_patient'].apply(review_to_words)


#predicting sentiments from review clean in both datasets.

from textblob import TextBlob # for using NLP functionality
from tqdm import tqdm
reviews_train = df_train['review_clean']
reviews_test = df_test['review_clean']

Predict_Sentiment = []
for review in tqdm(reviews_train):
    blob = TextBlob(review)
    '''
    if the polarity is >0, it is considered positive, 
    <0 -is considered negative and ==0 is considered neutral.
    '''
    Predict_Sentiment += [blob.sentiment.polarity]
df_train["Predict_Sentiment"] = Predict_Sentiment
Predict_Sentiment = []
for review in tqdm(reviews_test):
    blob = TextBlob(review)
    '''
    if the polarity is >0, it is considered positive, 
    <0 -is considered negative and ==0 is considered neutral.
    '''
    Predict_Sentiment += [blob.sentiment.polarity]
df_test["Predict_Sentiment"] = Predict_Sentiment


from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df_train["Predict_Sentiment_compound"] = df_train["review_clean"].apply(lambda x: sid.polarity_scores(x))
df_test["Predict_Sentiment_compound"] = df_test["review_clean"].apply(lambda x: sid.polarity_scores(x))


CPU times: user 36.4 s, sys: 575 ms, total: 37 s
Wall time: 40.5 s


  0%|          | 0/32165 [00:00<?, ?it/s]

CPU times: user 14.9 s, sys: 303 ms, total: 15.2 s
Wall time: 22.4 s


100%|██████████| 32165/32165 [00:22<00:00, 1449.16it/s]
100%|██████████| 10760/10760 [00:09<00:00, 1114.12it/s]


In [81]:
df_train.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,review_clean,Predict_Sentiment,Predict_Sentiment_compound
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,8.022969,no side effect take combin bystol mg fish oil,0.0,"{'neg': 0.216, 'neu': 0.784, 'pos': 0.0, 'comp..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,7.858458,son halfway fourth week intuniv becam concern ...,0.114583,"{'neg': 0.056, 'neu': 0.823, 'pos': 0.121, 'co..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,6.341969,use take anoth oral contracept pill cycl happi...,0.105,"{'neg': 0.03, 'neu': 0.926, 'pos': 0.044, 'com..."
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,6.590176,suboxon complet turn life around feel healthie...,0.147037,"{'neg': 0.073, 'neu': 0.707, 'pos': 0.219, 'co..."
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,2015-11-28,43,6.144782,nd day mg start work rock hard erect howev exp...,-0.013889,"{'neg': 0.196, 'neu': 0.702, 'pos': 0.103, 'co..."


In [82]:
df_train = pd.concat([df_train.drop(['Predict_Sentiment_compound'], axis=1), df_train['Predict_Sentiment_compound'].apply(pd.Series)], axis=1)
df_test = pd.concat([df_test.drop(['Predict_Sentiment_compound'], axis=1), df_test['Predict_Sentiment_compound'].apply(pd.Series)], axis=1)



In [83]:
df_train.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,review_clean,Predict_Sentiment,neg,neu,pos,compound
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,8.022969,no side effect take combin bystol mg fish oil,0.0,0.216,0.784,0.0,-0.296
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,7.858458,son halfway fourth week intuniv becam concern ...,0.114583,0.056,0.823,0.121,0.6929
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,6.341969,use take anoth oral contracept pill cycl happi...,0.105,0.03,0.926,0.044,0.2732
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,6.590176,suboxon complet turn life around feel healthie...,0.147037,0.073,0.707,0.219,0.8934
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,2015-11-28,43,6.144782,nd day mg start work rock hard erect howev exp...,-0.013889,0.196,0.702,0.103,-0.4881


In [135]:
# # t_min = -1
# # t_max = 1

# # r_min = min(df_train['effectiveness_rating'])
# # r_max = max(df_train['effectiveness_rating'])

# r_min = min(df_train['number_of_times_prescribed'])
# r_max = max(df_train['number_of_times_prescribed'])

# denom = r_max - r_min
# numer = t_max - t_min
# def normalizing_param(val):
#     return ((abs(int(val)-r_min)/denom)*2)-1
feats = ['effectiveness_rating', 'number_of_times_prescribed','Predict_Sentiment']
train_stats = df_train[feats].describe()
# train_stats.pop("base_score")
train_stats = train_stats.transpose()
train_stats

def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
    
    

In [136]:
train_stats["mean"]

effectiveness_rating           6.998912
number_of_times_prescribed    27.893207
Predict_Sentiment              0.080379
Name: mean, dtype: float64

In [137]:
df_train[:2]

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,review_clean,Predict_Sentiment,neg,neu,pos,compound
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,8.022969,no side effect take combin bystol mg fish oil,0.0,0.216,0.784,0.0,-0.296
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,7.858458,son halfway fourth week intuniv becam concern ...,0.114583,0.056,0.823,0.121,0.6929


In [138]:
target = df_train['base_score']
# feats = ['effectiveness_rating', 'number_of_times_prescribed', 'Predict_Sentiment']
# df_train[feats].head()
normed_train_data = norm(df_train[feats])
normed_test_data = norm(df_test[feats])

# normalized_predict_sentiment_train = df_train['Predict_Sentiment'].apply(normalizing_param)
# normalized_predict_sentiment_test = df_test['Predict_Sentiment'].apply(normalizing_param)

# df_train['Predict_Sentiment_normalized'] = normalized_predict_sentiment_train
# df_test['Predict_Sentiment_normalized'] = normalized_predict_sentiment_test



In [102]:
from sklearn.model_selection import train_test_split
trn_x, val_x, trn_y, val_y = train_test_split(df_train[feats], target, test_size=0.2, random_state=42)

In [139]:
normed_train_data.head()

Unnamed: 0,effectiveness_rating,number_of_times_prescribed,Predict_Sentiment
0,0.61106,-0.024937,-0.358732
1,0.305696,4.581553,0.152656
2,-0.610395,-0.304118,0.109886
3,0.61106,0.254245,0.297498
4,-1.526487,0.421753,-0.420719


In [181]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

def build_model():
    model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=[len(df_train[feats].keys())]),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
    ])
    
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        0.001,
        decay_steps=30000, #NEEDS-ATTENTION hard coding
        decay_rate=1,
        staircase=False)
    
    optimizer = tf.keras.optimizers.Adam(lr_schedule)

    model.compile(loss='mse',
                    optimizer=optimizer,
                    metrics=['mae', 'mse'])
    return model

In [183]:
model = build_model()

In [184]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 32)                128       
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 32)                1056      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 33        
Total params: 2,273
Trainable params: 2,273
Non-trainable params: 0
___________________________________________________

In [185]:
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result

array([[ 0.05147673],
       [-0.29948655],
       [-0.02644172],
       [ 0.05154605],
       [-0.11150815],
       [-0.01120074],
       [ 0.04697345],
       [ 0.02039583],
       [ 0.07668667],
       [ 0.0143289 ]], dtype=float32)

In [186]:

train_labels = df_train['base_score']

In [187]:
train_labels

0        8.022969
1        7.858458
2        6.341969
3        6.590176
4        6.144782
           ...   
32160    6.963020
32161    0.899076
32162    6.241812
32163    7.940428
32164    8.205393
Name: base_score, Length: 32165, dtype: float64

In [190]:
# The patience parameter is the amount of epochs to check for improvement
EPOCHS = 5000
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100)
early_history = model.fit(normed_train_data, train_labels, 
                    epochs=EPOCHS, validation_split = 0.3, verbose=0, 
                    callbacks=[early_stop, tfdocs.modeling.EpochDots()])


Epoch: 0, loss:1.5042,  mae:1.0418,  mse:1.5042,  val_loss:1.6606,  val_mae:1.0929,  val_mse:1.6606,  
....................................................................................................
Epoch: 100, loss:1.4872,  mae:1.0372,  mse:1.4872,  val_loss:1.6288,  val_mae:1.0839,  val_mse:1.6288,  
....................................................................................................
Epoch: 200, loss:1.4865,  mae:1.0380,  mse:1.4865,  val_loss:1.6507,  val_mae:1.0907,  val_mse:1.6507,  
..................................................................

In [175]:
test_predictions = model.predict(normed_test_data).flatten()

In [176]:
len(test_predictions)

10760

In [177]:
submission_df = pd.DataFrame()
submission_df['patient_id'] = df_test['patient_id']
submission_df['base_score'] = test_predictions

In [178]:
submission_df.to_csv('../dataset/test_170030.csv', index=False)