In [None]:
import random
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:
# Some hyperparameters to be used later
# Fixed: `120,000` is parsed by Python as the tuple (120, 0); the digit-group
# separator inside an integer literal is the underscore.
max_features = 120_000  # How many unique words we are going to use
embed_size = 300  # Width of each word vector
max_len = 750  # Length (in tokens) that each review is padded/truncated to
batch_size = 512  # Mini-batch size handed to the data loaders
n_epochs = 5  # Number of passes over the training set
n_splits = 5  # Not referenced in the visible cells
seed = 10  # Not referenced in the visible cells
debug = 0  # Truthy -> use a random embedding matrix instead of loading GloVe


In [None]:
# Read both CSV splits and merge them into one frame holding only the two
# columns this notebook works with.
# Fixed: the first path was missing its closing quote and `test _df` contained
# a stray space, so the cell could not be parsed.
train_df = pd.read_csv("../input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv")
test_df = pd.read_csv("../input/kuc-hackathon-winter-2018/drugsComTest_raw.csv")
# ignore_index=True gives the merged frame a fresh 0..n-1 index instead of
# repeating the row labels of the two source frames.
data = pd.concat([train_df, test_df], ignore_index=True)[['review', 'condition']]

In [None]:
# Removing all the rows whose review is null.
# .copy() detaches the filtered frame from the original so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
data = data[pd.notnull(data['review'])].copy()
# Creating a length variable (number of characters in each review).
# The column name 'lenght' is kept unchanged in case other cells read it.
data['lenght'] = data['review'].apply(len)


In [None]:
# Preprocessing our Target column
# We are going to select all the conditions which have more than 3000 reviews
count_df = (
    data[['condition', 'review']]
    .groupby('condition')
    .aggregate({'review': 'count'})
    .reset_index()
    .sort_values('review', ascending=False)
)
# Fixed: the boolean mask was missing its closing bracket before the
# ['condition'] column selection.
target_conditions = count_df[count_df['review'] > 3000]['condition'].values
# Last expression of the cell so the notebook actually displays it.
count_df.head()

In [None]:
# Since we already have selected the conditions with the 3000 and above ,we can then filter our dataframe

def condition_parser(x):
    """Return `x` when it is one of the target conditions, otherwise "OTHER".

    Membership is tested against the module-level `target_conditions`.
    """
    return x if x in target_conditions else "OTHER"
# Filtering our conditions column: every label outside the target conditions
# is first rewritten to "OTHER" ...
data['condition'] = data['condition'].apply(condition_parser)
# ... and those rows are then dropped.
# Fixed: the comparison used '"OTHER"' (with embedded double quotes), which
# never equals the value condition_parser returns, so no row was removed.
# .copy() keeps later column assignments from warning about writing to a slice.
data = data[data['condition'] != 'OTHER'].copy()
# The values returned will have only the target conditions

# TEXT PREPROCESSING
In this section I will clean the data by removing special-character patterns, masking numbers, and expanding contractions.

In [None]:
# Function to clean the text
def clean_text(x):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        x: The text to clean.

    Returns:
        The text with all other characters deleted.
    """
    # Fixed: the class used 'A-z', a range that also covers [ \ ] ^ _ and the
    # backtick, so those characters survived the cleaning.
    pattern = r'[^a-zA-Z0-9\s]'
    # Fixed: the original called `text.re.sub` and never returned the result.
    return re.sub(pattern, "", x)
def clean_numbers(x):
    """Mask runs of digits with '#' characters.

    Runs of five or more digits become '#####'; runs of four, three and two
    digits become the same number of '#'. Single digits are left unchanged.

    Args:
        x: The text to process.

    Returns:
        The masked text; text without digits is returned unchanged.
    """
    # Fixed: the condition had an unbalanced ')' and the `return` sat inside
    # the `if`, so any text without digits came back as None.
    if re.search(r'\d', x):
        # Longest runs first, so a long number is not split into shorter masks.
        x = re.sub('[0-9]{5,}', "#####", x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x
# Contractions: a dictionary mapping each contraction to its expanded form; its keys are joined into a regular expression that finds the matches
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re
# Build the lookup table and the compiled pattern once, at cell execution time.
contractions, contractions_re = _get_contractions(contraction_dict)
def replace_contractions(text):
    """Expand every contraction found in `text`.

    Args:
        text: The text to process.

    Returns:
        The text with each match of `contractions_re` replaced by its entry
        in `contractions`.
    """
    def replace(match):
        # The matched substring is itself a key of the contractions mapping.
        return contractions[match.group(0)]
    # Fixed: this line was truncated ("tex" and no closing parenthesis).
    return contractions_re.sub(replace, text)

In [None]:
# A function that preprocesses the text for us
def preprocess_text(text):
    """Run the full cleaning pipeline on one review.

    Steps, in order: lower-case, expand contractions, strip special
    characters, mask digit runs.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text.
    """
    text = text.lower()
    # Fixed ordering: contractions must be expanded BEFORE clean_text runs,
    # because clean_text deletes apostrophes and every key of the contraction
    # dictionary contains one - with the old order nothing could ever match.
    text = replace_contractions(text)
    text = clean_text(text)
    # Digits are masked last: clean_text would delete the '#' mask characters.
    text = clean_numbers(text)
    return text
# Clean every review with the pipeline defined in preprocess_text.
data['review'] = data['review'].apply(preprocess_text)

In [None]:
# Splitting our data into training and testing sets and then tokenizing them
X = data['review']
y = data['condition']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
# Tokenizing the X values; the vocabulary is fitted on the training reviews only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
# Fixed: the Keras method is `texts_to_sequences`; `tokenizer.texts.to_sequence`
# does not exist.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# Pad/truncate every sequence to exactly max_len token ids
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)


# Transforming the Y values into integer class ids
# (LabelEncoder is already imported in the first cell, so the duplicate
# mid-notebook import was dropped.)
le = LabelEncoder()
y_train = le.fit_transform(y_train.values)
# Fixed: the test labels must be encoded with the mapping learned on the
# training labels. Calling fit_transform again refits the encoder, which can
# assign different ids to the same condition and overwrites le.classes_.
y_test = le.transform(y_test.values)

# LOADING GLOVE EMBEDDINGS

In [None]:
def load_glove(word_index, embedding_file='../input/glove840b300dtxt/glove.840B.300d.txt', max_words=None):
    """Build an embedding matrix for `word_index` from a GloVe text file.

    Each line of the file is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        word_index: Mapping from word to integer row index.
        embedding_file: Path of the GloVe text file. Defaults to the Kaggle
            dataset path the notebook originally hard-coded.
        max_words: Upper bound on the number of rows. Defaults to the
            notebook-level `max_features`.

    Returns:
        A NumPy array of shape `(min(max_words, len(word_index) + 1), dim)`.
        Rows of words found in the file (as-is, or capitalized as a fallback)
        hold the pretrained vector; all other rows keep random normal values.

    Raises:
        ValueError: If the embedding file contains no vectors.
    """
    # I will load the glove pretrained model from Kaggle
    if max_words is None:
        max_words = max_features

    def get_coefs_(word, *arr):
        return word, np.asarray(arr, dtype='float32')[:300]

    # `with` closes the file handle, which the original left open.
    with open(embedding_file, encoding='utf-8') as glove_file:
        embeddings_index = dict(get_coefs_(*o.rstrip().split(" ")) for o in glove_file)
    if not embeddings_index:
        raise ValueError("no embeddings found in %s" % embedding_file)

    # The vector width is read from one entry instead of stacking every vector
    # into a second full-size array just to look at its shape.
    embed_size = len(next(iter(embeddings_index.values())))
    emb_mean, emb_std = -0.005838499, 0.48782197
    nb_words = min(max_words, len(word_index) + 1)
    # Random initialisation so that words missing from the file still get a row.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Skip indices that fall outside the matrix.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalized spelling of the word.
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            # Fixed: the original wrote `embedding_matrix (i) = ...`, which is
            # a syntax error; rows are assigned with square-bracket indexing.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
            
    

In [None]:
# With `debug` set, a random 120000 x 300 matrix stands in for the pretrained
# vectors; otherwise the matrix is built by load_glove from the tokenizer's
# vocabulary.
embedding_matrix = np.random.randn(120000,300) if debug else load_glove(tokenizer.word_index)

# BILSTM MODEL 

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM text classifier on top of frozen pretrained embeddings.

    Pipeline: embedding lookup -> bidirectional LSTM -> mean-pool and max-pool
    over dimension 1 of the LSTM output, concatenated -> Linear + ReLU +
    Dropout -> Linear producing one logit per class.
    """

    def __init__(self, pretrained_embeddings=None, n_classes=None):
        """Create the layers.

        Args:
            pretrained_embeddings: 2-D array of word vectors, one row per
                token id. Defaults to the notebook-level `embedding_matrix`.
            n_classes: Number of output classes. Defaults to
                `len(le.classes_)` of the notebook-level label encoder.
        """
        super().__init__()
        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix
        if n_classes is None:
            n_classes = len(le.classes_)
        self.hidden_size = 64
        drp = 0.1
        weights = torch.tensor(pretrained_embeddings, dtype=torch.float32)
        # Fixed: the original set `requires_grad = False` on the Embedding
        # module itself, which is just a new Python attribute and leaves the
        # weight trainable. freeze=True sets it on the weight parameter.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        # Fixed: the keyword is `bidirectional`, not `bidirection`.
        self.lstm = nn.LSTM(weights.shape[1], self.hidden_size, bidirectional=True, batch_first=True)
        # 4 * hidden: two directions, times two pooled summaries (mean and max).
        self.linear = nn.Linear(self.hidden_size * 4, 64)
        # Fixed: the class is `nn.ReLU`, not `nn.Relu`.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drp)
        self.fc1 = nn.Linear(64, n_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences `x`."""
        # Fixed: attribute is `self.embedding`; the original read `self.embeddings`.
        h_embeddings = self.embedding(x)
        # Fixed: the result was bound to `h_ltsm` but then read as `h_lstm`.
        h_lstm, _ = self.lstm(h_embeddings)
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        # Fixed: missing comma between the tuple and the dim argument.
        concatenate = torch.cat((avg_pool, max_pool), 1)
        concatenate = self.relu(self.linear(concatenate))
        concatenate = self.dropout(concatenate)
        output = self.fc1(concatenate)
        return output

In [None]:
# MODEL TRAINING AND EVALUATION
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on `.cuda()` on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTM()
model.to(device)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Only parameters that require gradients are handed to the optimizer.
# Fixed: the method is `parameters()`, not `parametres()`.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)

# Move the padded token ids and the encoded labels onto the device.
# torch.as_tensor also accepts an existing tensor, so re-running this cell
# does not fail or warn.
X_train = torch.as_tensor(X_train, dtype=torch.long, device=device)
X_test = torch.as_tensor(X_test, dtype=torch.long, device=device)
X_cv = X_test
y_cv = torch.as_tensor(y_test, dtype=torch.long, device=device)
# Fixed: TensorDataset only accepts tensors, but the training labels were
# passed as a NumPy array. y_train / y_test themselves stay NumPy arrays.
y_train_t = torch.as_tensor(y_train, dtype=torch.long, device=device)

# CREATING A DATASET AND DATALOADER
# Fixed: the classes are `torch.utils.data.TensorDataset` and
# `torch.utils.data.DataLoader`; `Tensor_Dataset`, `Tensor_Dataloader` and
# `torch.utils.Dataloader` do not exist.
train = torch.utils.data.TensorDataset(X_train, y_train_t)
valid = torch.utils.data.TensorDataset(X_cv, y_cv)
# creating a dataloader
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)




# Training and Testing Loop

In [None]:
# Per-epoch history of the average (summed-per-batch) losses
train_loss = []
valid_loss = []
for epoch in range(n_epochs):
    # ---- training pass ----
    model.train()
    avg_loss = 0.0
    # Fixed: `x.batch` / `y.batch` are attribute targets on undefined names;
    # the loop variables are plain identifiers.
    for x_batch, y_batch in train_loader:
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass ----
    # Fixed: validation used to be nested inside the training batch loop, so
    # it ran after every single training batch; it now runs once per epoch.
    model.eval()
    avg_val_loss = 0.0
    with torch.inference_mode():
        for x_batch, y_batch in valid_loader:
            y_pred = model(x_batch)
            # Fixed: the batch loss was assigned to `valid_loss`, replacing
            # the history list with a tensor before `.append` was called.
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

    # Store plain floats (not tensors) so the history holds no graph or device memory.
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    # Fixed: the f-string was unterminated and printed the whole history
    # lists; one summary line per epoch is printed instead.
    print(f"Epoch{epoch}|train_loss:{avg_loss:.4f}|valid_loss:{avg_val_loss:.4f}")

# PREDICTIVE MODEL

In [None]:
def predictor(text):
    """Predict the condition label for a single raw review.

    Args:
        text: The raw review text.

    Returns:
        The entry of `le.classes_` with the highest predicted probability.
    """
    text = preprocess_text(text)
    # Fixed: the method is `texts_to_sequences`, and it expects a list of
    # texts - a bare string would be iterated character by character.
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, maxlen=max_len)
    # Place the input on whatever device the model lives on instead of
    # hard-coding `.cuda()`.
    device = next(model.parameters()).device
    batch = torch.tensor(padded, dtype=torch.long, device=device)
    # Inference only: switch dropout off and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(batch)
    # Fixed: softmax needs an explicit dim; dim=1 is the class dimension.
    probs = F.softmax(logits, dim=1).cpu().numpy()
    pred = probs.argmax(axis=1)
    pred = le.classes_[pred]
    return pred[0]