In [2]:
import os
import re
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

'''from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
'''
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the Yelp business table (one JSON record per line). Missing values become the string
# 'NA'; without this, rows with a missing `categories` would yield NaN in the mask below.
yelp_business = pd.read_json('../data/yelp_dataset/business.json', lines=True).fillna('NA')

# Keep only businesses whose category list mentions "Restaurants".
restaurants = yelp_business.loc[yelp_business['categories'].str.contains('Restaurants')]

print('Number of all businesses: ', len(yelp_business))
print(f"Shape of restaurants dataset{restaurants.shape}")

In [None]:
yelp_business.head()

In [None]:
restaurants.head()

Now we bring in the reviews and perform some preprocessing on them.

In [None]:
yelp_review_iter = pd.read_json('../data/yelp_dataset/review.json', chunksize=100000, lines=True)


Because the review file is too big to load at once, we read it in chunks and keep only the reviews of the businesses we filtered earlier. Note that here we stop after 70 chunks, but we could have chosen any number (larger numbers will give a MemoryError later on).

In [None]:
# Stream the review file chunk by chunk, keeping only reviews of the restaurants selected above.
# The filtered chunks are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies every previously kept row on each iteration.
MAX_CHUNKS = 70  # stop early so the filtered reviews still fit in memory
restaurant_ids = set(restaurants['business_id'])
kept_chunks = []
for i, df in enumerate(yelp_review_iter, start=1):
    df = df[df['business_id'].isin(restaurant_ids)]
    print(df.shape)
    kept_chunks.append(df)
    print(i)
    if i == MAX_CHUNKS: break

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read.
yelp_review = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame()

In [None]:
yelp_review.shape

Also make sure we only get businesses that already show up in our review list and delete the rest.

In [None]:
import pickle
yelp_review.to_pickle("pickled_reviews.pickle")


In [None]:
rest_reviews =pd.read_pickle("pickled_reviews.pickle")

In [None]:
yelp_business = yelp_business[yelp_business['business_id'].isin(rest_reviews['business_id'])]

In [None]:
print('Final businesses shape: ', yelp_business.shape)
print('Final review shape: ', rest_reviews.shape)

In [None]:
rest_reviews.head()

In [None]:
rest_reviews['funny'].describe()

In [None]:
pd.options.display.max_seq_items = 2000
print(rest_reviews[rest_reviews['funny']==1290][['business_id', 'text']])

In [None]:
rest_reviews.loc[1331304,'text']

In [None]:
# Check:
print( (rest_reviews['funny']>4).mean())
print(f"Number of funny reviews:{(rest_reviews['funny']>4).sum()}")
#print(rest_reviews['fun_bin'].mean())

In [None]:
75269/4201684

In [None]:
# Binary target: 1 when a review received more than 4 "funny" votes, else 0.
# The vectorized comparison replaces a Python-level lambda called once per row.
rest_reviews['fun_bin'] = (rest_reviews['funny'] > 4).astype(int)

In [None]:
print(rest_reviews['fun_bin'].mean())

#### Getting a df with funny reviews

In [None]:
rest_reviews_fun = rest_reviews[rest_reviews['fun_bin']==1]

In [None]:
rest_reviews_fun.shape

In [None]:
# Drop repeated review texts. Reassigning (instead of inplace=True) avoids mutating a slice of
# `rest_reviews`, which is what triggers pandas' SettingWithCopyWarning.
rest_reviews_fun = rest_reviews_fun.drop_duplicates(subset='text')

In [None]:
rest_reviews_fun.shape

### Sampling not funny reviews


In [None]:
rest_reviews_not_fun = rest_reviews[rest_reviews['fun_bin']==0]

In [None]:
idx = rest_reviews_not_fun.index.values

In [None]:
len(idx)

In [None]:
# Balancing the classes: getting the same number of not funny reviews as funny.
# replace=False keeps every sampled review distinct (the default samples WITH replacement, which
# would put repeated rows into the balanced set); the fixed seed makes the sample reproducible.
sampler = np.random.RandomState(42)
random_idx = sampler.choice(idx, size=rest_reviews_fun.shape[0], replace=False)

In [None]:
len(random_idx)

In [None]:
rest_reviews_not_fun = rest_reviews_not_fun.loc[random_idx,:].copy()

In [None]:
rest_reviews_not_fun.shape

In [None]:
reviews_final = pd.concat([rest_reviews_fun, rest_reviews_not_fun])

In [None]:
reviews_final.shape

In [None]:
reviews_final.to_csv("../data/yelp_dataset/balanced_reviews.csv")

### Reading the reviews

In [3]:
reviews_final = pd.read_csv("../input/yelp-reviews/balanced_reviews.csv")

In [4]:
reviews_final.head()

Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,fun_bin
0,17,cHdJXLlKNWixBXpDwEGb_A,1,2015-04-01 16:30:00,7,6BnQwlxRn7ZuWdzninM9sQ,3,I love chinese food and I love mexican food. W...,1,JSrP-dUmLlwZiI7Dp3PQ2A,1
1,21,Mem13A3C202RzT53npn4NA,9,2017-05-13 10:41:43,6,IPw8yWiyqnfBzzWmypUHgg,5,If you are looking for the best pierogies in P...,9,5JVY32_bmTBfIGpCCsnAfw,1
2,62,SU56w479vUfFHsvmvQIf7A,6,2016-07-25 03:55:20,5,E4LqIZ7DJd_R4ZHSNKx4RQ,4,So good! They didn't make it to 5 stars due to...,7,DoRCeCcJbrsM2BiAKj3trA,1
3,126,tjAeaGdxf7I4xN9M7wGJNQ,4,2014-07-13 14:32:56,5,TaoaX7MqCujFRNaJBns2Sw,5,While the prices are a bit high for a make-you...,8,x37OyP--VEFE5p-xreplYA,1
4,246,FhIeCF6QrsLaRvAeu0oEPQ,4,2013-06-24 06:42:29,5,3Qc49B7dA0ONmCxrn5iwCQ,2,OVERALL: The food isn't good (I explain below)...,13,2k8OVAPxlXHsA5X6EIoQpQ,1


In [5]:
# Work on an independent copy of the three columns we need, so that columns added to
# df_reviews later do not write into a slice of reviews_final (SettingWithCopyWarning).
df_reviews = reviews_final[['funny','text', 'fun_bin']].copy()

In [6]:
# Show full review texts. None means "no truncation"; the old sentinel -1 is deprecated.
pd.set_option('display.max_colwidth', None)
df_reviews.head()

Unnamed: 0,funny,text,fun_bin
0,7,"I love chinese food and I love mexican food. What can go wrong? A couple of things. First things first, this place is more of a ""rice bowl"" kind of place. I thought it was going to be more diverse as far as the menu goes, but its mainly rice bowls you get with different kinds of meats. The ordering was a little confusing at first, but one of the employees helped us out and I got the 2-item bowl and got the jade chicken and hengrenade chicken with all rice(jerk). I also ordered a jade chicken quesadilla on the side.\n\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a ""C"" grade lol. We waited for about 15 minutes or so and finally got our food. We took it to go and ate at our hotel room. \n\nMmmm... the food was just alright. The jade chicken was nothing special. It tasted like any generic chinese fast food orange chicken/sesame chicken variant. The hengrenade chicken, although was the less spicier version of the jerk chicken, was still pretty spicy for me. Just be warned the jerk chicken is super spicy. If you aren't sure, ask for a sample at the restaurant before ordering, but it was way too spicy for me. \n\nThe jade chicken quesadilla was decent, but nothing special. Just imagine orange chicken in between a tortilla and cheese. A friend of mine ordered a jade chicken burrito and we were confused when we pulled it out of the bag because it was literally the size of Mcdonald's apple pie. If you order the burrito, be warned that it's a burrito for gnomes and smurfs, but he said it was tasty. \n\nThey provide a snicker doodle sugar cookie for each meal and it was decent, again nothing special. \n\nNot gonna lie, the next day my stomach felt like a little mexican dude and chinese dude were wrestling and throwing molotov cocktails inside. I used the bathroom like 5 times. 
I don't recommend eating this place if you have a lot to do the next day.",1
1,6,"If you are looking for the best pierogies in Pittsburgh, this is your place. There are a few small tables outside but most of the business is carry out. Pierogies Plus wins Best Pierogies every year. Why? Because the owner is from Poland and she is making the real deal pierogies. The best part is that they are hand pinched by a group of older Polish and Hungarian women. \nThe biggest seller is potato and cheese but they sell many flavors. They are like plump pillows of softness. You can buy them buy the dozen. You can get them cold to take home and freeze or warm and ready to eat. The warm ones are served with butter and onions. It's definitely a comfort food. The best part is that they ship internationally. Yes, they are that good.",1
2,5,"So good! They didn't make it to 5 stars due to the prices are a bit high for the amount of food and the location is a bit unsavory. \nThe decor and atmosphere was surprisingly nice, from the outside I expected to be more run down inside. The staff was very nice. We were surprised how empty the dining room was for a Friday evening.\nWe got Vegetable Samosas to start then ordered Chicken Tikka Masala, Lamb Rogan Josh, rice and plain Naan. Our only complaint was the lamb could've been more tender but everything was flavorful and delicious. \nI would definitely go again if given the chance.",1
3,5,"While the prices are a bit high for a make-your-own pizza, the taste makes up for it. I love going to Seventh Street market, sitting Not Just Coffee and having a drink while waiting for delicious fresh made pizza from Pure. I've taken this to go as well as eaten inside the market, and I can say that the pizza doesn't do well reheated. So try to eat it fresh while there if possible.\n\nIf one of their specialty pizzas sounds good to you, go for it, as those are definitely a better deal for the amount of toppings you get for the money. I wanted what I wanted, though, so I ended up with a medium, thin crust, regular crust pizza with jalapenos, pepperoncini, onions and feta. It was pretty expensive at $2/topping = $20 med pizza. But it was delicious.\n\nThe arugula salad with goat cheese and lemon vinaigrette is to-die-for. I crave that dressing days later. So light and fresh but flavorful.",1
4,5,"OVERALL: The food isn't good (I explain below), but this place may still be worth locals' time (and more importantly money). Let me explain...\n\nThere are not many ""old"" restaurants in this town. We don't seem to value/frequent/patronize places that have been around putting out food for a long time. I think we should. Even when the food isn't show stopping. Why? This place has tremendous character and charm. There's an ""Old Western Vegas"" feel to Bob Taylor's. Established in 1955, it's the oldest restaurant in Las Vegas. Its a throwback to a rugged, carnivorous cowboy culture that has existed in this town for decades. And still exists. I did appreciate the slice of Vegas kitch that Bob Taylor's offers. \n\nFOOD ISSUES: So with all that charm how could this place go wrong? This place could be great. It really should be great. But they are not putting enough care into the food. I ordered the rib eye and asked for it to be medium rare. I was worried about it being overcooked and figured if a mistake was made, I'd be in the medium range. My instincts were correct. But the steak was closer to well done. In total, three steaks at our table were seriously overcooked. In a steakhouse. With a man tasked with grilling the steaks. Sigh. The fourth steak, smoked prime rib, was cooked properly. But the prime rib is cooked ahead if time, right? How was I prepared for this overcooked piece of meat? How did this only occasional red-meat-eater suspect that my steak would not be treated with attentive care? \n\nI'll tell you. When we walked in there was a large grill at the front of the restaurant with a number if steaks cooking on it. But the chef was not watching the meat. He wasn't even in front of the grill. He was nowhere to be seen as we walked through the doors. And there were at least 4 steaks cooking when we arrived. So I figured that my steak would receive the same lack of attention. \n\nI ordered a simple naked potato and a side salad to accompany my steak. 
Both were fine. But there is not much room to mess up a potato and iceberg, is there? People rave about the garlic bread and I think it's because the rest of the meal is so mediocre, that the cheesy bread becomes the highlight if the meal. It was just OK. The most inexperienced cook could make it at home with sourdough, butter, and three types of cheese. \n\nA few people in our party ordered the mushroom rice side dish and it was not good. After tasting it, I was grateful to have passed on this wet, mush. \n\nSERVICE: Our waitress was very attentive and responsive. She was more than willing to return the overcooked steaks. \n\nI won't be back, but I'm glad to have visited this historic spot.\n\nService: 4 stars\n\nKitch: 4 stars\n\nFood: 1 star",1


### Data pre-processing

#### Goals
- Keep punctuation
- Split by ".", "!" to account for misspelling (like "Hi!I went to...")
- Try TF-IDF?

In [7]:
#!pip install -U spacy[cuda92]

In [8]:
#!pip install spacymoji

In [9]:
import pandas as pd
import numpy as np
#import nltk
#from nltk.corpus import stopwords
#from nltk.stem import SnowballStemmer

from string import punctuation


import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 
#from spacymoji import Emoji

In [10]:

# Matches an HTML line-break tag in any spelling: <br>, <br/>, <BR />, < br >, ...
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline character."""
    return re.sub(re_br, "\n", x)

#nlp = spacy.load("en")
nlp = spacy.load('en_core_web_sm')

# English stop-word set shipped with spaCy; used by remove_stop_words.
# (The original cell assigned this twice; once is enough.)
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Ordered (pattern, replacement) pairs applied by clean_text. Order matters: the specific
# contractions ("what's", "can't") are expanded before the generic "'s" / "n't" rules, and the
# whitespace collapse runs last.
_CLEAN_SUBSTITUTIONS = [(re.compile(_pattern), _replacement) for _pattern, _replacement in (
    # --- contractions ---
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # --- punctuation: commas, slashes and apostrophes are dropped, the rest is spaced out ---
    (r",", " "),
    (r"\.", " . "),    # add space around the dot
    (r"!", " ! "),     # add space around the exclamation sign
    (r":", " :"),      # add space before ':' only, so smileys such as ":)" stay together
    (r";", " ;"),      # add space before ';' only, so smileys such as ";)" stay together
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # --- abbreviations ---
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): in a regex `\0` is the NUL character, so this only rewrites NUL + "s".
    # It looks like a typo for "0s" -> "0"; kept unchanged until the intent is confirmed.
    (r"\0s", "0"),
    # Keep the surrounding spaces: replacing " 9 11 " by "911" glued the neighbouring words on.
    (r" 9 11 ", " 911 "),
    # Word boundaries stop these two from firing inside other words
    # ("the - mail" -> "themail", "raj kept" -> "rajkept").
    (r"\be - mail\b", "email"),
    (r"\bj k\b", "jk"),
    # --- collapse every run of whitespace (including newlines) into a single space ---
    (r"\s{2,}", " "),
)]


def clean_text(text):
    '''Normalize a review and return it as a single cleaned string.

    The input is converted with str() and lower-cased, English contractions are expanded,
    punctuation is either dropped or surrounded by spaces so it survives as separate tokens,
    and runs of whitespace are collapsed. Leading/trailing spaces are not stripped.

    text: the raw review (any object; it is passed through str()).
    Returns the cleaned, lower-cased string.
    '''
    text = str(text).lower()
    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text

# Load the same model name as `nlp` above: the bare 'en' shortcut link is not available in
# spaCy 3, and using two different names for one model is confusing.
my_tok = spacy.load('en_core_web_sm')
#emoji = Emoji(my_tok)
#my_tok.add_pipe(emoji, first=True)
def spacy_tok(x):
    """Clean `x` with clean_text and return its tokens (strings) from spaCy's tokenizer."""
    return [tok.text for tok in my_tok.tokenizer(clean_text(x))]

def remove_stop_words(tokens):
    """Return `tokens` without the entries that are in spaCy's English stop-word set."""
    return [tok for tok in tokens if tok not in spacy_stopwords]

In [11]:
text = "I'm soooo excited!!!!!This is 10000% the best place on earth:))))) 😃..."

In [12]:
text

"I'm soooo excited!!!!!This is 10000% the best place on earth:))))) 😃..."

In [13]:
clean_text(text)

'i am soooo excited ! ! ! ! ! this is 10000% the best place on earth :))))) 😃 . . . '

In [14]:
spacy_tok(clean_text(text))

['i',
 'am',
 'soooo',
 'excited',
 '!',
 '!',
 '!',
 '!',
 '!',
 'this',
 'is',
 '10000',
 '%',
 'the',
 'best',
 'place',
 'on',
 'earth',
 ':',
 ')',
 ')',
 ')',
 ')',
 ')',
 '😃',
 '.',
 '.',
 '.']

In [15]:
text2 = "I also ordered a jade chicken quesadilla on the side.\n\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a 'C' grade lol 😃"




In [16]:
remove_stop_words(spacy_tok(clean_text(text2)))

['ordered',
 'jade',
 'chicken',
 'quesadilla',
 '.',
 'gon',
 'na',
 'admit',
 'place',
 'looks',
 'kinda',
 'dirty',
 '.',
 'think',
 'arizona',
 'uses',
 'health',
 'department',
 'letter',
 'grade',
 'system',
 'like',
 'california',
 'judge',
 'looked',
 'inside',
 'c',
 'grade',
 'lol',
 '😃']

### Building a vocabulary

In [None]:
# Count how often each non-stop-word token occurs across all reviews.
counts = Counter()
n_failed = 0
last_error = None
for sent in df_reviews['text']:
    try:
        counts.update(remove_stop_words(spacy_tok(sent)))
    except Exception as err:
        # Keep going on a bad review, but do not hide it: a bare `except: pass` would also
        # swallow KeyboardInterrupt and silently shrink the vocabulary.
        n_failed += 1
        last_error = err

if n_failed:
    print(f"Skipped {n_failed} reviews that could not be tokenized; last error: {last_error!r}")

In [None]:
counts

### Vocabulary

In [None]:
# Vocabulary: index 0 is the padding entry "" and index 1 is the unknown-word entry "UNK";
# every counted token follows, in the iteration order of `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [None]:
# What is the 95% quantile of the review length (in whitespace-separated words)?
# astype(str) mirrors clean_text, so a missing (NaN) review does not raise on the split, and
# assign() builds a new frame instead of writing into what may be a slice of reviews_final.
df_reviews = df_reviews.assign(len_text=df_reviews['text'].astype(str).str.split().str.len())


In [None]:
df_reviews['len_text'].quantile(0.95)

In [None]:
# note that spacy_tok takes a while run it just once
def encode_sentence(sent, vocab2index, N=500, padding_start=True):
    """Tokenize `sent` and encode it as a fixed-length vector of vocabulary indices.

    sent: raw review text.
    vocab2index: mapping token -> index; must contain the key "UNK", used for unseen tokens.
    N: length of the returned vector; longer reviews keep only their first N tokens.
    padding_start: if True the tokens fill the FRONT of the vector and the zeros follow;
        if False the zeros come first and the tokens sit at the end.

    Returns (encoded, length): an int32 array of size N and the number of tokens kept,
    which is 0 when nothing is left after cleaning and stop-word removal.
    """
    tokens = remove_stop_words(spacy_tok(sent))
    ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    n_kept = min(N, len(ids))
    encoded = np.zeros(N, dtype=np.int32)
    if padding_start:
        encoded[:n_kept] = ids[:n_kept]
    else:
        encoded[N - n_kept:] = ids[:n_kept]
    return encoded, n_kept

### Splitting into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 # Hold out 33% of the reviews for validation; random_state fixes the shuffle so the split is reproducible.
 X_train, X_valid, y_train, y_valid = train_test_split(df_reviews['text'], df_reviews['fun_bin'], test_size=0.33, random_state=42)
    

In [None]:
y_train.reset_index(inplace=True, drop=True)

In [None]:
X_train.reset_index(inplace=True, drop=True)
X_valid.reset_index(inplace=True, drop=True)

In [None]:
y_valid.reset_index(inplace=True, drop=True)

In [None]:
X_train.shape

In [None]:
y_train.shape

### Writing a dataset

In [None]:
class YelpDataset(Dataset):
    """Torch dataset of encoded reviews.

    Every review in `df` is encoded once, at construction time, with encode_sentence and the
    global `vocab2index`. Each item is the triple (encoded vector, kept token count, label).

    df: iterable of raw review texts.
    y: labels aligned with `df` by position (list, array or pandas Series).
    N: length of each encoded vector.
    padding_start: forwarded to encode_sentence.
    """
    def __init__(self, df, y, N=400, padding_start=True):
        self.df = df
        self.X = [encode_sentence(sent, vocab2index, N, padding_start) for sent in self.df]
        # Store the labels as a plain array so that __getitem__ is positional. Indexing a
        # pandas Series with an integer looks up the index LABEL, which is only correct when
        # the index happens to be exactly 0..n-1.
        self.y = np.asarray(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]


In [None]:
train_ds =  YelpDataset(X_train, y_train, padding_start=False)
valid_ds =  YelpDataset(X_valid, y_valid, padding_start=False)


In [None]:
# Collect the positions of training samples whose encoded length `s` is zero, i.e. reviews
# with no tokens left after cleaning and stop-word removal.
neg=[]
i=0
for x,s,y in train_ds:
    if s <=0:
        neg.append(i)
    i+=1

In [None]:
neg

In [None]:
# Drop the empty training reviews found above (their positions are in `neg`) instead of one
# hard-coded row label, which is only valid for one particular data file and split.
# Positions equal index labels here because X_train / y_train were re-indexed from 0.
X_train = X_train.drop(index=neg).reset_index(drop=True)
y_train = y_train.drop(index=neg).reset_index(drop=True)

In [None]:
X_train.shape, y_train.shape

In [None]:
train_ds =  YelpDataset(X_train, y_train, padding_start=False)

In [None]:
# Same check on the validation set: positions of samples whose encoded length `s` is zero.
# NOTE(review): the resulting list is not used by any later cell visible here - confirm
# whether empty validation reviews should be dropped as well.
neg=[]
i=0
for x,s,y in valid_ds:
    if s <=0:
        neg.append(i)
    i+=1

In [None]:
train_ds[100834]

In [None]:
train_ds[0]

In [None]:
len(y_train)

In [None]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [None]:
class LSTMV0Model(torch.nn.Module):
    """Baseline LSTM classifier that reads the whole fixed-length input, padding included.

    vocab_size: number of rows of the embedding table (index 0 is the padding row).
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    forward(x) takes a (batch, seq_len) tensor of token indices and returns one logit per
    sample, shape (batch, 1).
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.dropout(self.embeddings(x))
        _, (h_n, _) = self.lstm(embedded)
        # h_n[-1] is the final hidden state of the last LSTM layer, one row per sample.
        last_state = h_n[-1]
        return self.linear(last_state)

In [None]:
def train_epocs_v0(model, epochs=10, lr=0.001):
    """Train `model` with Adam on the global `train_dl`, validating on the global `valid_dl`.

    model: a module called as model(x) that returns one logit per sample, shape (batch, 1).
    epochs: number of passes over `train_dl`.
    lr: Adam learning rate.

    Batches are moved to whatever device the model's parameters live on, so the same code
    runs on GPU and CPU. Validation is computed after every epoch with val_metrics_v0, but
    the metrics are only printed on epochs 1, 6, 11, ... (0-based).
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    device = next(model.parameters()).device
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight it by the batch size to get a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [None]:
def val_metrics_v0(model, valid_dl):
    """Return (mean loss, accuracy) of `model` over `valid_dl` as two Python floats.

    model: a module called as model(x) that returns one logit per sample, shape (batch, 1).
    valid_dl: iterable of (x, s, y) batches; s is ignored by this model.

    The model is switched to eval mode, gradients are disabled (no graph is built during
    evaluation) and batches are moved to the device of the model's parameters.
    A logit > 0 counts as a positive prediction.
    """
    model.eval()
    device = next(model.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in valid_dl:
            # s is not used here
            x = x.long().to(device)
            y = y.float().to(device).unsqueeze(1)
            y_hat = model(x)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [None]:
vocab_size = len(words)
print(vocab_size)
model_v0 = LSTMV0Model(vocab_size, 50, 50).cuda()

In [None]:
train_epocs_v0(model_v0, epochs=30, lr=0.01)

In [None]:
train_epocs_v0(model_v0, epochs=30, lr=0.005)

### Model with variable length

In [None]:
# dataset with padding at the end
train_ds_2 =  YelpDataset(X_train, y_train, padding_start=True)
valid_ds_2 =  YelpDataset(X_valid, y_valid, padding_start=True)


In [None]:
class LSTMModel(torch.nn.Module):
    """LSTM classifier that skips the padding by packing the sequences.

    Expects inputs whose real tokens come first and whose zeros follow.

    vocab_size: number of rows of the embedding table (index 0 is the padding row).
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    forward(x, s) takes a (batch, seq_len) tensor of token indices and the per-sample token
    counts, and returns one logit per sample, shape (batch, 1), in the ORIGINAL batch order.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        # pack_padded_sequence wants integer lengths on the CPU and rejects a length of 0, so an
        # all-padding row is treated as one (padding) token instead of crashing the batch.
        lengths = torch.as_tensor(s).long().cpu().clamp(min=1)
        # Packing requires the batch sorted by decreasing length.
        lengths, sort_index = torch.sort(lengths, 0, descending=True)
        x = x[sort_index.to(x.device)]
        x = self.embeddings(x)
        x = self.dropout(x)
        # Packed input makes the LSTM stop at each sequence's last real token.
        x_pack = pack_padded_sequence(x, lengths.tolist(), batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])  # rows are still in sorted order here
        # scatter_ undoes the sorting: row i of `out` goes back to position sort_index[i].
        # The index is moved to out's device rather than assuming CUDA.
        restore_index = sort_index.unsqueeze(1).to(out.device)
        return torch.zeros_like(out).scatter_(0, restore_index, out)
    
    
    

In [None]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train a length-aware model with Adam on the global `train_dl`, validating on `val_dl`.

    model: a module called as model(x, s) - token indices plus per-sample lengths - that
        returns one logit per sample, shape (batch, 1).
    epochs: number of passes over `train_dl`.
    lr: Adam learning rate.

    Batches are moved to whatever device the model's parameters live on; the lengths `s` are
    passed through unchanged. Validation runs after every epoch with val_metrics, but the
    metrics are only printed on epochs 1, 6, 11, ... (0-based).
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    device = next(model.parameters()).device
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight it by the batch size to get a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [None]:
def val_metrics(model, val_dl):
    """Return (mean loss, accuracy) of a length-aware model over `val_dl` as Python floats.

    model: a module called as model(x, s) that returns one logit per sample, shape (batch, 1).
    val_dl: iterable of (x, s, y) batches.

    The model is switched to eval mode, gradients are disabled (no graph is built during
    evaluation) and batches are moved to the device of the model's parameters.
    A logit > 0 counts as a positive prediction.
    """
    model.eval()
    device = next(model.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in val_dl:
            x = x.long().to(device)
            y = y.float().to(device).unsqueeze(1)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [None]:
batch_size = 5000
train_dl = DataLoader(train_ds_2, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds_2, batch_size=batch_size)

In [None]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 50).cuda()

In [None]:
train_epocs(model, epochs=50, lr=0.01)

In [None]:
train_epocs(model, epochs=30, lr=0.005)

In [None]:
### CNN with text

In [None]:
V = vocab_size 
D = 50

In [None]:
emb = nn.Embedding(vocab_size, 50)

In [None]:
# Take one training batch to walk through the CNN building blocks step by step. Without this
# line the cell only works when a tensor `x` happens to be left over in the kernel.
x, s, y = next(iter(train_dl))
x1 = emb(x.long())

In [None]:
x1.shape

In [None]:

x1 = x1.transpose(1,2)  # needs to convert x to (batch, embedding_dim, sentence_len)
x1.size()

In [None]:
conv_3 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=3)

In [None]:
x3 = conv_3(x1)


In [None]:
x3.size()

In [None]:
conv_4 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=4)
conv_5 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=5)

In [None]:
x4 = conv_4(x1)
x5 = conv_5(x1)
print(x4.size(), x5.size())

In [None]:
# 100 3-gram detectors
x3 = nn.ReLU()(x3)
# Max over the whole remaining sequence axis (398 positions for 400-token inputs) instead of
# a hard-coded kernel size that silently depends on the encoding length.
x3 = nn.MaxPool1d(kernel_size = x3.size(2))(x3)
x3.size()

In [None]:
# 100 4-gram detectors
x4 = nn.ReLU()(x4)
# Max over the whole remaining sequence axis (397 positions for 400-token inputs) instead of
# a hard-coded kernel size that silently depends on the encoding length.
x4 = nn.MaxPool1d(kernel_size = x4.size(2))(x4)
x4.size()

In [None]:

# 100 5-gram detectors
x5 = nn.ReLU()(x5)
# Max over the whole remaining sequence axis (396 positions for 400-token inputs) instead of
# a hard-coded kernel size that silently depends on the encoding length.
x5 = nn.MaxPool1d(kernel_size = x5.size(2))(x5)
x5.size()

In [None]:

# concatenate x3, x4, x5
out = torch.cat([x3, x4, x5], 2)
out.size()

In [None]:

out = out.view(out.size(0), -1)
out.size()

In [None]:

class CNNModel(nn.Module):
    """Text CNN: parallel 3-, 4- and 5-gram convolutions, global max pooling, linear head.

    vocab_size: number of rows of the embedding table (index 0 is the padding row).
    hidden_dim: size of each token embedding (the convolutions' input channels).
    n_filters: number of feature detectors per n-gram width.
    forward(x) takes a (batch, seq_len) tensor of token indices with seq_len >= 5 and returns
    one logit per sample, shape (batch, 1).
    """

    def __init__(self, vocab_size, hidden_dim, n_filters=100):
        super(CNNModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)

        self.conv_3 = nn.Conv1d(in_channels=hidden_dim, out_channels=n_filters, kernel_size=3)
        self.conv_4 = nn.Conv1d(in_channels=hidden_dim, out_channels=n_filters, kernel_size=4)
        self.conv_5 = nn.Conv1d(in_channels=hidden_dim, out_channels=n_filters, kernel_size=5)
        self.dropout = nn.Dropout(p=0.5)
        # The head consumes the three pooled feature sets, i.e. 3 * n_filters values. Sizing
        # it with hidden_dim only worked because hidden_dim happened to be 300 == 3 * 100.
        self.fc = nn.Linear(3 * n_filters, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)  # Conv1d expects (batch, channels, seq_len)
        x3 = F.relu(self.conv_3(x))
        x4 = F.relu(self.conv_4(x))
        x5 = F.relu(self.conv_5(x))
        # Global max pooling: the kernel spans the whole remaining sequence axis, whatever the
        # input length is (for 400-token inputs these are the former 398 / 397 / 396).
        x3 = F.max_pool1d(x3, kernel_size=x3.size(2))
        x4 = F.max_pool1d(x4, kernel_size=x4.size(2))
        x5 = F.max_pool1d(x5, kernel_size=x5.size(2))
        out = torch.cat([x3, x4, x5], 2)
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        return self.fc(out)

In [None]:
vocab_size = len(words)
print(vocab_size)
model = CNNModel(vocab_size, 300).cuda()

In [None]:
# testing the model

print(x.shape)
x = x.long().cuda()

In [None]:
y_hat = model(x)
y_hat.size()

In [None]:
def val_metrics(m, val_dl):
    """Return (mean loss, accuracy) of the model `m` over `val_dl` as two Python floats.

    NOTE: this redefines the earlier `val_metrics` written for LSTMModel; this version calls
    the model as m(x), without the lengths.

    m: a module called as m(x) that returns one logit per sample, shape (batch, 1).
    val_dl: iterable of (x, s, y) batches; s is ignored.

    The metrics are computed on the model that was passed in (the original body evaluated the
    global `model` and ignored `m`). Gradients are disabled and batches are moved to the
    device of the model's parameters. A logit > 0 counts as a positive prediction.
    """
    m.eval()
    device = next(m.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in val_dl:
            x = x.long().to(device)
            y = y.float().to(device).unsqueeze(1)
            y_hat = m(x)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [None]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train `model` with Adam on the global `train_dl`, validating on the global `val_dl`.

    NOTE: this redefines the earlier `train_epocs` written for LSTMModel; this version calls
    the model as model(x), without the lengths.

    model: a module called as model(x) that returns one logit per sample, shape (batch, 1).
    epochs: number of passes over `train_dl`.
    lr: Adam learning rate.

    Batches are moved to whatever device the model's parameters live on. Validation runs
    after every epoch with val_metrics, but the metrics are only printed on epochs
    1, 6, 11, ... (0-based).
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    device = next(model.parameters()).device
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight it by the batch size to get a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [None]:
batch_size = 500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [1]:
model = CNNModel(vocab_size, 300).cuda()

NameError: name 'CNNModel' is not defined

In [None]:
train_epocs(model, epochs=30, lr=0.01 )