In [None]:
import os
import re
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

'''from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
'''
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
yelp_business = pd.read_json('../data/yelp_dataset/business.json', lines=True)
yelp_business.fillna('NA', inplace=True)
# we want to make sure we only work with restaurants -- nothing else
restaurants = yelp_business[yelp_business['categories'].str.contains('Restaurants')]
print('Number of all businesses: ',yelp_business.shape[0])
print(f"Shape of restaurants dataset{restaurants.shape}")

In [None]:
yelp_business.head()

In [None]:
restaurants.head()

Now we bring the reviews and perform some preprocessing on those reviews..

In [None]:
yelp_review_iter = pd.read_json('../data/yelp_dataset/review.json', chunksize=100000, lines=True)


Because reviews are too big, we will read them in chunks, and make sure we delete reviews of places that are not in our list of businesses filtered earlier. Note here we choose 5 chunks, but we could have chosen any number (larger numbers will give MemoryError later on).

In [None]:
yelp_review = pd.DataFrame()
i=0
for df in yelp_review_iter:
    
    df = df[df['business_id'].isin(restaurants['business_id'])]
    print(df.shape)
    yelp_review = pd.concat([yelp_review, df])
    i=i+1
    print(i)
    if i==70: break

In [None]:
yelp_review.shape

Also make sure we only get businesses that already show up in our review list and delete the rest.

In [None]:
import pickle
yelp_review.to_pickle("pickled_reviews.pickle")


In [None]:
rest_reviews =pd.read_pickle("pickled_reviews.pickle")

In [None]:
yelp_business = yelp_business[yelp_business['business_id'].isin(rest_reviews['business_id'])]

In [None]:
print('Final businesses shape: ', yelp_business.shape)
print('Final review shape: ', rest_reviews.shape)

In [None]:
rest_reviews.head()

In [None]:
rest_reviews['funny'].describe()

In [None]:
pd.options.display.max_seq_items = 2000
print(rest_reviews[rest_reviews['funny']==1290][['business_id', 'text']])

In [None]:
rest_reviews.loc[1331304,'text']

In [None]:
# Check:
print( (rest_reviews['funny']>4).mean())
print(f"Number of funny reviews:{(rest_reviews['funny']>4).sum()}")
#print(rest_reviews['fun_bin'].mean())

In [None]:
75269/4201684

In [None]:
rest_reviews['fun_bin']=rest_reviews['funny'].apply(lambda x: 1 if x>4 else 0)

In [None]:
print(rest_reviews['fun_bin'].mean())

#### Getting a df with funny reviews

In [None]:
rest_reviews_fun = rest_reviews[rest_reviews['fun_bin']==1]

In [None]:
rest_reviews_fun.shape

In [None]:
rest_reviews_fun.drop_duplicates(subset= 'text', inplace=True)

In [None]:
rest_reviews_fun.shape

### Sampling not funny reviews


In [None]:
rest_reviews_not_fun = rest_reviews[rest_reviews['fun_bin']==0]

In [None]:
idx = rest_reviews_not_fun.index.values

In [None]:
len(idx)

In [None]:
# Balancing the classes: getting the same number of not funny reviews as funny
#random_hotels = np.random.choice(neg_activity_df["hotel"].unique(), len(neg_activity_df))
random_idx = np.random.choice(idx,rest_reviews_fun.shape[0])

In [None]:
len(random_idx)

In [None]:
rest_reviews_not_fun = rest_reviews_not_fun.loc[random_idx,:].copy()

In [None]:
rest_reviews_not_fun.shape

In [None]:
reviews_final = pd.concat([rest_reviews_fun, rest_reviews_not_fun])

In [None]:
reviews_final.shape

In [None]:
reviews_final.to_csv("../data/yelp_dataset/balanced_reviews.csv")

### Reading the reviews[](http://)

In [None]:
reviews_final = pd.read_csv("../input/yelp-reviews/balanced_reviews.csv")

In [None]:
reviews_final.head()

In [None]:
df_reviews = reviews_final[['funny','text', 'fun_bin']]

In [None]:
pd.set_option('display.max_colwidth', -1)
df_reviews.head()

### Data pre-processing

#### Goals
- Keep punctuation
- Split by ".", "!" to account for misspeling (like "Hi!I went to...")
- Try TF-IDF?

In [None]:
#!pip install -U spacy[cuda92]

In [None]:
#!pip install spacymoji

In [None]:
import pandas as pd
import numpy as np
#import nltk
#from nltk.corpus import stopwords
#from nltk.stem import SnowballStemmer

from string import punctuation


import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 
#from spacymoji import Emoji

In [None]:

re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

#nlp = spacy.load("en")
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

def clean_text(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
   # text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=\(\)]", " ", text) # keep punctuatuin, numnbers and letters
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " . ", text) #Add space to the dot
    text = re.sub(r"!", " ! ", text) #Add space to the exclamation sign
    text = re.sub(r":", " :", text) #Add space before : sign
    text = re.sub(r";", " ;", text) #Add space before ; sign
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    #text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    #text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    # find emojis
    emoji_list = []
    '''
    for word in text.split():
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)
    emoji_list'''
    #text = text.split()

    return text

my_tok = spacy.load('en')
#emoji = Emoji(my_tok)
#my_tok.add_pipe(emoji, first=True)
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(clean_text(x))]

def remove_stop_words(tokens): return [tok for tok in tokens if tok not in spacy_stopwords]

In [None]:
text = "I'm soooo excited!!!!!This is 10000% the best place on earth:))))) ðŸ˜ƒ..."

In [None]:
text

In [None]:
clean_text(text)

In [None]:
spacy_tok(clean_text(text))

In [None]:
text2 = "I also ordered a jade chicken quesadilla on the side.\n\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a 'C' grade lol ðŸ˜ƒ"




In [None]:
remove_stop_words(spacy_tok(clean_text(text2)))

### Building a vocabulary

In [None]:
counts = Counter()
for sent in df_reviews['text']:
    try:
        counts.update(remove_stop_words(spacy_tok(sent)))
    except:
        pass

In [None]:
counts

### Vocabulary

In [None]:
# Vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [None]:
# WHat is the 99% quantile of  length of the sentence?

df_reviews['len_text'] = df_reviews['text'].apply(lambda x: len(x.split()))


In [None]:
df_reviews['len_text'].quantile(0.95)

In [None]:
# note that spacy_tok takes a while run it just once
def encode_sentence(sent, vocab2index, N=500, padding_start=True):
    "Encoding a sentence adding padding"
    x = remove_stop_words(spacy_tok(sent))
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

### Splitting into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 X_train, X_valid, y_train, y_valid = train_test_split(df_reviews['text'], df_reviews['fun_bin'], test_size=0.33, random_state=42)
    

In [None]:
y_train.reset_index(inplace=True, drop=True)

In [None]:
X_train.reset_index(inplace=True, drop=True)
X_valid.reset_index(inplace=True, drop=True)

In [None]:
y_valid.reset_index(inplace=True, drop=True)

In [None]:
X_train.shape

In [None]:
y_train.shape

### Writing a dataset

In [None]:
class YelpDataset(Dataset):
    def __init__(self, df, y, N=400, padding_start=True):
        self.df = df
        self.X = [encode_sentence(sent, vocab2index, N, padding_start) for sent in self.df]
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]


In [None]:
train_ds =  YelpDataset(X_train, y_train, padding_start=False)
valid_ds =  YelpDataset(X_valid, y_valid, padding_start=False)


In [None]:
neg=[]
i=0
for x,s,y in train_ds:
    if s <=0:
        neg.append(i)
    i+=1

In [None]:
neg

In [None]:
X_train.drop(index = 99017, inplace=True)
y_train.drop(index = 99017, inplace=True)
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)

In [None]:
X_train.shape, y_train.shape

In [None]:
train_ds =  YelpDataset(X_train, y_train, padding_start=False)

In [None]:
neg=[]
i=0
for x,s,y in valid_ds:
    if s <=0:
        neg.append(i)
    i+=1

In [None]:
train_ds[100834]

In [None]:
train_ds[0]

In [86]:
len(y_train)

100835

In [87]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [88]:
class LSTMV0Model(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMV0Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [89]:
def train_epocs_v0(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [90]:
def val_metrics_v0(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        # s is not used here
        x = x.long().cuda()
        y = y.float().cuda().unsqueeze(1)
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [None]:
vocab_size = len(words)
print(vocab_size)
model_v0 = LSTMV0Model(vocab_size, 50, 50).cuda()

In [None]:
train_epocs_v0(model_v0, epochs=30, lr=0.01)

In [None]:
train_epocs_v0(model_v0, epochs=30, lr=0.005)

### Model with variable length

In [91]:
# dataset with padding at the end
train_ds_2 =  YelpDataset(X_train, y_train, padding_start=True)
valid_ds_2 =  YelpDataset(X_valid, y_valid, padding_start=True)


In [92]:
class LSTMModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        # sorting
        s, sort_index = torch.sort(s.float(), 0,descending=True) # s is the length of the sentence. Sort these lengths
        s = s.numpy().tolist() # 
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True) # We want LSTM to forget the padding, but in order to apply 
        #ordering mini batches withtin the model
        out_pack, (ht, ct) = self.lstm(x_pack) 
        out = self.linear(ht[-1]) # Problem here is that output is not sorted! 
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out) # scatter_ is undoing the sorting with the given sorting index
        # kind of sorting back with the original indexing
    
    
    

In [93]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [94]:
def val_metrics(model, val_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in val_dl:
        x = x.long().cuda()
        y = y.float().cuda().unsqueeze(1)
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [95]:
batch_size = 5000
train_dl = DataLoader(train_ds_2, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds_2, batch_size=batch_size)

In [96]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 50).cuda()

133541


In [97]:
train_epocs(model, epochs=30, lr=0.01)

train loss 0.631 val loss 0.655 and val accuracy 0.632
train loss 0.627 val loss 0.606 and val accuracy 0.685
train loss 0.607 val loss 0.582 and val accuracy 0.731
train loss 0.503 val loss 0.549 and val accuracy 0.749
train loss 0.467 val loss 0.551 and val accuracy 0.758
train loss 0.427 val loss 0.541 and val accuracy 0.768
