# Exam

Develop a model for predicting review rating.  
**Binary classification:**  
**positive class: target = 5**   
**negative class: target = 1,2,3,4**  
Score: **binary F1**  
You are forbidden to use test dataset for any kind of training.  
Remember proper training pipeline.  
If you are not using default params in the models, you have to use some validation scheme to justify them. 

Use `random_state` or `seed` params - your experiment must be reprodusible.


### 1 baseline = 0.720
### 2 baseline = 0.745


**QUESTION 2:** What is the interpretation of Laplace smoothing in n-gram language model?

To reduce zero-probabilities for n-grams that we do not have in our train data, we act like we have this n-gram at least one time in our train data. We add some delta (1, for example) to all the counts when calculating the probability.

In [0]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

SEED = 23
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [0]:
!wget -O data.zip https://github.com/thedenaas/hse_seminars/blob/master/2019/exam/exam_data.zip?raw=true
!unzip '/content/data.zip'

In [3]:
df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

df_train['target'] = (df_train['target'] == 5).astype(np.int)
df_test['target'] = (df_test['target'] == 5).astype(np.int)

df_train.shape

(48192, 3)

In [4]:
df_train = df_train.drop(['title'], axis=1)
df_test = df_test.drop(['title'], axis=1)
df_train.head()

Unnamed: 0,review,target
0,"The staff was very friendly, the breakfast ver...",1
1,Excellent service - very approachable and prof...,0
2,Really a top notch place to spend a day at the...,1
3,"a little noisy, there was a false fire alarm a...",0
4,Place had too many animals and I'm allergic to...,0


In [0]:
count_vect = TfidfVectorizer(ngram_range=(1, 5))
X_train = count_vect.fit_transform(df_train.review)
X_test = count_vect.transform(df_test.review)

clf = LinearSVC().fit(X_train, df_train.target)
predicted = clf.predict(X_test)

binary_f1 = f1_score(predicted, df_test.target, average='binary')
binary_f1

0.7362463210323749

In [0]:
from sklearn.model_selection import train_test_split

X = df_train['review']
y = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [0]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), 
                     ('clf', LinearSVC())])

In [0]:
from sklearn.model_selection import GridSearchCV

parameters = {'tfidf__use_idf': (True, False),
              'tfidf__ngram_range': ((1,3), (1,4),(1,6)),
              'tfidf__max_df': (0.2, 0.8, 1),
               'clf__tol': (1e-4, 1e-3, 1e-5)}
              
grid_search = GridSearchCV(text_clf, parameters, cv=3, n_jobs=-1, scoring='f1')
grid_search = grid_search.fit(df_train.review, df_train.target)

In [0]:
model = grid_search.best_estimator_

predicted = model.predict(df_test.review)
binary_f1 = f1_score(predicted, df_test.target, average='binary')
binary_f1

# Torch

In [5]:
import spacy
from spacy.symbols import ORTH
import re
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, KeyedVectors

import torch
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import Vectors

SEED = 42
np.random.seed(SEED)

import nltk
nltk.download('stopwords')

spacy_en = spacy.load('en')
spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
spacy_en.tokenizer.add_special_case("didn't", [{ORTH: "did"}, {ORTH: "not"}]) #adding special case so that tokenizer("""don't""") != 'do'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [0]:
def tokenizer(text):
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

In [8]:
review_tokenized = list(df_train['review'].apply(tokenizer))
model = Word2Vec(review_tokenized, size=100) #building emb of size 100
model_weights = torch.FloatTensor(model.wv.vectors)
model.wv.save_word2vec_format('pretrained_embeddings')
vectors = Vectors(name='pretrained_embeddings', cache='./') #and saving the weights to build vocab later

  0%|          | 0/10888 [00:00<?, ?it/s]Skipping token b'10888' with 1-dimensional vector [b'100']; likely a header
 77%|███████▋  | 8418/10888 [00:00<00:00, 17042.31it/s]


In [0]:
df_train.to_csv('data_train.csv', index=False)
df_test.to_csv('data_test.csv', index=False)

In [0]:
MAX_VOCAB_SIZE = 50000

REVIEW = Field(sequential=True, include_lengths=False, batch_first=True, fix_length=50, tokenize=tokenizer, pad_first=True, lower=True) 
TARGET = LabelField(dtype=torch.float)

train = TabularDataset('/content/data_train.csv', 
                                format='csv', fields=[('review', REVIEW),('target', TARGET)], 
                                skip_header=True)

test = TabularDataset('/content/data_test.csv', 
                                format='csv', fields=[('review', REVIEW),('target', TARGET)], 
                                skip_header=True)

REVIEW.build_vocab(train, min_freq=2, vectors=vectors,
                   unk_init = torch.Tensor.normal_, max_size=MAX_VOCAB_SIZE)
TARGET.build_vocab(train)
vocab = REVIEW.vocab

In [48]:
print('Vocab size:', len(REVIEW.vocab.itos))
REVIEW.vocab.itos[:10]

Vocab size: 17383


['<unk>', '<pad>', 'the', 'be', 'and', 'a', 'to', 'in', 'we', 'i']

In [49]:
print(train[0].review)
print(train[0].target)

['the', 'staff', 'be', 'very', 'friendly', 'the', 'breakfast', 'very', 'nice', 'extremely', 'comfortable', 'bed']
1


In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels, padding_idx):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.embedding.weight.data.copy_(vocab.vectors)        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k) for k in kernels])
        #self.dropout = nn.Dropout(dropout)        
        self.fc = nn.Linear(hidden_size * len(kernels), 1)

        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.relu(z)
            z = F.max_pool1d(z, kernel_size=z.size(2)).squeeze(2)
            concatenated.append(z)
            
        x = torch.cat(concatenated, 1)
        #x = self.dropout(x)
        x = self.fc(x)
        return x

In [0]:
train, valid = train.split(0.90)

In [0]:
def create_model(batch_size, hidden_size, kernels):
    """
    Функция определяет модель по заданным гиперпараметрам и возвращает итераторы с заданным batch_size, а также оптимайзер и критерий
    """
    torch.cuda.empty_cache()    

    padding_idx = REVIEW.vocab.stoi[REVIEW.pad_token]
    
    model = MyModel(len(REVIEW.vocab.itos),
                    embed_size=100,
                    hidden_size=hidden_size,
                    kernels=kernels,
                    padding_idx = padding_idx
                )
    model.to(device)

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test),
        batch_sizes=(batch_size, batch_size, batch_size),
        shuffle=True,
        sort_key=lambda x: len(x.review),
        device=device,
    )

    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
    criterion.to(device)
    return model, train_iterator, valid_iterator, test_iterator, optimizer, criterion

In [0]:
def f1_scoring(preds, y):
    preds = torch.round(torch.sigmoid(preds)).cpu().detach().numpy()
    binary_f1 = f1_score(y.cpu().detach().numpy(), preds, average='binary')
    return binary_f1

In [0]:
def test_model(model, test_iterator):
    test_acc = []

    with torch.no_grad():
        for item in test_iterator:
            x = item.review
            # print(x.size())
            y = item.target
            preds = model(x).squeeze(1)
            test_acc.append(f1_scoring(preds, y))
    test_acc = np.mean(test_acc) 
    return print('Test F1_binary: {}'.format(np.mean(test_acc)))

In [0]:
def train_cnn(model, train_iterator, test_iterator, criterion, device, n_epochs=20):
    
    history = []

    for epoch in range(n_epochs):
        train_loss = []
        train_acc = []
        model.train()

    
        for item in tqdm(train_iterator):
            x = item.review
            y = item.target
            optimizer.zero_grad()
            preds = model(x).squeeze(1)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.data.detach().item())
            train_acc.append(f1_scoring(preds, y))

        train_loss = np.mean(train_loss)
        train_acc = np.mean(train_acc)

        model.eval()
        test_model(model, test_iterator)


        print('Epoch: {}. Train loss: {:.3f}. Train accuracy: {:.3f}.'.format(
            epoch, train_loss, train_acc))        
        
        history.append({
            'epoch': epoch,
            'train_loss': train_loss,
            'train_acc': train_acc,
        })

        if epoch % 5 == 0:
            torch.save(model.state_dict(), '/content/model_test')

    return history

In [0]:
from tqdm import tqdm
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

In [0]:
model , train_iterator,  valid_iterator, test_iterator, optimizer,criterion = create_model(64, 128, [2,3,4])
model.embedding.weight.data.copy_(REVIEW.vocab.vectors);

In [81]:
clean_tqdm()
history = train_cnn(model, train_iterator,test_iterator,
          criterion,  device='cpu', n_epochs=11)

  0%|          | 0/678 [00:00<?, ?it/s]


AttributeError: ignored

In [41]:
test_f1 = test_model(model,valid_iterator)
test_f1

torch.Size([64, 2])


RuntimeError: ignored