## Question 10
Seq2seq models are trained by maximizing the likelihood of next token given: the output from the previous node of the decoder, depending on output of the previous state and the input summary while in inference each word is generated sequentially only based on previously generated words (previous LSTM decoder step)

## Var 2
Develop a model for predicting review rating.
Binary classification:
positive class: target = 5
negative class: target = 1,2,3,4
Score: binary F1
You are forbidden to use test dataset for any kind of training.
Remember proper training pipeline.
If you are not using default params in the models, you have to use some validation scheme to justify them.

Use random_state or seed params - your experiment must be reprodusible.

## 1 baseline = 0.720
## 2 baseline = 0.745

In [0]:
import pandas as pd
import numpy as np
import sklearn
import spacy
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import f1_score

SEED=1337
np.random.seed(SEED)

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train['target'] = (df_train['target'] == 5).astype(np.int)
df_test['target'] = (df_test['target'] == 5).astype(np.int)

df_train.shape, df_test.shape

((48192, 3), (5355, 3))

In [3]:
df_train

Unnamed: 0,review,title,target
0,"The staff was very friendly, the breakfast ver...",Walker Gem,1
1,Excellent service - very approachable and prof...,Excellent Service,0
2,Really a top notch place to spend a day at the...,"Good location, warm and friendly staff",1
3,"a little noisy, there was a false fire alarm a...","nice hotel,",0
4,Place had too many animals and I'm allergic to...,Experience,0
...,...,...,...
48187,"A friend of mine always books the cheapest, ba...","Comfy, cozy but oh, so grand!",0
48188,Stayed here with my family over Spring Break i...,Great location and price for family lodging.,0
48189,One word AWFUL and the pool was closed,Good hotel in a quiet part of Memphis,0
48190,Never will stay here again. Dirty towels shirt...,Filthy,0


In [0]:
X_train = df_train['review']
y_train = df_train['target']

In [0]:
tfidf = TfidfVectorizer(max_df=0.5, min_df=10)
X_train = tfidf.fit_transform(X_train)

test_data = df_test
X_test = tfidf.transform(test_data['review'])
y_test = test_data['target']

In [6]:
def sigmoid(z):
    return 1.0/(1.0 + np.e**(-z))

class LogRegNumpy(ClassifierMixin):
    def __init__(self, llambda=1, lr=0.1, batch_size=32, n_epochs=100):
        
        self.w = None
        self.b = 0
        self.llambda = llambda
        self.n_epochs = n_epochs
        self.lr = lr
        self.history = []
        self.batch_size = batch_size

    def fit(self, X, y):
        self.w = np.random.randn(X.shape[1])
        self.b = 0
        
        for epoch in range(self.n_epochs):
            
            # random permutation over indices of dataset
            batch_indices = np.random.permutation(len(y))
            
            for j in range(0, len(y), self.batch_size):
                batch_idx = batch_indices[j:j+32]
                batch_X = X[batch_idx]
                batch_y = y[batch_idx]
                
                # forward pass
                # <TODO> [1 point] calculate batch loss
                
                z = batch_X @ self.w + self.b
                batch_y_hat = sigmoid(z)
                
                loss = -np.mean(batch_y * np.log(batch_y_hat) + (1 - batch_y) * np.log(1 - batch_y_hat)) + 0.5*self.llambda * self.w.T @ self.w
#                 print(loss)
                # backward pass
                # <TODO> [2 points] calculate batch gradients 
                grad_z = sigmoid(z) - batch_y
                grad_w = batch_X.T @ (batch_y_hat - batch_y) * (1.0/self.batch_size) + self.llambda * self.w
                grad_b = np.mean(grad_z)

                # SGD optimization step
                # <TODO> [1 point]
                self.w = self.w - self.lr * grad_w
                self.b = self.b - self.lr * grad_b
                
                self.history.append(loss)
        
        return self 
    
    def predict_proba(self, X):
        # <TODO> [1 point] calculate p(y=1 | x)
        p = sigmoid(X @ self.w + self.b)
        return p
    
    def predict(self, X):
        return (self.predict_proba(X) > 0.5).astype(np.int)
    
    
model = LogRegNumpy(llambda = 0.001, lr=3, batch_size=128, n_epochs=100)
model.fit(X_train, y_train)
print('auc', metrics.roc_auc_score(y_test, model.predict_proba(X_test)))

auc 0.814824124309457


In [0]:
from sklearn.model_selection import GridSearchCV

def train_valid_pipeline(model, X_train, y_train, X_test, y_test, params):
  model.fit(X_train, y_train)
  print("f1_score on train: ", f1_score(y_train, model.predict(X_train)))

  valid = GridSearchCV(model, param_grid = params, cv = 5)
  valid.fit(X_train, y_train)
  print("f1_score after regularization: ", f1_score(y_train, valid.predict(X_train)))
  print("Best params: ", valid.best_params_)
  model = valid.best_estimator_

  print("f1_score on test: ", f1_score(y_test, model.predict(X_test)))

In [8]:
hyper_param = {'dual' : [True, False], 'C': np.arange(.01,1,.01), 'solver' : ['liblinear'], 'random_state': [50, 43, 25] }
train_valid_pipeline(model, X_train, y_train, X_test, y_test, hyper_param)

f1_score on train:  0.14621434222160176


TypeError: ignored