In [2]:
## Build Data set
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.utils.data import DataLoader
import torch.nn.functional as F


In [3]:
os.chdir('../')
os.getcwd()

'/Users/markbrackenrig/Documents/2021_FEB/ADSI/assignment_2'

In [4]:
from src.data import make_dataset

In [5]:
data = pd.read_csv('data/raw/beer_reviews.csv')
subsample = data.sample(frac=0.8)

In [6]:
X,y,y_encoder,ohe,scaler = make_dataset.process_data(subsample)

columns dropped
brewery names encoded
scaled data


In [7]:
subsample.head(5)

Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste
512711,2.5,3.0,Irish Red Ale,3.0,3.5
827799,3.5,4.0,Eisbock,4.0,4.0
415174,4.0,4.0,Russian Imperial Stout,3.5,4.0
1577861,4.5,4.0,American IPA,4.5,4.0
352001,3.0,3.0,Witbier,3.0,3.0


In [8]:
import pickle as pkl

pkl.dump(ohe, open('models/ohe.pkl', 'wb'))
pkl.dump(y_encoder, open('models/y_encoder.pkl', 'wb'))
pkl.dump(scaler, open('models/scaler.pkl', 'wb'))


## Split into training and test sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size = 0.25, random_state = 42)

In [11]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

## Define Baseline Model

In [12]:
from src.models.null import NullModel

baseline_model = NullModel(target_type = 'classification')
y_base = baseline_model.fit_predict(y_train)

In [13]:
from src.models.performance import print_class_perf
print_class_perf(y_base, y_train, set_name='Training', average = 'weighted')

Accuracy Training: 0.07398624427829732
F1 Training: 0.010193732688050032


## Define Model Architecture

Just want to get a basic 1 layer model working before I try and get anything deeper working

In [14]:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(num_features = X_train.shape[1],class_num= 104)

In [51]:
class PytorchMultiClass(nn.Module):
    def __init__(self, num_features,class_num):
        super(PytorchMultiClass, self).__init__()
        
        self.layer_1 = nn.Linear(num_features, 104)
        self.layer_out = nn.Linear(104,class_num)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.dropout(F.relu(self.layer_1(x)), training=self.training)
        x = self.layer_out(x)
        return self.softmax(x)
    
    
model = PytorchMultiClass(num_features = X_train.shape[1],class_num= 104)

In [52]:
from src.models.pytorch import get_device
device = get_device()
print(device)
model.to(device)

cpu


PytorchMultiClass(
  (layer_1): Linear(in_features=1004, out_features=104, bias=True)
  (layer_out): Linear(in_features=104, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

## Train Model

In [53]:
N_EPOCHS = 30
BATCH_SIZE = 10000
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [54]:
from src.models.train_model import train_classification, test_classification

In [55]:

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 14.0%
	(valid)	|	Loss: 0.0005	|	Acc: 17.4%
Epoch: 1: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 18.7%
	(valid)	|	Loss: 0.0005	|	Acc: 20.5%
Epoch: 2: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 20.3%
	(valid)	|	Loss: 0.0005	|	Acc: 20.5%
Epoch: 3: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 20.4%
	(valid)	|	Loss: 0.0005	|	Acc: 20.7%
Epoch: 4: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 20.5%
	(valid)	|	Loss: 0.0005	|	Acc: 20.8%
Epoch: 5: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|	Loss: 0.0005	|	Acc: 20.6%
	(valid)	|	Loss: 0.0005	|	Acc: 20.9%
Epoch: 6: |█████████████████████████████████████████████████-| 98.7% Complete
	(train)	|

In [56]:

def predict(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    results = []
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        feature, target_class = feature.to(device), target_class.flatten().to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            # Make predictions
            output = model(feature)
            
        results.append({'prediction': output.argmax(1)[0],'target': target_class[0] })
    return results

In [57]:
test_predictions = predict(test_dataset, model=model, criterion=criterion, batch_size=1, device=device)

In [58]:
test_predictions = pd.DataFrame(test_predictions)

In [59]:
test_predictions.head(4)

Unnamed: 0,prediction,target
0,tensor(25),tensor(98.)
1,tensor(12),tensor(12.)
2,tensor(9),tensor(20.)
3,tensor(9),tensor(5.)


In [60]:
test_predictions["prediction"][0].int()

tensor(25, dtype=torch.int32)

In [61]:
from sklearn.metrics import confusion_matrix

In [62]:
pd.DataFrame(confusion_matrix(test_predictions['prediction'].astype(int), test_predictions['target'].astype(int))).to_csv('reports/confusion_matrix.csv')

## Save Model

In [63]:
torch.save(model.state_dict(), "models/beer_style_prediction.pt")

In [65]:
model2 = PytorchMultiClass(num_features = 1004,class_num= 104)
model2.load_state_dict(torch.load('models/beer_style_prediction.pt'))

<All keys matched successfully>

In [66]:
test_predictions = predict(test_dataset, model=model2, criterion=criterion, batch_size=1, device=device)

In [67]:
test_predictions = pd.DataFrame(test_predictions)
test_predictions.head(4)

Unnamed: 0,prediction,target
0,tensor(25),tensor(98.)
1,tensor(12),tensor(12.)
2,tensor(9),tensor(20.)
3,tensor(9),tensor(5.)


In [43]:
print(model2.state_dict())

OrderedDict([('layer_1.weight', tensor([[-0.0872, -0.3472, -1.8527,  ...,  2.2696,  2.3152,  8.0728],
        [ 0.9249, -0.9161, -0.1204,  ..., -5.3224, -4.3585, -2.9305],
        [ 0.1729,  2.0358, -1.4084,  ..., -1.4865,  5.9749, -2.6550],
        ...,
        [ 0.9383, -0.3152,  0.8588,  ..., -2.5122,  3.6488, -0.4877],
        [-1.0225,  0.1445, -0.2054,  ..., -4.0997,  8.0831, -0.0414],
        [-2.3319,  3.0059, -1.6737,  ...,  1.8239, -2.1176, -3.3638]])), ('layer_1.bias', tensor([-3.6998,  2.5781, -0.6749, -0.1455, -5.7757,  2.0219,  0.3723,  1.7660,
         2.2467,  1.0374,  0.1423, -1.5651, -3.2685,  0.6353, -0.5536, -1.7046,
        -4.4204,  2.8746, -0.4682, -2.6283, -3.8182, -0.1920,  0.8540, -5.4583,
         2.9375, -4.0122,  2.6254,  0.6579, -1.3517, -4.7528, -0.2017, -0.3514,
        -0.3790, -0.0262, -0.1056, -1.4354, -1.4846,  3.4382,  0.6050, -2.5827,
         0.9056, -3.0148,  0.5884,  1.0139, -0.6524, -7.4519, -3.1796, -5.4752,
         0.1723, -1.5168, -0.0100, 

In [54]:
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=1005, out_features=104, bias=True)
  (layer_2): Linear(in_features=104, out_features=104, bias=True)
  (layer_3): ReLU()
  (layer_out): Linear(in_features=104, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)


In [82]:
str(model)

'PytorchMultiClass(\n  (layer_1): Linear(in_features=1005, out_features=104, bias=True)\n  (layer_2): Linear(in_features=104, out_features=104, bias=True)\n  (layer_3): ReLU()\n  (layer_out): Linear(in_features=104, out_features=104, bias=True)\n  (softmax): Softmax(dim=1)\n)'

In [81]:
model.layer_1

Linear(in_features=1005, out_features=104, bias=True)

In [83]:
model.forward

<bound method PytorchMultiClass.forward of PytorchMultiClass(
  (layer_1): Linear(in_features=1005, out_features=104, bias=True)
  (layer_2): Linear(in_features=104, out_features=104, bias=True)
  (layer_3): ReLU()
  (layer_out): Linear(in_features=104, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)>