In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import os
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import cv2
import multiprocessing as mp
import joblib

train_on_gpu = torch.cuda.is_available()
if torch.cuda.device_count() > 1:
    multiple_gpus = True
    
print(f"Using {torch.cuda.device_count()} GPUs")

torch.manual_seed(0)

Using 4 GPUs


<torch._C.Generator at 0x7f12b06fb330>

In [2]:
df = pd.read_csv('../data/goodreads-best-books/book_data.csv')

def get_first_obs(g):
    return g.split('|')[0]

df = df.iloc[0:20000]
df['genres'].fillna('UNK', inplace=True)
df['genre'] = df['genres'].apply(get_first_obs)
df['author'] = df['book_authors'].apply(get_first_obs)
df.drop(['genres', 'book_authors', 'book_edition'], axis=1, inplace=True)
df.reset_index(inplace=True)

classes = (df['genre'].value_counts()).index[df['genre'].value_counts() > 5]
classes = [c for c in classes if c != 'UNK']

df = df[df['genre'].isin(classes)]
le = LabelEncoder()
df['genre'] = le.fit_transform(df['genre'])
joblib.dump(le, '../models/genre_encoder.sklearn')
classes = df.genre.unique()

df['index'] = df['index'].astype(str)+'.jpg'
df.reset_index(drop=True, inplace=True)

In [3]:
df.shape

(18915, 12)

In [4]:
train_transforms = transforms.Compose([transforms.ToPILImage(),
                                       transforms.Resize((224, 224)),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                      transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])])

test_transforms = transforms.Compose([transforms.ToPILImage(),
                                     transforms.Resize((224, 224)),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])])





In [5]:
bad_images = []
for i in tqdm(range(df.shape[0])):
    try:
        img = cv2.imread(f'../data/goodreads-best-books/images/images/{df["index"][i]}')
        img = train_transforms(img)
    except:
        bad_images.append(df['index'][i])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=18915), HTML(value='')))




In [6]:
df = df[~df['index'].isin(bad_images)].reset_index(drop=True)
df.to_csv('../data/clean_book_data.csv', index=False)
df = df[['index', 'genre']]

In [7]:
df.tail()

Unnamed: 0,index,genre
18832,19995.jpg,10
18833,19996.jpg,44
18834,19997.jpg,43
18835,19998.jpg,33
18836,19999.jpg,43


In [14]:
train_df, test_df = train_test_split(df, stratify = df.genre, test_size=.4, random_state=0)
val_df, test_df = train_test_split(test_df, stratify = test_df.genre, test_size=.5, random_state=0)

In [15]:
class DataSet(torch.utils.data.Dataset):
    def __init__(self, labels, data_directory, transform=None):
        super().__init__()
        self.labels = labels.values
        self.data_dir = data_directory
        self.transform=transform
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        name, label = self.labels[index]
        img_path = os.path.join(self.data_dir, name)
        img = cv2.imread(img_path)
        
        if self.transform is not None:
            img = self.transform(img)
        return img, label

In [16]:
batch_size = 196

train_data = DataSet(train_df,'../data/goodreads-best-books/images/images/', transform = train_transforms)
val_data = DataSet(val_df,'../data/goodreads-best-books/images/images/', transform = test_transforms)
test_data = DataSet(test_df,'../data/goodreads-best-books/images/images/', transform = test_transforms)

train_data_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size,
                                                shuffle = True, num_workers=mp.cpu_count())
val_data_loader = torch.utils.data.DataLoader(val_data, batch_size = batch_size,
                                              shuffle = True, num_workers=mp.cpu_count())
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size = batch_size,
                                              shuffle = True, num_workers=mp.cpu_count())

In [17]:
vgg = torchvision.models.vgg11(pretrained=False)
best_model = torchvision.models.vgg11(pretrained=False)
vgg.classifier[6] = nn.Linear(4096, len(classes))
best_model.classifier[6] = nn.Linear(4096, len(classes))

In [18]:
def train_model(model, train_data, val_data, epochs, loss, lr, model_name):
    optimizer = torch.optim.Adamax(model.parameters(), lr=lr)
    no_improvement=0
    best_loss = np.inf
    train_losses = []
    val_losses = []
    
    if train_on_gpu:
        model = model.cuda()
        
        if multiple_gpus:
            model = nn.DataParallel(model)
    
    for epoch in range(1, epochs+1):
        train_loss = 0
        val_loss = 0

        model.train()
        for images, labels in train_data:

            if train_on_gpu:
                images = images.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()
            out = model(images)
            loss = criterion(out, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        for images, labels in val_data:

            if train_on_gpu:
                images = images.cuda()
                labels = labels.cuda()

            out = model(images)
            loss = criterion(out, labels)

            val_loss += loss.item()

        train_loss = train_loss/len(train_data_loader.dataset)
        val_loss = val_loss/len(val_data_loader.dataset)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(epoch, train_loss, val_loss))

        #Saving the weights of the best model according to validation score
        if val_loss < best_loss:
            no_improvement = 0
            best_loss = val_loss
            print('Improved Model Score - Updating Best Model Parameters...')
            if multiple_gpus:
                torch.save(model.module.state_dict(), f'../models/{model_name}.pt')
            else:
                torch.save(model.state_dict(), f'../models/{model_name}.pt')
        else:
            no_improvement +=1
            if no_improvement==5:
                print('No Improvement for 5 epochs, Early Stopping')
                break
                
    joblib.dump(train_losses, '../data/no_pretrain_train_losses.pkl')
    joblib.dump(val_losses, '../data/no_pretrain_val_losses.pkl')

In [19]:
learning_rate = .0003
criterion = nn.CrossEntropyLoss()

train_model(vgg, train_data_loader, val_data_loader, 200, criterion, learning_rate, 'vgg_no_pretraining')

Epoch: 1 	Training Loss: 0.016574 	Validation Loss: 0.016361
Improved Model Score - Updating Best Model Parameters...
Epoch: 2 	Training Loss: 0.015718 	Validation Loss: 0.016124
Improved Model Score - Updating Best Model Parameters...
Epoch: 3 	Training Loss: 0.015588 	Validation Loss: 0.015934
Improved Model Score - Updating Best Model Parameters...
Epoch: 4 	Training Loss: 0.015464 	Validation Loss: 0.015946
Epoch: 5 	Training Loss: 0.015365 	Validation Loss: 0.015902
Improved Model Score - Updating Best Model Parameters...
Epoch: 6 	Training Loss: 0.015205 	Validation Loss: 0.015694
Improved Model Score - Updating Best Model Parameters...
Epoch: 7 	Training Loss: 0.015100 	Validation Loss: 0.015545
Improved Model Score - Updating Best Model Parameters...
Epoch: 8 	Training Loss: 0.014914 	Validation Loss: 0.015542
Improved Model Score - Updating Best Model Parameters...
Epoch: 9 	Training Loss: 0.014761 	Validation Loss: 0.015355
Improved Model Score - Updating Best Model Parameter

In [20]:
best_model.load_state_dict(torch.load('../models/vgg_no_pretraining.pt'))

best_model = best_model.cuda()
best_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_data_loader:
        if train_on_gpu:
            images = images.cuda()
            labels = labels.cuda()
        outputs = best_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
          
    print('Test Accuracy: {} %'.format(100 * correct / total))

Test Accuracy: 26.99044585987261 %
