In [1]:
import pandas as pd

def _read_movie_genres(path):
    """Read a '::'-separated movie table indexed by movieid, with `genre` split into a list."""
    table = pd.read_csv(
        path,
        engine='python',
        sep='::',
        names=['movieid', 'title', 'genre'],
        encoding='latin-1',
        index_col=False,
    )
    table = table.set_index('movieid')
    # The raw genre field is pipe-delimited, e.g. "Sci-Fi|Thriller".
    table['genre'] = table.genre.str.split('|')
    return table


# User demographics: `userid` is kept as the last column and rows keep a plain 0..n-1 index.
users = pd.read_csv(
    'dataset/users.dat',
    sep='::',
    engine='python',
    names=['userid', 'gender', 'age', 'occupation', 'zip'],
)
users = users[['gender', 'age', 'occupation', 'zip', 'userid']]
users = users.astype({'age': 'category', 'gender': 'category', 'occupation': 'category'})

# One row per (user, movie) rating; both id columns are stored as categoricals.
ratings = pd.read_csv(
    'dataset/ratings.dat',
    engine='python',
    sep='::',
    names=['userid', 'movieid', 'rating', 'timestamp'],
)
ratings = ratings.astype({'movieid': 'category', 'userid': 'category'})

movies_train_genres = _read_movie_genres('dataset/movies_train.dat')
movies_test_genres = _read_movie_genres('dataset/movies_test.dat')

movies_train_genres.head()

Unnamed: 0_level_0,title,genre
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
1650,Washington Square (1997),[Drama]
185,"Net, The (1995)","[Sci-Fi, Thriller]"
1377,Batman Returns (1992),"[Action, Adventure, Comedy, Crime]"
3204,"Boys from Brazil, The (1978)",[Thriller]
1901,Dear Jesse (1997),[Documentary]


In [2]:
 # train set genres stats
# Flatten the per-movie genre lists into one Series so each genre occurrence is counted once.
genres = pd.Series([
    name
    for genre_list in movies_train_genres.genre
    for name in genre_list
])
genres.value_counts()


Drama          1294
Comedy          953
Action          413
Thriller        386
Romance         377
Horror          268
Adventure       235
Sci-Fi          228
Children's      203
Crime           180
War             118
Musical         101
Documentary      97
Mystery          88
Animation        84
Fantasy          61
Western          54
Film-Noir        38
Name: count, dtype: int64

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Attach each rating's user demographics, then drop the columns that are not used as features.
user_ratings = (
    ratings
    .merge(users, on='userid', how='left')
    .drop(columns=['zip', 'timestamp'])
)
# One-hot encode the categorical demographics.
user_ratings = pd.get_dummies(user_ratings, columns=['age', 'occupation', 'gender'])

# Rescale ratings to [0, 1] using the min/max seen across all ratings.
scaler = MinMaxScaler()
user_ratings[['rating']] = scaler.fit_transform(user_ratings[['rating']])

# Keep only users that have at least 20 ratings.
user_ratings = user_ratings.groupby('userid').filter(lambda grp: len(grp) >= 20)

# Every column except the two id columns is treated as a user feature.
user_col = [col for col in user_ratings.columns if col not in ('movieid', 'userid')]

user_ratings.head()

Unnamed: 0,userid,movieid,rating,age_1,age_18,age_25,age_35,age_45,age_50,age_56,...,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,gender_F,gender_M
0,1,1193,1.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,661,0.5,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,1,914,0.5,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,1,3408,0.75,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,1,2355,1.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [6]:
# Movie metadata (image path + summary); the `id` and `title` columns are not needed downstream.
_unused_movie_cols = ['id', 'title']
movies_train = pd.read_csv('dataset/movies_train.csv').drop(columns=_unused_movie_cols)
movies_test = pd.read_csv('dataset/movies_test.csv').drop(columns=_unused_movie_cols)

movies_train.head()

Unnamed: 0,movieid,genre,img_path,summary
0,1650,['Drama'],dataset/ml1m-images\1650.jpg,"""Washington Square"" is a 20th-century America..."
1,185,"['Sci-Fi', 'Thriller']",dataset/ml1m-images\185.jpg,"""Net"" is a 20th-century American play by Paul..."
2,1377,"['Action', 'Adventure', 'Comedy', 'Crime']",dataset/ml1m-images\1377.jpg,"""Batman Returns"" is a superhero film directed..."
3,3204,['Thriller'],dataset/ml1m-images\3204.jpg,"""Boys from Brazil"" is a horror thriller film..."
4,1901,['Documentary'],dataset/ml1m-images\1901.jpg,"""Dear Jesse"" is an American comedy drama fil..."


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer 

# Multi-hot encode the genre lists.
# The binarizer is fitted on the TRAIN split only and merely applied to the test split, so both
# splits share the same label columns in the same order. (Re-fitting on the test split would
# silently produce a different column set whenever the test genres differ from the train genres.)
mlb = MultiLabelBinarizer()

train_genre_lists = movies_train_genres.pop('genre')
test_genre_lists = movies_test_genres.pop('genre')

train_onehot = pd.DataFrame(
    mlb.fit_transform(train_genre_lists),
    columns=mlb.classes_,
    index=movies_train_genres.index,
)
test_onehot = pd.DataFrame(
    mlb.transform(test_genre_lists),
    columns=mlb.classes_,
    index=movies_test_genres.index,
)

# Keep only the one-hot columns (indexed by movieid) in the *_genres frames.
movies_train_genres = movies_train_genres.join(train_onehot).drop(columns=['title'])
movies_test_genres = movies_test_genres.join(test_onehot).drop(columns=['title'])

# Replace the string `genre` column of the metadata frames with the one-hot columns.
movies_train = movies_train.drop(columns=['genre']).join(movies_train_genres, on='movieid', how='left')
movies_test = movies_test.drop(columns=['genre']).join(movies_test_genres, on='movieid', how='left')

# Label column names, in the order produced by the binarizer.
y_cols = list(mlb.classes_)
movies_train.head(1)

Unnamed: 0,movieid,img_path,summary,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1650,dataset/ml1m-images\1650.jpg,"""Washington Square"" is a 20th-century America...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Hugging Face checkpoint used to turn a movie summary into a feature vector.
MODEL_NAME = "nickmuchi/distilroberta-base-movie-genre-prediction"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# NOTE: this rebinds the imported `pipeline` factory to the pipeline instance it returns;
# the `embeddings` helper in the next cell calls that instance.
# Alternative task kept for reference:
# pipeline = pipeline('text-classification', model=model, tokenizer=tokenizer, top_k=None)
pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

# Smoke test on a single sentence.
res = pipeline("I really liked the movie, it was great!")
res

In [None]:
from tqdm import tqdm
import numpy as np

def embeddings(summary):
    """Run `summary` through the notebook-level `pipeline` and return its first output entry as an array."""
    first_entry = pipeline(summary)[0]
    return np.array(first_entry)

# Embed every summary, in frame order, and stack the results into one array per split.
# (The train accumulator used to be a dict, which has no `.append` and raised AttributeError.)
summary_embeddings_train = np.array(
    [embeddings(summary) for summary in tqdm(movies_train.summary)]
)

summary_embeddings_test = np.array(
    [embeddings(summary) for summary in tqdm(movies_test.summary)]
)


In [8]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Pre-computed summary embeddings, one row per movie in the matching metadata frame.
summary_embeddings_train = np.load('./dataset/mistral_rating_db/summary_embeddings_train.npy')
summary_embeddings_test = np.load('./dataset/mistral_rating_db/summary_embeddings_test.npy')

# Standardise with statistics estimated on the train split only.
scaler = StandardScaler()
summary_embeddings_train = scaler.fit_transform(summary_embeddings_train)
summary_embeddings_test = scaler.transform(summary_embeddings_test)

# movieid -> embedding row. The frame's index label is used as the row number of the matrix.
idx2summary = {}
for frame, matrix in (
    (movies_train, summary_embeddings_train),
    (movies_test, summary_embeddings_test),
):
    for row_label, movie_id in frame.movieid.items():
        idx2summary[movie_id] = matrix[row_label]

print(len(idx2summary))

3883


In [9]:
# One row per (movie, rating): movie features/labels joined with the rating user's features.
# Movies without any retained rating drop out because of the inner join.
train_df = pd.merge(movies_train, user_ratings, on='movieid', how='inner')
test_df = pd.merge(movies_test, user_ratings, on='movieid', how='inner')

(len(train_df), len(test_df))

(817424, 182785)

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import os
import cv2

class ImdbDataset(Dataset):
    """Row-wise dataset over the notebook-level ``train_df`` / ``test_df`` frames.

    Each item is a ``(user, summary, image, genre)`` tuple of float32 tensors that
    have already been moved to ``self.device``:

    * ``user``    - values of the ``user_col`` columns for that row,
    * ``summary`` - the embedding stored in ``idx2summary`` for the row's ``movieid``,
    * ``image``   - the poster resized to 256x256 and laid out channel-first, or
      uniform random noise when the file is missing or cannot be decoded,
    * ``genre``   - values of the label columns listed in ``dataset/genres.txt``.

    NOTE(review): tensors are moved to the device inside ``__getitem__``; with a CUDA
    device this presumably prevents using ``DataLoader(num_workers>0)`` - confirm
    before enabling worker processes.
    """

    def __init__(self, is_train=True):
        """Bind the train (``is_train=True``) or test frame and load the label column names."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.data = train_df if is_train else test_df
        self.idx2summary = idx2summary
        self.user_col = user_col

        # One label column name per line. Surrounding whitespace (including '\r' from
        # CRLF files) is stripped and blank lines are skipped, so a trailing newline
        # cannot turn into an empty column name.
        with open('dataset/genres.txt', 'r') as f:
            self.genre_all = [line.strip() for line in f if line.strip()]

    def _columns_to_tensor(self, index, columns):
        """Return the values of `columns` at row label `index` as a float tensor on the device."""
        values = np.array(self.data.loc[index, columns].values.tolist())
        return torch.from_numpy(values).float().to(self.device)

    def _load_image(self, img_path):
        """Return a (256, 256, 3) array for `img_path`, or random noise if it cannot be read.

        The stored paths may use a backslash separator; when the path does not exist
        as written, a forward-slash variant is tried as well so the same metadata
        works on POSIX systems.
        """
        candidates = []
        if isinstance(img_path, str):
            candidates.append(img_path)
            normalized = img_path.replace('\\', '/')
            if normalized != img_path:
                candidates.append(normalized)

        for path in candidates:
            if os.path.exists(path):
                img = cv2.imread(path)
                # cv2.imread signals an unreadable/corrupt file by returning None.
                if img is not None:
                    return cv2.resize(img, (256, 256))

        # Fallback placeholder, already at the target size.
        return np.random.rand(256, 256, 3)

    def __getitem__(self, index):
        """Build the ``(user, summary, image, genre)`` tensors for row label `index`."""
        genre_tensor = self._columns_to_tensor(index, self.genre_all)

        # Pre-computed text embedding looked up by movie id.
        summary = self.idx2summary[self.data.loc[index, 'movieid']]
        summary_tensor = torch.from_numpy(summary).float().to(self.device)

        # Image: height x width x channel -> channel x height x width.
        img = self._load_image(self.data.loc[index, 'img_path'])
        img_tensor = torch.from_numpy(img.transpose(2, 0, 1)).float().to(self.device)

        user_tensor = self._columns_to_tensor(index, self.user_col)

        return user_tensor, summary_tensor, img_tensor, genre_tensor

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.data)

In [11]:
from torch.utils.data import RandomSampler
from torch.utils.data import Subset

BATCH_SIZE = 50

train_set = ImdbDataset(is_train=True)
final_test_set = ImdbDataset(is_train=False)

# Fixed evaluation subset of the test rows, loaded from disk so every run scores the same rows.
# It was originally drawn with:
# choose_idx = np.random.choice(len(final_test_set), 10000, replace=False)
choose_idx = np.load('dataset/mistral_rating_db/choose_idx.npy')
test_set = Subset(final_test_set, choose_idx)

# Each pass over train_loader draws 150000 random training rows rather than the full frame.
sampler_train = RandomSampler(train_set, num_samples=150000)

train_loader = DataLoader(train_set, sampler=sampler_train, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, shuffle=False, batch_size=BATCH_SIZE)
final_test_loader = DataLoader(final_test_set, shuffle=False, batch_size=BATCH_SIZE)
print(len(train_loader), len(test_loader))

4688 313


In [12]:
# Peek at one training batch to confirm the tensor shapes fed to the model.
for user, summary, img, genre in train_loader:
    print(user.shape, summary.shape, img.shape, genre.shape)
    break

# Number of genre labels = number of non-blank lines in the genre list.
# (Counting raw lines would include a stray blank/trailing line and over-size the output layer.)
with open('./dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
num_classes = sum(1 for line in genre_all if line.strip())
num_classes

torch.Size([32, 31]) torch.Size([32, 10]) torch.Size([32, 3, 256, 256]) torch.Size([32, 18])


18

In [13]:
from torch import nn

class BaseModel(nn.Module):
    """Multi-input genre classifier: small CNN over the image + MLP over all features.

    The image branch is two conv/ReLU/max-pool stages followed by a linear projection.
    Its output is concatenated with the user features and the summary embedding and
    passed through a three-layer MLP that returns one raw score (logit) per class;
    no activation is applied to the final layer.

    Args:
        num_classes: number of output scores.
        user_dim: width of the user feature vector (default 31).
        summary_dim: width of the summary embedding (default 10).
        img_size: height/width of the square input image (default 256).
        img_feat_dim: width of the image branch output (default 128).

    The defaults reproduce the original fixed architecture (169-wide classifier
    input, 64*64*64 flattened image features), so existing calls of the form
    ``BaseModel(num_classes)`` are unaffected.

    Raises:
        ValueError: if ``img_size`` is too small to survive the two 2x2 poolings.
    """

    def __init__(self, num_classes, user_dim=31, summary_dim=10, img_size=256, img_feat_dim=128):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = num_classes

        # Two stride-2 poolings shrink each spatial side by a factor of 4 (floored).
        pooled_side = img_size // 4
        if pooled_side < 1:
            raise ValueError(f'img_size must be at least 4, got {img_size}')

        # Image branch: (3, img_size, img_size) -> img_feat_dim features.
        self.img_model = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Flatten(),
            nn.Linear(64 * pooled_side * pooled_side, img_feat_dim),
        )

        # Fusion MLP over [user | summary | image] features.
        self.classifier = nn.Sequential(
            nn.Linear(img_feat_dim + summary_dim + user_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, self.num_classes),
        )

    def forward(self, user, summary, img):
        """Return per-class scores of shape (batch, num_classes).

        `user`, `summary` and `img` are batched tensors; `img` is channel-first.
        """
        img = self.img_model(img)
        x = torch.cat((user, summary, img), dim=1)
        x = self.classifier(x)
        return x
    
# Build the classifier and place it on the device the model selected for itself.
model = BaseModel(num_classes)
model = model.to(model.device)
model

BaseModel(
  (img_model): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=262144, out_features=128, bias=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=169, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=18, bias=True)
    (5): Sigmoid()
  )
)

In [14]:
import torch.optim as optim

# A movie can carry several genres at once, so each label is an independent binary decision.
# BCEWithLogitsLoss applies a per-label sigmoid to the model's raw scores, matching the
# multi-label metrics (sigmoid + 0.5 threshold) used for evaluation. CrossEntropyLoss would
# instead apply a softmax across labels, i.e. treat the genres as mutually exclusive.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
from torchmetrics.classification import MultilabelF1Score, MultilabelPrecision, MultilabelRecall

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_labels = 18


def _evaluate(model, loader):
    """Score `model` on every batch of `loader` and return (f1, precision, recall).

    All three are macro-averaged multi-label metrics with a 0.5 decision threshold.
    Predictions and targets are accumulated over the WHOLE loader and the metrics are
    computed once at the end. Averaging per-batch macro scores instead would be biased:
    a label that happens to be absent from a batch contributes a score of 0 for that batch.
    """
    model.eval()
    metrics = [
        metric_cls(num_labels=num_labels, average='macro', threshold=0.5).to(device)
        for metric_cls in (MultilabelF1Score, MultilabelPrecision, MultilabelRecall)
    ]

    with torch.no_grad():
        for user, summary, img, genre in tqdm(loader, total=len(loader)):
            user, summary, img, genre = user.to(device), summary.to(device), img.to(device), genre.to(device)
            outputs = model(user, summary, img)

            # The metrics expect integer 0/1 targets; the dataset yields them as floats.
            target = genre.int()
            for metric in metrics:
                metric.update(outputs, target)

    f1_all, precision_all, recall_all = (metric.compute() for metric in metrics)
    return f1_all, precision_all, recall_all


def final_test(model):
    """Evaluate `model` on the full test loader; returns (f1, precision, recall)."""
    return _evaluate(model, final_test_loader)


def test(model):
    """Evaluate `model` on the fixed test subset loader; returns (f1, precision, recall)."""
    return _evaluate(model, test_loader)

18


In [None]:
from tqdm import tqdm
from torchmetrics.classification import MultilabelF1Score

# NOTE: 'test_loss' is declared but never filled in; only metrics are computed on the test subset.
history = {'train_loss': [], 'test_loss': []}

num_epochs = 20
# Make sure the checkpoint directory exists before the first (expensive) epoch finishes.
os.makedirs('./model', exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for idx, (user, summary, img, genre) in tqdm(enumerate(train_loader), total=len(train_loader)):
        user, summary, img, genre = user.to(device), summary.to(device), img.to(device), genre.to(device)
        optimizer.zero_grad()
        outputs = model(user, summary, img)
        loss = criterion(outputs, genre)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean loss over the epoch's batches (a single last-batch loss is too noisy to track).
    train_loss = running_loss / len(train_loader)
    f1_all, precision_all, recall_all = test(model)

    history['train_loss'].append(train_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, train_loss: {train_loss:.4f}, f1: {f1_all:.4f}, precision: {precision_all:.4f}, recall: {recall_all:.4f}')

    # Checkpoint every epoch; the last epoch is stored under a fixed name.
    is_last_epoch = epoch == num_epochs - 1
    checkpoint_path = './model/model_final.pth' if is_last_epoch else f'./model/model_{epoch+1}.pth'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        }, checkpoint_path)

    if is_last_epoch:
        # Report on the full test set, printing the full-set metrics (not the subset ones above).
        f1_final, precision_final, recall_final = final_test(model)
        print(f'f1_final: {f1_final:.4f}, precision_final: {precision_final:.4f}, recall_final: {recall_final:.4f}')

In [None]:
# train from checkpoint
from tqdm import tqdm
from torchmetrics.classification import MultilabelF1Score

# Use the label count computed from the genre list rather than a batch tensor left over
# from an earlier cell.
num_labels = num_classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

history = {'train_loss': [], 'test_loss': []}

f1 = MultilabelF1Score(num_labels=num_labels, threshold=0.5, average='macro')
f1 = f1.to(device)

# Number of ADDITIONAL epochs to train on top of the checkpoint.
num_epochs = 20

model = BaseModel(num_classes)
model.to(model.device)

# The optimizer must be rebuilt for the new model: an optimizer created earlier still points
# at the previous model's parameters and would never update this one.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
checkpoint = torch.load('./model/model_20.pth', map_location=model.device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
done_epoch = checkpoint['epoch']

# Epoch indices continue after the checkpoint: done_epoch+1 ... done_epoch+num_epochs.
last_epoch = done_epoch + num_epochs
for epoch in range(done_epoch + 1, last_epoch + 1):
    # test() leaves the model in eval mode, so switch back at the start of every epoch.
    model.train()
    running_loss = 0.0
    for idx, (user, summary, img, genre) in tqdm(enumerate(train_loader), total=len(train_loader)):
        user, summary, img, genre = user.to(device), summary.to(device), img.to(device), genre.to(device)
        optimizer.zero_grad()
        outputs = model(user, summary, img)
        loss = criterion(outputs, genre)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean loss over the epoch's batches.
    train_loss = running_loss / len(train_loader)
    f1_all, precision_all, recall_all = test(model)

    history['train_loss'].append(train_loss)

    print(f'Epoch {epoch+1}/{last_epoch+1}, train_loss: {train_loss:.4f}, f1: {f1_all:.4f}, precision: {precision_all:.4f}, recall: {recall_all:.4f}')

    is_last_epoch = epoch == last_epoch
    checkpoint_path = './model/model_final.pth' if is_last_epoch else f'./model/model_{epoch+1}.pth'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        }, checkpoint_path)

    if is_last_epoch:
        # Report on the full test set, printing the full-set metrics (not the subset ones above).
        f1_final, precision_final, recall_final = final_test(model)
        print(f'f1_final: {f1_final:.4f}, precision_final: {precision_final:.4f}, recall_final: {recall_final:.4f}')

In [3]:
from torchmetrics.classification import MultilabelF1Score, MultilabelPrecision, MultilabelRecall
import torch

# Scratch check of the per-label (average=None) multi-label F1 on one hand-written sample.
f1 = MultilabelF1Score(num_labels=18, threshold=0.5, average=None)

# The same multi-hot row in two dtypes: int is used as the target, float32 as the prediction.
label_row = [[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
genre = torch.tensor(label_row, dtype=torch.int)
genre2 = torch.tensor(label_row, dtype=torch.float32)

# Example raw scores squashed to probabilities. They are only printed;
# the metric call at the end of the cell does not use them.
raw_scores = [[  3.4125, -13.5446,  -9.9298, -23.1688,  -8.2869,   3.4333, -29.4170,
                 3.4405, -24.3741, -38.2775,  -7.8598,  -6.5229, -45.0003, -32.6533,
                 3.4697, -25.3200, -23.9719,  -5.3334]]
test_output = torch.sigmoid(torch.tensor(raw_scores))
print(test_output)

f1(genre2, genre)

tensor([[9.6809e-01, 1.3112e-06, 4.8699e-05, 8.6680e-11, 2.5173e-04, 9.6873e-01,
         1.6763e-13, 9.6895e-01, 2.5969e-11, 2.3784e-17, 3.8580e-04, 1.4672e-03,
         2.8617e-20, 6.5895e-15, 9.6981e-01, 1.0085e-11, 3.8827e-11, 4.8044e-03]])
tensor([[0.4001, 0.4520, 0.3636, 0.5605, 0.7180, 0.6099, 0.7752, 0.4662, 0.0762,
         0.6145, 0.6728, 0.3473, 0.4396, 0.4386, 0.8174, 0.6304, 0.5469, 0.7703],
        [0.9111, 0.5869, 0.4164, 0.7474, 0.3386, 0.7987, 0.3209, 0.6175, 0.1389,
         0.2794, 0.4467, 0.3948, 0.4827, 0.3713, 0.5597, 0.6622, 0.3100, 0.3942]])
