# Step 5. Predicting probabilities with the model

We use the trained CatBoost model to predict the target probabilities for every image–candidate pair in our pipeline.

## 0. Imports

In [2]:
#import necessary libraries
import pandas as pd
import json
import torch
import numpy as np
from collections import defaultdict
import os
from sklearn.metrics.pairwise import paired_distances
import random
from PIL import Image
from catboost import CatBoostClassifier, Pool, cv
from sklearn.preprocessing import StandardScaler

## 1. Helper classes

### 1.1. Dataset class: it is responsible for all operations with dataset

In [3]:
def to_1D(series):
    """Flatten an iterable of iterables into one flat list, preserving order."""
    flattened = []
    for inner in series:
        flattened.extend(inner)
    return flattened

In [4]:
class Dataset:
    """Loader for JSON-lines feature files (one JSON object per line)."""

    def __init__(self):
        pass

    @staticmethod
    def _read_json_lines(json_file, keys):
        """Return one tuple of the ``keys`` values per non-blank line of ``json_file``.

        Each line is parsed exactly once. Blank lines (for example a trailing
        newline at the end of the file) are skipped instead of raising a
        ``json.JSONDecodeError``.
        """
        rows = []
        with open(json_file, 'r') as file:
            for line in file:
                if not line.strip():
                    continue
                record = json.loads(line)
                rows.append(tuple(record[key] for key in keys))
        return rows

    @staticmethod
    def _convert(feature, reshaping, tensor):
        """Convert one raw feature value to the requested container.

        ``reshaping`` takes precedence over ``tensor``; when both are false the
        value is returned untouched.
        """
        if reshaping:
            return np.array(feature).reshape(1, -1)
        if tensor:
            return torch.FloatTensor(feature).unsqueeze(dim=0)
        return feature

    def json_file_opening(self, json_file, img, feature_1, feature_2,
                          dictionary=False, reshaping=False, tensor=False):
        """Read an identifier and two features from every line of ``json_file``.

        Parameters
        ----------
        json_file : path of the JSON-lines file.
        img, feature_1, feature_2 : keys to read from each JSON object.
        dictionary : if false, return three parallel lists; if true, return
            two dicts keyed by the ``img`` value.
        reshaping : (dictionary mode) store values as NumPy arrays of shape (1, -1).
        tensor : (dictionary mode, only if ``reshaping`` is false) store values
            as ``torch.FloatTensor`` with a leading dimension of size 1.

        Returns
        -------
        ``(img_list, feature_1_list, feature_2_list)`` or
        ``(dict_1, dict_2)``. If an identifier occurs more than once, the
        last occurrence wins in the dicts.

        Raises
        ------
        KeyError if a line lacks one of the requested keys;
        json.JSONDecodeError if a non-blank line is not valid JSON.
        """
        rows = self._read_json_lines(json_file, (img, feature_1, feature_2))
        if not dictionary:
            img_list = [row[0] for row in rows]
            feature_1_list = [row[1] for row in rows]
            feature_2_list = [row[2] for row in rows]
            return img_list, feature_1_list, feature_2_list

        dict_1_list, dict_2_list = {}, {}
        for name, value_1, value_2 in rows:
            dict_1_list[name] = self._convert(value_1, reshaping, tensor)
            dict_2_list[name] = self._convert(value_2, reshaping, tensor)
        return dict_1_list, dict_2_list

    def json_file_opening_one_sample(self, json_file, attr_1, attr_2, dictionary=False, reshaping=False, tensor=False):
        """Read two attributes from every line of ``json_file``.

        ``attr_1`` is used as the identifier and ``attr_2`` as the feature.
        Returns ``(img_list, feature_list)`` when ``dictionary`` is false,
        otherwise a single dict mapping each ``attr_1`` value to its
        (optionally converted) ``attr_2`` value. ``reshaping`` and ``tensor``
        behave as in ``json_file_opening``.
        """
        rows = self._read_json_lines(json_file, (attr_1, attr_2))
        if not dictionary:
            img_list = [row[0] for row in rows]
            feature_list = [row[1] for row in rows]
            return img_list, feature_list

        dict_list = {}
        for name, value in rows:
            dict_list[name] = self._convert(value, reshaping, tensor)
        return dict_list

### 1.2. Analytics class: it is responsible for all analytics

In [5]:
class Analytics:
    """Pairwise metrics and top-n candidate selection for image/candidate dataframes."""

    def __init__(self):
        pass

    def metrics_count(self, dataframe):
        """Add cosine-similarity and L2-distance columns for CLIP and BERT vectors.

        Reads the columns ``img_clip_1``, ``img_clip_2``, ``img_bert_1`` and
        ``img_bert_2`` (each cell a torch tensor) and returns a copy of
        ``dataframe`` with ``clip_cossim``, ``clip_l2``, ``bert_cossim`` and
        ``bert_l2`` added. The input dataframe is not modified.

        NOTE(review): the notebook calls this method but did not define it.
        This implementation follows the notebook's description ("cosine
        similarity and L2 distance") and the column names used downstream;
        confirm it matches the feature computation used when the CatBoost
        model was trained.
        """
        result = dataframe.copy()
        for embedding in ('clip', 'bert'):
            first = result[f'img_{embedding}_1']
            second = result[f'img_{embedding}_2']
            cossim, l2 = [], []
            for left, right in zip(first, second):
                # Flatten so that both (1, dim) and (dim,) tensors are accepted.
                left_flat, right_flat = left.flatten(), right.flatten()
                cossim.append(
                    torch.nn.functional.cosine_similarity(left_flat, right_flat, dim=0).item())
                l2.append(torch.dist(left_flat, right_flat, p=2).item())
            result[f'{embedding}_cossim'] = cossim
            result[f'{embedding}_l2'] = l2
        return result

    def _top_candidates(self, dataframe, score_column, n, largest, output_path):
        """Pick the ``n`` best candidates per image, write them as JSON lines, return them."""
        # A fresh 0..N-1 index makes the labels returned by nlargest/nsmallest
        # usable as positions for .iloc below.
        nearest_df = dataframe.reset_index(drop=True)
        scores = nearest_df.groupby(['image'])[score_column]
        grouping = scores.nlargest(n) if largest else scores.nsmallest(n)

        candidate_column = nearest_df['candidate']
        candidates_dict = defaultdict(list)
        # The result index is (image, row position), ordered best-first per image.
        for img, position in grouping.index:
            candidates_dict[img].append(candidate_column.iloc[position])

        # Write the whole file in one pass instead of reopening it per image.
        with open(output_path, 'w') as file:
            for img, cands in candidates_dict.items():
                json.dump({'img': img, 'candidates': cands}, file)
                file.write('\n')
        return candidates_dict

    def candidates(self, dataframe, embedding, metric, n, output_path=None):
        """Return the ``n`` nearest candidates per image for one embedding metric.

        Parameters
        ----------
        dataframe : frame with ``image``, ``candidate`` and
            ``f'{embedding}_{metric}'`` columns.
        embedding : column prefix, e.g. ``'clip'`` or ``'bert'``.
        metric : ``'cossim'`` (largest values are nearest) or ``'l2'``
            (smallest values are nearest).
        n : number of candidates to keep per image.
        output_path : where to write the JSON-lines result. Defaults to
            ``image_candidates_{n}_{metric}.json``. That default does not
            include ``embedding``, so calls for different embeddings with the
            same ``n`` and ``metric`` overwrite each other unless a distinct
            ``output_path`` is passed.

        Returns
        -------
        ``defaultdict(list)`` mapping each image to its candidates, best first.

        Raises
        ------
        ValueError if ``metric`` is neither ``'cossim'`` nor ``'l2'``.
        """
        if metric not in ('cossim', 'l2'):
            raise ValueError(f"metric must be 'cossim' or 'l2', got {metric!r}")
        if output_path is None:
            output_path = f'image_candidates_{n}_{metric}.json'

        score_column = f'{embedding}_{metric}'
        # Drop self-pairs and pairs whose score is exactly 1.
        # NOTE(review): a score of 1 marks identical vectors for cosine
        # similarity; for 'l2' identical vectors give 0, so this filter is
        # probably not what is intended for that metric — confirm.
        keep = (dataframe['image'] != dataframe['candidate']) & (dataframe[score_column] != 1)
        return self._top_candidates(dataframe[keep], score_column, n,
                                    largest=(metric == 'cossim'),
                                    output_path=output_path)

    def candidates_model(self, dataframe, n, output_path=None):
        """Return the ``n`` candidates per image with the highest ``pred_1`` value.

        The result is also written as JSON lines to ``output_path``
        (default ``image_model_candidates_{n}.json``). No rows are filtered
        out before ranking.
        """
        if output_path is None:
            output_path = f'image_model_candidates_{n}.json'
        return self._top_candidates(dataframe, 'pred_1', n,
                                    largest=True, output_path=output_path)

## 2. Dataset preparation

We load the CSV file with the paired images for the clusters

In [7]:
# Load the image/candidate pairs; the cells below read the 'image' and 'candidate' columns.
data = pd.read_csv('paired_images_texts_clusters.csv')

We look up the CLIP and BERT vectors for each image and candidate file name

In [10]:
dataset = Dataset()
# Two dicts keyed by image name; each value is a FloatTensor with a leading
# dimension of size 1 (tensor=True).
img_clip, img_bert = dataset.json_file_opening('clip_bert.json',
                       'img', 'clip', 'bert',
                       dictionary=True, tensor=True, reshaping=False)

# dict.get returns None for unknown images, which would only surface later as
# an opaque AttributeError. Fail here with a clear message instead.
known_images = img_clip.keys() & img_bert.keys()
missing_images = (set(data['image']) | set(data['candidate'])) - known_images
if missing_images:
    raise KeyError(
        f'{len(missing_images)} image(s) have no CLIP/BERT vector in clip_bert.json, '
        f'e.g. {list(missing_images)[:5]}'
    )

# Suffix _1 is the vector of the 'image' column, _2 of the 'candidate' column.
data['img_clip_1'] = data['image'].apply(img_clip.get)
data['img_clip_2'] = data['candidate'].apply(img_clip.get)
data['img_bert_1'] = data['image'].apply(img_bert.get)
data['img_bert_2'] = data['candidate'].apply(img_bert.get)

We compute the metrics: cosine similarity and L2 distance between the CLIP vectors and between the BERT vectors of each pair

In [None]:
analytics = Analytics()
# metrics_count must return the dataframe with the clip_cossim, clip_l2,
# bert_cossim and bert_l2 columns that the cells below select.
# NOTE(review): confirm that the Analytics class defined in section 1.2
# provides metrics_count before running this cell on a fresh kernel.
data = analytics.metrics_count(data)

We reshape the CLIP and BERT vectors into 1D arrays

In [None]:
# Convert every stored tensor to a flat NumPy array:
# 512 values for the CLIP columns, 768 for the BERT columns.
for column, size in (('img_clip_1', 512), ('img_clip_2', 512),
                     ('img_bert_1', 768), ('img_bert_2', 768)):
    data[column] = data[column].apply(lambda x, size=size: x.numpy().reshape(size))

We split the dataframe into two:
1. data_images - for searching candidates for images
2. data_pipe - to predict the probabilities of the target using the CatBoost model

In [21]:
# .copy() gives each frame its own data, so the column assignments made on
# data_pipe (below) and on data_images (prediction columns, later) do not
# trigger SettingWithCopyWarning or depend on pandas' view/copy behaviour.
data_images = data[['image', 'candidate', 'clip_cossim', 'clip_l2', 'bert_cossim', 'bert_l2']].copy()
data_pipe = data[['img_clip_1', 'img_clip_2', 'img_bert_1', 'img_bert_2', 'clip_cossim', 'clip_l2', 'bert_cossim', 'bert_l2']].copy()
sc = StandardScaler()
scaled_features = ['clip_cossim', 'clip_l2', 'bert_cossim', 'bert_l2']
# NOTE(review): the scaler is fitted on this inference data. If the model was
# trained on features scaled with a different (training-set) scaler, the
# inputs here will not be on the same scale — confirm this is intended.
data_pipe[scaled_features] = sc.fit_transform(data_pipe[scaled_features])

## 3. Predicting target probabilities

We predict the probabilities of the target

In [23]:
# Load the saved CatBoost classifier from catboost_model.bin.
catboost = CatBoostClassifier()
catboost.load_model('catboost_model.bin')
# One result row per row of data_pipe; the next cell reads element 0 as
# P(y=0) and element 1 as P(y=1).
prediction_proba = catboost.predict_proba(data_pipe)

<catboost.core.CatBoostClassifier at 0x3ec7b9190>

We create two columns for P(y=0) and P(y=1)

In [None]:
# Split each prediction row into its two class probabilities.
pred_0_proba, pred_1_proba = [], []
for class_probabilities in prediction_proba:
    pred_0_proba.append(class_probabilities[0])
    pred_1_proba.append(class_probabilities[1])

# Attach them to the candidate-search frame as P(y=0) and P(y=1).
data_images['pred_0'] = pred_0_proba
data_images['pred_1'] = pred_1_proba

## 4. Find the top 5 candidates by model probability, CLIP similarity and BERT similarity

In [None]:
# Number of candidates kept per image in every ranking below.
N_CANDIDATES = 5

# Ranking by the model's P(y=1).
candidates_model = analytics.candidates_model(data_images, N_CANDIDATES)

# Ranking by cosine similarity of the CLIP and BERT vectors.
# Both calls use the same n and metric, and the output file name is built
# only from those two values, so the BERT call overwrites the JSON file
# written by the CLIP call; the returned dicts are unaffected.
candidates_clip = analytics.candidates(data_images, 'clip', 'cossim', N_CANDIDATES)
candidates_bert = analytics.candidates(data_images, 'bert', 'cossim', N_CANDIDATES)