In [None]:
from flask import Flask
from flask import request

import requests

from io import BytesIO
import json

from PIL import Image
import pytesseract
from Levenshtein import distance

import os
import numpy as np
from sklearn.cluster import DBSCAN

import matplotlib.patches as patches

In [None]:
from flask_sqlalchemy import SQLAlchemy
from flask import Flask

from sqlalchemy.ext.declarative import DeclarativeMeta
import json

class AlchemyEncoder(json.JSONEncoder):
    """JSON encoder that flattens SQLAlchemy declarative model instances into dicts."""

    def default(self, obj):
        """Encode a model instance as a dict; defer to the base encoder otherwise.

        Every name from ``dir(obj)`` that does not start with ``_`` and is not
        ``metadata`` is probed with ``json.dumps``; values that raise
        ``TypeError`` are stored as ``None``.
        """
        if not isinstance(obj.__class__, DeclarativeMeta):
            return json.JSONEncoder.default(self, obj)

        encoded = {}
        for name in dir(obj):
            if name.startswith('_') or name == 'metadata':
                continue
            value = obj.__getattribute__(name)
            try:
                # Probe only: fails on non-encodable values such as other classes.
                json.dumps(value)
            except TypeError:
                encoded[name] = None
            else:
                encoded[name] = value
        return encoded
    

# Flask application plus Flask-SQLAlchemy handle used by the models below.
# The database is a SQLite file addressed relative to the working directory.
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///for_docker/api/test_certs.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)

class Сertificate(db.Model):
    """A certificate image found in a post, with its OCR text and bounding boxes.

    NOTE(review): the first letter of the class name appears to be a Cyrillic
    'С' rather than a Latin 'C' — confirm before renaming or importing it.
    """
    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(db.Integer)       # id of the Cluster row this certificate was assigned to
    image_url = db.Column(db.String(255))
    text_from_image = db.Column(db.Text)     # full OCR text of the image
    bbs = db.Column(db.JSON)                 # bounding boxes stored as JSON
    text_blocks = db.Column(db.JSON)
    user_id = db.Column(db.Integer)
    post_id = db.Column(db.Integer)
    session_id = db.Column(db.Integer, default=-1)

    # Must be a real tuple: ('text_from_image') without the trailing comma is
    # just a str, and iterating it yields single characters, not column names.
    __fulltext_columns__ = ('text_from_image',)


class Session_has_certs(db.Model):
    """Link table pairing a session id with a certificate id."""
    id = db.Column(db.Integer, primary_key=True)
    id_session = db.Column(db.Integer)
    id_certificate = db.Column(db.Integer)
    # Each (session, certificate) pair may be stored only once.
    __table_args__ = (db.UniqueConstraint('id_session', 'id_certificate', name='_session_certificate_uc'),)


class Cluster(db.Model):
    """A cluster of certificates, with its bounding boxes and a display name."""
    id = db.Column(db.Integer, primary_key=True)
    bbs = db.Column(db.JSON)  # bounding boxes stored as JSON
    cluster_name = db.Column(db.String(100))


class Session(db.Model):
    """An analysis session: name, status string and a JSON payload."""
    id = db.Column(db.Integer, primary_key=True)
    session_name = db.Column(db.String(100))
    status = db.Column(db.String(100))  # e.g. 'in_queue' (the value used by id_from_queue)
    data = db.Column(db.JSON)           # read back as the list of ids by id_from_queue


class Users(db.Model):
    """A user record holding a token and a VK id."""
    id = db.Column(db.Integer, primary_key=True)
    token = db.Column(db.String(200))
    vk_id = db.Column(db.String(100))
    
# Create the tables for all models declared above.
# NOTE(review): newer Flask-SQLAlchemy releases presumably need an app context here — confirm.
db.create_all()

In [None]:
from sqlalchemy_fulltext import FullText, FullTextSearch

In [None]:
# User ids are the file-name stems (text before the first '.') of everything in saved_data/.
user_ids = [file_name.split('.')[0] for file_name in os.listdir('saved_data')]

In [None]:
# Ad-hoc dump of every row in the three tables.
print(Сertificate.query.all())
print(Cluster.query.all())
print(Session.query.all())

In [None]:
# Round-trip all certificates through AlchemyEncoder and show the OCR text of the first one.
items = Сertificate.query.all()
json.loads(json.dumps(items, cls=AlchemyEncoder))[0]['text_from_image']

In [None]:
import flask_whooshalchemy
# Full-text search experiment.
# NOTE(review): the first argument is the literal 'text_from_image', which looks like a column name rather than a search phrase — confirm.
Сertificate.query.filter(FullTextSearch('text_from_image', Сertificate)).all()

In [None]:
import requests
import json

# Submit the first ten user ids to the remote /api/analyze endpoint as a JSON body.
# NOTE(review): the server address is hard-coded.
headers = {'Content-type': 'application/json',
            'Accept': 'text/plain',
            'Content-Encoding': 'utf-8'}
data = {'ids': user_ids[:10], 'session_name': 'session_server_test_01'}

requests.post('http://80.89.204.142:14289/api/analyze', data=json.dumps(data), headers=headers).text#.json()

In [None]:
# Poll the remote service for its status.
requests.get('http://80.89.204.142:14289/api/get_status').json()

In [None]:
# Fetch all clusters from the remote service; the payload sits under the 'response' key.
clusters = requests.get('http://80.89.204.142:14289/api/get_all_clusters').json()['response']

In [None]:
import matplotlib.pylab as plt
from PIL import Image
from io import BytesIO

def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as a PIL image.

    Args:
        image_url: URL of the image.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: on network failure or a non-2xx reply, so an
            error page is never handed to PIL as image data.
    """
    res = requests.get(image_url, timeout=timeout)
    res.raise_for_status()
    image = Image.open(BytesIO(res.content))
    return image

cluster_ind = 0

# Widest row of the grid: the largest number of certificates in any cluster.
max_posts_in_cluster = max([len(current_cluster['certificates']) for current_cluster in clusters])

# squeeze=False keeps `axis` two-dimensional even with a single cluster or a
# single column; otherwise axis[i][j] below fails on a 1-D array or bare Axes.
fig, axis = plt.subplots(len(clusters), max_posts_in_cluster,
                         figsize=(10, 10*((len(clusters)//max_posts_in_cluster)+1)),
                         squeeze=False)

# One row per cluster, one column per certificate image.
for i in range(len(clusters)):
    for j in range(len(clusters[i]['certificates'])):
        img_url = clusters[i]['certificates'][j]['image_url']
        img = download_image(img_url)

        axis[i][j].imshow(img)

plt.show()

In [None]:
import json

# Load the cluster dump from test_res.json (the file name written by a later cell of this notebook).
with open('test_res.json') as json_file:
    data = json.load(json_file)

def writeWordToDict(word, my_dict, id, cluster_id):
    """Insert ``word`` (lower-cased) into the nested-dict trie ``my_dict``.

    The node reached after the last letter holds a list under ``'_end_'``; a
    ``{"certificate_id": id, "cluster_id": cluster_id}`` entry is appended to
    it unless an equal entry is already present. Returns the trie root, which
    is the same object that was passed in.
    """
    end_key = '_end_'
    node = my_dict
    for ch in word.lower():
        node = node.setdefault(ch, {})

    entry = {"certificate_id": id, "cluster_id": cluster_id}
    bucket = node.get(end_key)
    if bucket is None:
        bucket = []
        node[end_key] = bucket
    if entry not in bucket:
        bucket.append(entry)
    return my_dict

def search(tree, word):
    """Look up ``word`` (case-insensitively) in a trie built by writeWordToDict.

    Returns the list stored under ``'_end_'`` for an exact match, or ``False``
    when the word is absent. That includes a word which is only a prefix of
    stored words: such a node has no ``'_end_'`` entry, and indexing it
    directly used to raise ``KeyError``.
    """
    _end = '_end_'
    current_dict = tree
    for letter in word.lower():
        if letter not in current_dict:
            return False
        current_dict = current_dict[letter]
    return current_dict.get(_end, False)

def prepare(clusters):
    """Build a word trie from the bounding-box texts of every certificate.

    ``clusters`` is an iterable of dicts with ``'cluster_id'`` and
    ``'certificates'``; each certificate has ``'id'`` and ``'bbs'``, and each
    bounding box has ``'text'``. Every text is inserted through writeWordToDict.
    """
    tree = {}
    for cluster in clusters:
        cluster_id = cluster.get('cluster_id')
        for certificate in cluster.get('certificates'):
            cert_id = certificate.get('id')
            for bb in certificate.get('bbs'):
                tree = writeWordToDict(bb.get('text'), tree, cert_id, cluster_id)
    return tree


# Build the trie from the loaded clusters and look up a sample word.
# The trie is bound to `tree`; the earlier `search(trie, ...)` raised NameError.
tree = prepare(data)
search(tree, 'обучение')

In [None]:
# Scratch experiment: build a nested dict letter by letter with dict.setdefault.
my_dict = {}
root = my_dict
for letter in 'абв'.lower():
    my_dict = my_dict.setdefault(letter, {})

# NOTE(review): my_dict still points at the deepest node from the loop above, so
# this second word is inserted below it rather than from `root` — confirm this is intended.
for letter in 'абг'.lower():
    my_dict = my_dict.setdefault(letter, {})

root

In [None]:
# Scratch check: setdefault on a key that already exists returns the stored value (1), not the default.
my_dict = {'a': 1}

my_dict.setdefault('a', {})

In [None]:
def id_from_queue():
    """Return the first session whose status is 'in_queue'.

    The result is ``{'session_id': ..., 'ids': ...}`` taken from the row's
    ``id`` and ``data`` fields, or ``None`` when no row can be decoded.
    """
    queued = Session.query.filter_by(status='in_queue').first()
    try:
        encoded = json.loads(json.dumps(queued, cls=AlchemyEncoder))
        return {'session_id': encoded['id'], 'ids': encoded['data']}
    except Exception:
        return None

def update_status(session_id, new_status):
    """Set ``status`` on the Session row with the given id and commit the change."""
    session_row = Session.query.filter_by(id=session_id).first()
    session_row.status = new_status
    db.session.commit()

# Smoke test: put session 1 back in the queue, then fetch the first queued session.
update_status(1, 'in_queue')
id_from_queue()

In [None]:
# Show every Session row as plain JSON-compatible data.
items = Session.query.all()
json.loads(json.dumps(items, cls=AlchemyEncoder))

In [None]:
# Take the id of the first Session row and fetch that row again by id.
items = Session.query.all()
session_id = json.loads(json.dumps(items, cls=AlchemyEncoder))[0]['id']

Session.query.filter_by(id=session_id).first()

In [None]:
def vk_api(method, data):
    """Call a VK API method and return the decoded JSON reply.

    Args:
        method: API method name, e.g. ``'users.get'``.
        data: dict of request parameters; values are sent as strings.

    Returns:
        The parsed JSON body (callers read ``'response'`` or ``'error'`` from it).

    Raises:
        RuntimeError: if the ``VK_ACCESS_TOKEN`` environment variable is unset.

    The access token is read from the environment instead of being embedded in
    the notebook. The token that used to be hard-coded here should be treated
    as leaked and revoked.
    """
    import os

    access_token = os.environ.get('VK_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError('VK_ACCESS_TOKEN environment variable is not set')
    version = '5.37'

    # Let requests build the query string so values are URL-encoded properly.
    params = {key: str(val) for key, val in data.items()}
    params['access_token'] = access_token
    params['v'] = version

    res = requests.get("https://api.vk.com/method/{}".format(method), params=params, timeout=30)
    return res.json()

In [None]:
# Resolve a user screen name and a group screen name to numeric ids.
print(vk_api('users.get', {'user_ids': 'lyangasov_ivan'})['response'][0]['id'])
print(vk_api('groups.getById', {'group_id': 'tproger'})['response'][0]['id'])

# The same names passed to the other method: here the 'error' message is printed instead.
print(vk_api('users.get', {'user_ids': 'tproger'})['error']['error_msg'])
print(vk_api('groups.getById', {'group_id': 'lyangasov_ivan'})['error']['error_msg'])

In [None]:
# Scratch check: splitting a string without '/' still yields the whole string as the last part.
'tt'.split('/')[-1]

In [None]:
# collect the ids of all members of a group

def get_all_members(group_id):
    """Return the member ids of a VK group, fetched in pages of 1000.

    ``group_id`` may be a bare id / screen name or a URL-like string; in the
    latter case only the part after the last '/' is used. Paging stops after
    the first page whose length is not exactly 1000.
    """
    page_size = 1000

    if '/' in group_id:
        group_id = group_id.split('/')[-1]

    members = []
    page_index = 0
    while True:
        request = {'group_id': group_id,
                   'offset': page_index * page_size,
                   'count': page_size}
        page = vk_api('groups.getMembers', request)['response']['items']
        page_index += 1
        members += page
        if len(page) != page_size:
            break

    return members

# Fetch the member ids of the 'pumptraffic' group.
users_ids = get_all_members('pumptraffic')

In [None]:
# Replace users_ids with the file-name stems found in saved_data/ and show how many there are.
users_ids = [file_name.split('.')[0] for file_name in os.listdir('saved_data')]
len(users_ids)

In [None]:
# Send two user ids to the locally running /analyze endpoint and show the raw reply.
my_ids = users_ids[8:10]
res = requests.post('http://0.0.0.0:81/analyze', json={'ids': my_ids, 'session_name': 'тест_3'})
res.text

In [None]:
# Ask the local service for all clusters and print the length of the decoded reply.
res = requests.get('http://0.0.0.0:81/get_all_clusters')
print(len(res.json()))

# The string literal below is disabled code kept as a bare expression.
'''for cluster in res.json():
    print(len(cluster['certificates']))'''

In [None]:
# Ad-hoc check: list every row in the Cluster table.
Cluster.query.all()

In [None]:
%%time
# find all certificates of a single user

# Keywords (Russian and English) that mark an image as a certificate when a
# close match is found in its OCR text (see find_in_text).
target_words = ['диплом', 
                'сертификат', 
                'лицензия', 
                'certified', 
                'specialist', 
                'специалист', 
                'эксперт']

def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as a PIL image.

    Args:
        image_url: URL of the image.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: on network failure or a non-2xx reply, so an
            error page is never handed to PIL as image data.
    """
    res = requests.get(image_url, timeout=timeout)
    res.raise_for_status()
    image = Image.open(BytesIO(res.content))
    return image

def _largest_photo_url(attachment):
    """Return the URL of the largest photo size in an attachment, or None if it has none."""
    try:
        if attachment['type'] != 'photo':
            return None
        sized = []
        for key, path in attachment['photo'].items():
            if 'photo' not in key:
                continue
            try:
                sized.append((int(key.replace('photo_', '')), path))
            except ValueError:
                # A key containing 'photo' that is not of the form photo_<size>.
                continue
    except (KeyError, TypeError, AttributeError):
        return None
    if not sized:
        return None
    # pick the image with the highest resolution
    return max(sized, key=lambda pair: pair[0])[1]


def images_from_res(res):
    """Collect the best-resolution photo of every photo attachment in wall posts.

    Args:
        res: iterable of post dicts with ``'id'``, ``'from_id'`` and an optional
            ``'attachments'`` list.

    Returns:
        A list of ``{'post_id', 'user_id', 'image_url'}`` dicts, one per photo.

    Non-photo attachments are skipped individually. Previously they raised an
    IndexError that was swallowed, silently dropping every remaining
    attachment of the same post. Posts without attachments or ids are skipped.
    """
    images_paths = []
    for item in res:
        try:
            attachments = item['attachments']
            post_id = item['id']
            user_id = item['from_id']
        except (KeyError, TypeError):
            continue

        for attachment in attachments or []:
            best_photo_path = _largest_photo_url(attachment)
            if best_photo_path is None:
                continue
            images_paths.append({'post_id': post_id,
                                 'user_id': user_id,
                                 'image_url': best_photo_path})

    return images_paths



def find_in_text(text, target_words=['сертификат']):
    """Tell whether any word of ``text`` is a near match for a target word.

    Only words longer than five characters are considered. A word matches when
    the Levenshtein distance between its lower-cased form and a target word is
    below 4.
    """
    return any(
        distance(word.lower(), target_word) < 4
        for target_word in target_words
        for line in text.split('\n')
        for word in line.split(' ')
        if len(word) > 5
    )


def get_cert_bbs(image):
    """Run tesseract on ``image`` and return the text boxes in relative coordinates.

    The tab-separated output of ``pytesseract.image_to_data`` is parsed with
    its first row as column names. Every row whose text has more than two
    non-space characters becomes ``{'text', 'y', 'x', 'w', 'h'}``, with the
    geometry divided by the image width or height. Rows that cannot be parsed
    are skipped.
    """
    raw_lines = pytesseract.image_to_data(image, lang='rus').split('\n')
    header, *records = [line.split('\t') for line in raw_lines]

    bbs = []
    for record in records:
        entry = dict(zip(header, record))
        try:
            if len(entry['text'].replace(' ', '')) <= 2:
                continue
            width = image.size[0]
            height = image.size[1]
            bbs.append({'text': entry['text'],
                        'y': int(entry['top'])/height,
                        'x': int(entry['left'])/width,
                        'w': int(entry['width'])/width,
                        'h': int(entry['height'])/height})
        except Exception:
            pass

    return bbs


def analyze_user(user_id, count=30, offset=0):
    """Scan a user's wall posts and return data for images that look like certificates.

    Fetches ``count`` posts starting at ``offset`` through ``wall.get``, OCRs
    every attached photo and keeps those whose text matches ``target_words``.

    Returns:
        A list of dicts with 'post_id', 'user_id', 'image_url',
        'text_from_image' and 'bbs'. The list is empty if anything fails,
        including a failure on a single image after others were collected.

    Raises:
        ValueError: when the API reports error code 29.
    """
    certificates_data = []
    
    data = {'filter': 'owner',
            'extended': '0',
            'owner_id': user_id, 
            'count': count, 
            'offset': offset}
    res_posts = vk_api('wall.get', data)

    try:
        images_data = images_from_res(res_posts['response']['items'])

        for image_data in images_data:
            image_url = image_data['image_url']
            post_id = image_data['post_id']
            # NOTE(review): this rebinds the user_id parameter, so the error
            # message below reports the post author's id — confirm that is intended.
            user_id = image_data['user_id']

            image = download_image(image_url)

            text_from_img = pytesseract.image_to_string(image, lang='rus+eng')

            if find_in_text(text_from_img, target_words=target_words):
                bbs = get_cert_bbs(image)

                image_data.update({'text_from_image': text_from_img, 'bbs': bbs})
                #save_cert(image_data)
                
                certificates_data.append(image_data)

    except Exception as e:

        # Prefer the error reported by the API; fall back to the local exception text.
        try:
            error_msg = res_posts['error']['error_msg']
            error_code = res_posts['error']['error_code']

        except Exception:
            error_msg = str(e)
            error_code = None

        print('error! user_id: {} msg: {}'.format(user_id, error_msg))
        if error_code == 29:  # the daily request limit has been reached
            raise ValueError(error_msg)

        return []

    return certificates_data


# clear_output is only imported in a later cell; import it here so this cell
# also runs on a fresh kernel.
from IPython.display import clear_output

# Analyze every user in turn, keeping one (possibly empty) result list per user.
certs_of_users = []
for index, user_id in enumerate(users_ids):
    clear_output()
    print(index)
    certs_of_user = analyze_user(user_id)
    certs_of_users.append(certs_of_user)

In [None]:
# Flatten the per-user results into one list and attach the downloaded PIL
# image to each certificate under the 'image' key.
all_certs = []
for certs_of_user in certs_of_users:
    for cert in certs_of_user:
        cert.update({'image': download_image(cert['image_url'])})
        all_certs.append(cert)

In [None]:
# The PIL images stored under 'image' are not JSON-serialisable, so leave them
# out. Serialise before opening the file so a failure cannot truncate an
# existing dump.
serialized_certs = json.dumps(
    [{key: value for key, value in cert.items() if key != 'image'} for cert in all_certs]
)
with open('all_users_data.txt', 'w') as f:
    f.write(serialized_certs)

In [None]:
from flask_sqlalchemy import SQLAlchemy
from flask import Flask

# Second app/db setup. It rebinds `app` and `db` from the first cell and
# points at a different SQLite file (note the extra db/ directory in the path).
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///for_docker/api/db/test_certs.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)

class Сertificate(db.Model):
    """Certificate model, second definition in this notebook.

    Unlike the first definition it has no ``text_blocks`` column.
    NOTE(review): the first letter of the class name appears to be a Cyrillic
    'С' rather than a Latin 'C' — confirm.
    """
    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(db.Integer)
    image_url = db.Column(db.String(255))
    text_from_image = db.Column(db.Text)
    bbs = db.Column(db.JSON)
    user_id = db.Column(db.Integer)
    post_id = db.Column(db.Integer)
    session_id = db.Column(db.Integer, default=-1)


class Session_has_certs(db.Model):
    """Link table pairing a session id with a certificate id (second definition)."""
    id = db.Column(db.Integer, primary_key=True)
    id_session = db.Column(db.Integer)
    id_certificate = db.Column(db.Integer)
    # Each (session, certificate) pair may be stored only once.
    __table_args__ = (db.UniqueConstraint('id_session', 'id_certificate', name='_session_certificate_uc'),)


class Cluster(db.Model):
    """A cluster of certificates (second definition)."""
    id = db.Column(db.Integer, primary_key=True)
    bbs = db.Column(db.JSON)  # bounding boxes stored as JSON
    cluster_name = db.Column(db.String(100))


class Session(db.Model):
    """An analysis session (second definition).

    Unlike the first definition it has no ``data`` column, which
    id_from_queue reads.
    """
    id = db.Column(db.Integer, primary_key=True)
    session_name = db.Column(db.String(100))
    status = db.Column(db.String(100))


class Users(db.Model):
    """A user record holding a token and a VK id (second definition)."""
    id = db.Column(db.Integer, primary_key=True)
    token = db.Column(db.String(200))
    vk_id = db.Column(db.String(100))
    
#db.create_all()

In [None]:

# Insert a dummy certificate row to check that writes to the database work.
certificate = Сertificate(cluster_id = 7,
                          image_url = 'aisajidf',
                          text_from_image = 'same text',
                          bbs = {'1': 333},
                          user_id = 777,
                          post_id = 111)

try:
    db.session.add(certificate)
    db.session.commit()
except Exception as e:
    # Roll back only on failure, so the session is usable again. The previous
    # rollback ran after a successful commit and never after a failed one.
    db.session.rollback()
    print(str(e))

In [None]:
# Ad-hoc check: list every certificate row.
Сertificate.query.all()

In [None]:
%%time
# cluster all images and write the result to the DB

def diff_iou(bb1, bb2):
    """Text-weighted intersection-over-union of two bounding boxes.

    Args:
        bb1, bb2: dicts with keys ``'x'``, ``'y'`` (top-left corner), ``'w'``,
            ``'h'`` and ``'text'``, or ``None``.

    Returns:
        ``IoU * (1 - levenshtein(text1, text2) / (len(text1) + len(text2)))``,
        a value in [0, 1]. Returns 0.0 if either box is ``None``, the boxes do
        not overlap, or their union has no area.
    """
    if bb1 is None or bb2 is None:
        return 0.0

    # Convert to corner form: (x1, y1) is top-left, (x2, y2) is bottom-right.
    bb1 = {'x1': bb1['x'], 'x2': bb1['x'] + bb1['w'],'y1': bb1['y'],'y2': bb1['y'] + bb1['h'], 'text': bb1['text']}
    bb2 = {'x1': bb2['x'], 'x2': bb2['x'] + bb2['w'],'y1': bb2['y'],'y2': bb2['y'] + bb2['h'], 'text': bb2['text']}

    # determine the coordinates of the intersection rectangle
    x_left = max(bb1['x1'], bb2['x1'])
    y_top = max(bb1['y1'], bb2['y1'])
    x_right = min(bb1['x2'], bb2['x2'])
    y_bottom = min(bb1['y2'], bb2['y2'])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
    bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

    union_area = float(bb1_area + bb2_area - intersection_area)
    if union_area <= 0.0:
        # Degenerate (zero-area) boxes: avoid dividing by zero.
        return 0.0
    iou = intersection_area / union_area

    # Scale by text similarity. Two empty texts are identical, and the
    # normalising length would otherwise be zero.
    total_len = len(bb1['text']) + len(bb2['text'])
    if total_len == 0:
        text_similarity = 1.0
    else:
        text_similarity = 1 - distance(bb1['text'], bb2['text']) / total_len
    iou = iou * text_similarity

    assert iou >= 0.0
    assert iou <= 1.0
    return iou

def find_longer_dist(matrix):
    """Return the largest sum obtainable by taking one entry from every row.

    Args:
        matrix: 2-D numpy array (here: pairwise box similarities).

    Returns:
        The sum over rows of the largest non-zero entry of each row; rows made
        only of zeros contribute 0. A matrix with no rows gives 0.

    Rows are independent of each other, so the best path is simply the sum of
    the per-row maxima. The previous recursive version added
    ``matrix[0][index]`` using the loop variable left over after the loop (the
    last non-zero column, not the best one), repeated the same recursive call
    once per non-zero entry (exponential time), and raised IndexError on a
    matrix with no rows.
    """
    total = 0.0
    for row in matrix:
        non_zero_values = row[row.nonzero()[0]]
        if len(non_zero_values) > 0:
            total += non_zero_values.max()
    return total

def diff_bbs(bbs1, bbs2):
    """Dissimilarity of two bounding-box lists: 0 = similar, 1 = not similar.

    A pairwise ``diff_iou`` matrix is built. The score is computed once along
    the rows (best match for every box of ``bbs1``) and once along the columns
    (best match for every box of ``bbs2``), and the larger, i.e. more
    pessimistic, value is returned.

    If either list is empty there is nothing to match, so 1.0 is returned.
    Previously an empty ``bbs1`` raised IndexError. The column score used to be
    a copy of the row score; it now uses the transposed matrix.
    """
    if len(bbs1) == 0 or len(bbs2) == 0:
        return 1.0

    matrix = np.zeros((len(bbs1), len(bbs2)))

    for i, bb1 in enumerate(bbs1):
        for j, bb2 in enumerate(bbs2):
            matrix[i][j] = diff_iou(bb1, bb2)

    similarity_rows = 1 - find_longer_dist(matrix) / matrix.shape[0]
    similarity_columnes = 1 - find_longer_dist(matrix.T) / matrix.shape[1]
    similarity = max(similarity_rows, similarity_columnes)
    return similarity

def load_clusters():
    """
        load all clusters (their id and bbs) from the DB
    """
    rows = json.loads(json.dumps(Cluster.query.all(), cls=AlchemyEncoder))
    return [{'id': row['id'], 'bbs': row['bbs']} for row in rows]
    

def create_cluster(bbs):
    """
        create a new cluster row in the DB from the given bbs and return its id

        On a database error the transaction is rolled back and the error is printed.
        NOTE(review): in that case ``cluster.id`` is presumably None — confirm callers cope.
    """
    
    cluster = Cluster(bbs=bbs)
    try:
        db.session.add(cluster)
        db.session.commit()
        #db.session.close()
    except Exception as e:
        db.session.rollback()
        print(str(e))
        
    return cluster.id
        
def add_certificate(cert, session_id=-1):
    """
        add a certificate to the DB and link it to a session

        The certificate is looked up by ``image_url`` and inserted only when
        it is missing. A Session_has_certs row linking it to ``session_id`` is
        then added.

        Returns the certificate id, or None when the insert failed. In that
        case no link row is written; previously a link with a None id was.
    """
    certificate = Сertificate.query.filter_by(image_url = cert['image_url']).first()
    if certificate is None:
        certificate = Сertificate(cluster_id = cert['cluster_id'],
                                  image_url = cert['image_url'],
                                  text_from_image = cert['text_from_image'],
                                  bbs = cert['bbs'],
                                  user_id = cert['user_id'],
                                  post_id = cert['post_id'])

        try:
            db.session.add(certificate)
            db.session.commit()
        except Exception as e:
            # Roll back on failure so the session stays usable. The previous
            # rollback ran after a successful commit and never on error.
            db.session.rollback()
            print(str(e))
            return None

    cert_id = certificate.id
    session_has_cert = Session_has_certs(id_session=session_id, id_certificate=cert_id)
    try:
        db.session.add(session_has_cert)
        db.session.commit()
    except Exception as e:
        # Typically the unique (session, certificate) constraint: the link already exists.
        db.session.rollback()
        print(str(e))

    return cert_id
    
def update_cluster_centroids(bbs):
    """
        update the "mean" bbs value of a cluster

        Currently a stub: ignores ``bbs`` and returns 0.
    """
    return 0

def clusterize(cert_data, threshold=0.8, max_bbs=20):
    """Assign a certificate to the closest existing cluster or create a new one.

    Args:
        cert_data: dict with a ``'bbs'`` list.
        threshold: a cluster is a candidate only if its ``diff_bbs`` value
            (0 = similar, 1 = not similar) is strictly below this.
        max_bbs: only the first ``max_bbs`` boxes of each side are compared.

    Returns:
        The id of the chosen or newly created cluster.

    ``diff_bbs`` returns a dissimilarity, so the best cluster is the one with
    the smallest value. The previous code kept the largest value under the
    threshold, i.e. the least similar acceptable cluster.
    """
    cert_bbs = cert_data['bbs'][:max_bbs]

    clusters = load_clusters() # [{'id': 2, 'bbs': []}, {...}, {...}]

    top_cluster_id = None
    best_dissimilarity = threshold
    for cluster in clusters:
        cluster_bbs = cluster['bbs'][:max_bbs]

        dissimilarity = diff_bbs(cluster_bbs, cert_bbs)

        if dissimilarity < best_dissimilarity:
            best_dissimilarity = dissimilarity
            top_cluster_id = cluster['id']

    if top_cluster_id is None:
        top_cluster_id = create_cluster(bbs=cert_bbs)
    else:
        update_cluster_centroids(bbs=cert_bbs)

    return top_cluster_id



# Cluster every collected certificate and store it in the DB.
for i, cert_data in enumerate(all_certs):
    try:
        cluster_id = clusterize(cert_data)
        cert_data.update({'cluster_id': cluster_id})
        add_certificate(cert_data)
    except Exception as e:
        # Report the failure instead of swallowing it: a silent `pass` here
        # also hid genuine bugs in the clustering code.
        print('certificate #{} skipped: {}'.format(i, e))


# Row counts per table after the run.
items = Cluster.query.all() # .order_by(Item.user_id)
print('cnt of Cluster = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

items = Session_has_certs.query.all() # .order_by(Item.user_id)
print('cnt of Session_has_certs = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

items = Сertificate.query.all() # .order_by(Item.user_id)
print('cnt of Сertificate = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

In [None]:
from analyze_script import *
import os
from IPython.display import clear_output

#users_ids = get_all_members('pumptraffic')
# User ids are the file-name stems found in saved_data/.
users_ids = [file_name.split('.')[0] for file_name in os.listdir('saved_data')]
print('users len = ', len(users_ids))
print('users loaded')

# NOTE(review): analyze_user is called with a target_words keyword that the
# analyze_user defined earlier in this notebook does not accept; presumably the
# version from analyze_script (star-imported above) is meant — confirm.
certs_of_users = []
for index, user_id in enumerate(users_ids):
    
        clear_output()
        print('user ind', index)
        certs_of_user = analyze_user(user_id, target_words=['диплом', 
                                                            'сертификат', 
                                                            'лицензия', 
                                                            'certified', 
                                                            'specialist', 
                                                            'специалист', 
                                                            'эксперт'])

        # Cluster and store each certificate as soon as it is found.
        for cert_data in certs_of_user:
            try:
                cluster_id = clusterize(cert_data)
                cert_data.update({'cluster_id': cluster_id})
                add_certificate(cert_data)

            except Exception as e:
                # no new data
                print(str(e))
    
    

# Row counts per table after the run.
items = Cluster.query.all() # .order_by(Item.user_id)
print('cnt of Cluster = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

items = Session_has_certs.query.all() # .order_by(Item.user_id)
print('cnt of Session_has_certs = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

items = Сertificate.query.all() # .order_by(Item.user_id)
print('cnt of Сertificate = {}'.format(len(json.loads(json.dumps(items, cls=AlchemyEncoder)))))

In [None]:
items = Session_has_certs.query.all() # .order_by(Item.user_id)
# NOTE(review): `id_cert` and `id_json` are not defined anywhere in view, so this
# line raises NameError; it looks like an unfinished dict comprehension over the
# encoded rows — confirm the intended mapping before fixing.
{id_cert: id_json.loads(json.dumps(items, cls=AlchemyEncoder))}

In [None]:
#user_alias = aliased(User, name='user2')
def get_certs_by_cluster(id):
    """Return every certificate of cluster ``id`` as a list of plain dicts."""
    matching_rows = Сertificate.query.filter_by(cluster_id = id).all()
    return json.loads(json.dumps(matching_rows, cls=AlchemyEncoder))

# Ad-hoc check: number of certificates in cluster 40.
len(get_certs_by_cluster(40))

In [None]:
# Load every certificate as plain dicts and show how many there are.
items = Сertificate.query.all()
certs = json.loads(json.dumps(items, cls=AlchemyEncoder))

len(certs)

In [None]:
# Load every cluster as plain dicts and show how many there are.
# Note: this rebinds `clusters`, which an earlier cell filled from the remote API.
items = Cluster.query.all()
clusters = json.loads(json.dumps(items, cls=AlchemyEncoder))
len(clusters)

In [None]:
# Ad-hoc check: id of the first cluster.
clusters[0]['id']

In [None]:
# Build the API-style payload: one entry per cluster with its certificates.
all_clusters = []
for cluster in clusters:
    cluster_id = cluster['id']
    # The display name is generated from the id ("кластер" = "cluster"); the stored name is not used.
    cluster_name = 'кластер #{}'.format(cluster_id) # cluster['cluster_name']
    certs = get_certs_by_cluster(cluster_id)
    
    current_cluster = []
    for cert in certs:
        current_cluster.append({
                          'id': cert['id'], 
                          'bbs': cert['bbs'], 
                          'image_url': cert['image_url'], 
                          'post_id': cert['post_id'], 
                          'user_id': cert['user_id'],
                         })
        
    all_clusters.append({'cluster_id': cluster_id, 'cluster_name': cluster_name, 'certificates': current_cluster})

In [None]:
# Same payload as the previous cell, but for cluster ids 1..49 only and keeping
# only clusters that contain more than one certificate.
response = []
for i in range (1, 50):
    certs = get_certs_by_cluster(i)
    current_cluster = []
    
    for cert in certs:
        current_cluster.append({'id': cert['id'], 
                                'bbs': cert['bbs'], 
                                'image_url': cert['image_url'], 
                                'post_id': cert['post_id'], 
                                'user_id': cert['user_id']
                               })
    if len(current_cluster) > 1:
        response.append({'cluster_id': i, 'cluster_name': 'кластер #{}'.format(i), 'certificates': current_cluster})
        
    # The string literal below is a disabled early exit kept as a bare expression.
    '''if len(response) == 14:
        break'''
        
len(response)

In [None]:
# Save the payload to test_res.json (the file name read by the trie cell earlier in this notebook).
with open('test_res.json', 'w') as f:
    f.write(json.dumps(response))

In [None]:
import matplotlib.pylab as plt
def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as a PIL image.

    Args:
        image_url: URL of the image.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: on network failure or a non-2xx reply, so an
            error page is never handed to PIL as image data.
    """
    res = requests.get(image_url, timeout=timeout)
    res.raise_for_status()
    image = Image.open(BytesIO(res.content))
    return image
cluster_ind = 0

# Widest row of the grid: the largest number of certificates in any cluster.
max_posts_in_cluster = max([len(current_cluster['certificates']) for current_cluster in response])

# squeeze=False keeps `axis` two-dimensional even with a single cluster or a
# single column; otherwise axis[i][j] below fails on a 1-D array or bare Axes.
fig, axis = plt.subplots(len(response), max_posts_in_cluster,
                         figsize=(10, 10*((len(response)//max_posts_in_cluster)+1)),
                         squeeze=False)

# One row per cluster, one column per certificate image.
for i in range(len(response)):
    for j in range(len(response[i]['certificates'])):
        img_url = response[i]['certificates'][j]['image_url']
        img = download_image(img_url)

        axis[i][j].imshow(img)

plt.show()

In [None]:
# NOTE(review): neither Сertificate model in this notebook declares `from_id_session`
# (they declare `session_id`), so this presumably raises AttributeError — confirm.
Сertificate.query.filter_by(cluster_id = 4).first().from_id_session

In [None]:
def diff_bbs(bbs1, bbs2):
    """Debug variant of diff_bbs that also returns the pairwise matrix.

    Returns ``(dissimilarity, matrix)``, where dissimilarity is 0 for similar
    and 1 for not similar, and ``matrix[i][j]`` is
    ``diff_iou(bbs1[i], bbs2[j])``.

    NOTE: this redefinition shadows the earlier diff_bbs and returns a tuple,
    so clusterize must not be run after this cell without re-running the
    original definition.

    If either list is empty the result is ``(1.0, empty matrix)`` instead of
    an IndexError. The column score now uses the transposed matrix; it used to
    be a copy of the row score.
    """
    matrix = np.zeros((len(bbs1), len(bbs2)))
    if len(bbs1) == 0 or len(bbs2) == 0:
        return 1.0, matrix

    for i, bb1 in enumerate(bbs1):
        for j, bb2 in enumerate(bbs2):
            matrix[i][j] = diff_iou(bb1, bb2)

    similarity_rows = 1 - find_longer_dist(matrix) / matrix.shape[0]
    similarity_columnes = 1 - find_longer_dist(matrix.T) / matrix.shape[1]
    similarity = max(similarity_rows, similarity_columnes)
    return similarity, matrix

# Compare the bounding boxes of the first and third collected certificates.
bbs1 = all_certs[0]['bbs']
bbs2 = all_certs[2]['bbs']

diff_bbs(bbs1, bbs2)

In [None]:
from collections import defaultdict

def load_images_data():
    """Group certificate images by cluster id.

    Returns a dict mapping each ``cluster_id`` to the list of ``'image'``
    values of its certificates.

    NOTE(review): the loop iterates over ``all_certs_data``, which is not defined
    anywhere in view (the list built earlier is ``all_certs``) — confirm its source.
    The ``items`` query result is not used.
    """
    items = Сertificate.query.all() # .order_by(Item.user_id)

    certificates = defaultdict(list)
    for cert_data in all_certs_data: #json.loads(json.dumps(items, cls=AlchemyEncoder)):
        image = cert_data['image']
        cluster_id = cert_data['cluster_id']
        
        certificates[cluster_id].append(image)
    
    return dict(certificates)

# Group images by cluster and draw them as a grid: one row per cluster.
clusters = load_images_data()

max_posts_in_cluster = max([len(cluster_posts) for _, cluster_posts in clusters.items()])
print(len(clusters))
print(max_posts_in_cluster)

# squeeze=False keeps `axis` two-dimensional even with a single cluster or a
# single column; otherwise axis[row][col] below fails on a 1-D array or bare Axes.
fig, axis = plt.subplots(len(clusters), max_posts_in_cluster,
                         figsize=(10, 10*((len(clusters)//max_posts_in_cluster)+1)),
                         squeeze=False)

# `cluster_id` here is the row index in the grid, not the database id.
for cluster_id, (_, cluster_posts) in enumerate(clusters.items()):
    for index_post, cluster_post in enumerate(cluster_posts):
        image = cluster_post # image cert_img
        if image is None:
            continue

        axis[cluster_id][index_post].imshow(image)

plt.savefig('clusters_of_users.png')
#plt.show()