In [None]:
from flask import Flask
from flask import request

import requests

from io import BytesIO
import json

from PIL import Image
import pytesseract
from Levenshtein import distance

import os
import numpy as np
from sklearn.cluster import DBSCAN

import matplotlib.patches as patches

In [None]:
from flask_sqlalchemy import SQLAlchemy
from flask import Flask

# Create the Flask application and attach a Flask-SQLAlchemy handle to it.
app = Flask(__name__)
# Use a SQLite database file named test_certs.db (relative path).
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///test_certs.db'
# Disable Flask-SQLAlchemy's modification tracking.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)

class Сertificate(db.Model):
    """Database row describing one certificate image.

    NOTE: the first letter of the class name is the Cyrillic 'С' (U+0421),
    not the Latin 'C'; every reference must use the same character.
    """
    id = db.Column(db.Integer, primary_key=True)
    # Id of the cluster this certificate was assigned to.
    cluster_id = db.Column(db.Integer)
    # URL of the certificate image.
    image_url = db.Column(db.String(255))
    # Text recognised on the image.
    text_from_image = db.Column(db.Text)
    # Text bounding boxes stored as JSON.
    bbs = db.Column(db.JSON)
    # Id of the user and of the post the image belongs to.
    user_id = db.Column(db.Integer)
    post_id = db.Column(db.Integer)
    # Filled from the session_id argument of add_certificate.
    from_id_session = db.Column(db.Integer)
    
class Cluster(db.Model):
    """Database row for one certificate cluster."""
    id = db.Column(db.Integer, primary_key=True)
    # Bounding boxes representing the cluster, stored as JSON.
    bbs = db.Column(db.JSON)
    
class Session(db.Model):
    """Database row for a named session with a status string."""
    id = db.Column(db.Integer, primary_key=True)
    session_name = db.Column(db.String(100))
    # NOTE(review): the set of valid status values is not defined in this notebook.
    status = db.Column(db.String(100))
    
class Users(db.Model):
    """Database row linking a token to a VK id."""
    id = db.Column(db.Integer, primary_key=True)
    # NOTE(review): presumably an API/auth token for the user - confirm.
    token = db.Column(db.String(200))
    # VK identifier, stored as a string.
    vk_id = db.Column(db.String(100))
    
# Create the tables for the models declared above.
# NOTE(review): newer Flask-SQLAlchemy releases may require an active
# application context for this call - confirm against the installed version.
db.create_all()

In [None]:
from sqlalchemy.ext.declarative import DeclarativeMeta
import json

class AlchemyEncoder(json.JSONEncoder):
    """JSON encoder that serialises SQLAlchemy declarative model instances.

    A model instance becomes a dict of its public attributes; attribute
    values that cannot be JSON-encoded are replaced with None.
    """

    def default(self, obj):
        """Return a JSON-encodable stand-in for `obj`.

        Objects that are not declarative model instances are delegated to
        the base encoder, which raises TypeError for them.
        """
        if not isinstance(obj.__class__, DeclarativeMeta):
            return super().default(obj)

        # Public attribute names, minus SQLAlchemy's `metadata`.
        public_names = [name for name in dir(obj)
                        if not name.startswith('_') and name != 'metadata']

        encoded = {}
        for name in public_names:
            value = obj.__getattribute__(name)
            try:
                # Probe: fails on values the JSON encoder cannot handle.
                json.dumps(value)
            except TypeError:
                value = None
            encoded[name] = value
        return encoded
    


In [None]:
def vk_api(method, data):
    """Call a VK API method with an HTTP GET and return the decoded JSON.

    Parameters
    ----------
    method : str
        VK API method name, e.g. 'wall.get'.
    data : dict
        Method parameters; every value is converted with str().

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    RuntimeError
        If the VK_ACCESS_TOKEN environment variable is not set. The token is
        read from the environment so that no credential lives in the notebook.
    """
    import os  # local import keeps this cell runnable on its own

    access_token = os.environ.get('VK_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError('VK_ACCESS_TOKEN environment variable is not set')
    version = '5.37'

    # Let requests build the query string so that values are URL-encoded.
    params = {key: str(val) for key, val in data.items()}
    params['access_token'] = access_token
    params['v'] = version

    res = requests.get("https://api.vk.com/method/{}".format(method), params=params)
    return res.json()

In [None]:
# find the ids of all group members

def get_all_members(group_id):
    """Return the ids of all members of a VK group.

    `group_id` may be a bare group name/id or a URL; for a URL only the part
    after the last '/' is used. Members are fetched through
    'groups.getMembers' in pages of 1000 until a page comes back with a
    different length.
    """
    if '/' in group_id:
        group_id = group_id.split('/')[-1]

    members = []
    page_index = 0
    while True:
        request = {'group_id': group_id,
                   'offset': page_index * 1000,
                   'count': 1000}
        page = vk_api('groups.getMembers', request)['response']['items']
        page_index += 1
        members += page
        # A page that is not completely full is the last one.
        if len(page) != 1000:
            break

    return members

# Fetch the member ids of the 'pumptraffic' group (calls the VK API).
users_ids = get_all_members('pumptraffic')

In [None]:
# Rebind users_ids to the part of each file name in 'saved_data' before the first '.'.
users_ids = [file_name.split('.')[0] for file_name in os.listdir('saved_data')]

In [None]:
%%time
# find all certificates of a single user

# Keywords (Russian and English) passed to find_in_text by analyze_user
# to decide whether the text recognised on an image looks like a certificate.
target_words = ['диплом', 
                'сертификат', 
                'лицензия', 
                'certified', 
                'specialist', 
                'специалист', 
                'эксперт']

def download_image(image_url):
    """Download `image_url` and return the response body opened as a PIL image."""
    response = requests.get(image_url)
    return Image.open(BytesIO(response.content))

def images_from_res(res):
    """Collect the best-resolution photo URL of every photo attachment.

    Parameters
    ----------
    res : iterable of dict
        Wall posts; each may carry an 'attachments' list plus 'id' and
        'from_id'.

    Returns
    -------
    list of dict
        One {'post_id', 'user_id', 'image_url'} entry per photo attachment.
        Posts without attachments, non-photo attachments and malformed
        entries are skipped individually, so one bad attachment no longer
        discards the remaining photos of the same post.
    """
    images_paths = []
    for item in res:
        try:
            attachments = item['attachments']
            post_id = item['id']
            user_id = item['from_id']
        except (KeyError, TypeError):
            # Post without attachments or ids: nothing to extract.
            continue

        for attachment in attachments or []:
            try:
                if attachment['type'] != 'photo':
                    continue
                # Keys such as 'photo_604' hold the URL of one size variant.
                versions_of_photo = [
                    (int(key[len('photo_'):]), path)
                    for key, path in attachment['photo'].items()
                    if key.startswith('photo_') and key[len('photo_'):].isdigit()
                ]
            except (KeyError, TypeError, AttributeError):
                continue

            if not versions_of_photo:
                continue

            # pick the variant with the largest size suffix
            _, best_photo_path = max(versions_of_photo, key=lambda version: version[0])
            images_paths.append({'post_id': post_id,
                                 'user_id': user_id,
                                 'image_url': best_photo_path})

    return images_paths



def find_in_text(text, target_words=['сертификат']):
    """Tell whether `text` contains a word close to one of `target_words`.

    The text is split on newlines and single spaces. Only words longer than
    5 characters are considered; a word matches when the Levenshtein
    distance between its lower-cased form and a target word is below 4.
    """
    return any(
        distance(word.lower(), target_word) < 4
        for target_word in target_words
        for line in text.split('\n')
        for word in line.split(' ')
        if len(word) > 5
    )


def get_cert_bbs(image):
    """Return the text boxes Tesseract finds on `image`, in relative units.

    The tab-separated output of `pytesseract.image_to_data` (lang='rus') is
    parsed row by row. Rows whose text, spaces removed, is longer than 2
    characters become dicts with 'text' and 'x', 'y', 'w', 'h' divided by
    the image width/height. Rows that cannot be parsed are skipped.
    """
    tsv_lines = pytesseract.image_to_data(image, lang='rus').split('\n')
    header, *records = [line.split('\t') for line in tsv_lines]

    bbs = []
    for record in records:
        fields = dict(zip(header, record))
        try:
            text = fields['text']
            if len(text.replace(' ', '')) <= 2:
                continue
            img_w = image.size[0]
            img_h = image.size[1]
            bbs.append({'text': text,
                        'y': int(fields['top'])/img_h,
                        'x': int(fields['left'])/img_w,
                        'w': int(fields['width'])/img_w,
                        'h': int(fields['height'])/img_h})
        except Exception:
            # Best effort: short or malformed rows are ignored.
            pass

    return bbs


def analyze_user(user_id, count=30, offset=0):
    """Return the certificate-like images found on a user's wall.

    Requests `count` posts (starting at `offset`) through 'wall.get',
    downloads every attached photo, runs OCR on it and keeps the images
    whose text matches `target_words`. Each kept entry is the image dict
    from `images_from_res` extended with 'text_from_image' and 'bbs'.

    On any failure an error line is printed and an empty list is returned,
    except that ValueError is raised when the API reports error code 29.
    """
    request = {'filter': 'owner',
               'extended': '0',
               'owner_id': user_id,
               'count': count,
               'offset': offset}
    res_posts = vk_api('wall.get', request)

    certificates_data = []
    try:
        for image_data in images_from_res(res_posts['response']['items']):
            # user_id is rebound to the post author, as in the error message below.
            image_url, _post_id, user_id = (
                image_data[key] for key in ('image_url', 'post_id', 'user_id'))

            image = download_image(image_url)
            text_from_img = pytesseract.image_to_string(image, lang='rus+eng')
            if not find_in_text(text_from_img, target_words=target_words):
                continue

            image_data.update({'text_from_image': text_from_img,
                               'bbs': get_cert_bbs(image)})
            certificates_data.append(image_data)

    except Exception as e:
        # Prefer the error reported by the API, fall back to the exception text.
        try:
            api_error = res_posts['error']
            error_msg, error_code = api_error['error_msg'], api_error['error_code']
        except Exception:
            error_msg, error_code = str(e), None

        print('error! user_id: {} msg: {}'.format(user_id, error_msg))
        if error_code == 29:  # daily request limit reached
            raise ValueError(error_msg)

        return []

    return certificates_data

# clear_output is imported here because this cell runs before the later
# cell that imports it; without it a fresh kernel fails with NameError.
from IPython.display import clear_output

# Analyse every user in turn, printing the current index as progress.
certs_of_users = []
for index, user_id in enumerate(users_ids):
    clear_output()
    print(index)
    certs_of_user = analyze_user(user_id)
    certs_of_users.append(certs_of_user)

In [None]:
# Flatten the per-user results into one list and attach the downloaded
# image to every certificate dict under the 'image' key.
all_certs = []
for certs_of_user in certs_of_users:
    for cert in certs_of_user:
        cert['image'] = download_image(cert['image_url'])
        all_certs.append(cert)

In [None]:
# The certificates carry a PIL image under 'image', which json cannot
# encode, so that entry is left out of the dump. The payload is built before
# the file is opened so a failure does not leave a truncated file behind.
serializable_certs = [{key: value for key, value in cert.items() if key != 'image'}
                      for cert in all_certs]
payload = json.dumps(serializable_certs)
with open('all_users_data.txt', 'w') as f:
    f.write(payload)

In [None]:
%%time
# cluster all the images

def diff_iou(bb1, bb2):
    """Score how well two text boxes match, from 0.0 (no match) to 1.0.

    The score is the intersection-over-union of the two rectangles
    multiplied by a text factor
    1 - levenshtein(text1, text2) / (len(text1) + len(text2)).

    Parameters
    ----------
    bb1, bb2 : dict or None
        Boxes with keys 'x', 'y' (top-left corner), 'w', 'h' (size) and
        'text'.

    Returns
    -------
    float
        0.0 when either box is None, when the boxes do not overlap, or when
        their union has no area; otherwise the weighted IoU.
    """
    if bb1 is None or bb2 is None:
        return 0.0

    # Convert (x, y, w, h) boxes to corner form.
    bb1 = {'x1': bb1['x'], 'x2': bb1['x'] + bb1['w'], 'y1': bb1['y'], 'y2': bb1['y'] + bb1['h'], 'text': bb1['text']}
    bb2 = {'x1': bb2['x'], 'x2': bb2['x'] + bb2['w'], 'y1': bb2['y'], 'y2': bb2['y'] + bb2['h'], 'text': bb2['text']}

    # Corners of the intersection rectangle.
    x_left = max(bb1['x1'], bb2['x1'])
    y_top = max(bb1['y1'], bb2['y1'])
    x_right = min(bb1['x2'], bb2['x2'])
    y_bottom = min(bb1['y2'], bb2['y2'])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
    bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

    union_area = float(bb1_area + bb2_area - intersection_area)
    if union_area <= 0.0:
        # Degenerate zero-area boxes: avoid dividing by zero.
        return 0.0
    iou = intersection_area / union_area

    total_len = len(bb1['text']) + len(bb2['text'])
    if total_len == 0:
        # Two empty texts are identical; avoid dividing by zero.
        text_factor = 1.0
    else:
        text_factor = 1 - distance(bb1['text'], bb2['text']) / total_len
    iou = iou * text_factor

    assert iou >= 0.0
    assert iou <= 1.0
    return iou

def find_longer_dist(matrix):
    """Return the largest sum obtainable by taking one cell from every row.

    Each row contributes its largest non-zero entry, or nothing when the
    row holds only zeros. The rows are walked once, so the cost is linear in
    the number of cells.

    The previous recursive version added the *last* non-zero entry of a row
    (a leaked loop variable) instead of the maximum, and recomputed the same
    sub-result once per non-zero entry, which made it exponential in the
    number of rows.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array of scores.

    Returns
    -------
    float
        The maximal sum; 0 for a matrix with no rows or only zeros.
    """
    matrix = np.asarray(matrix)
    total = 0.0
    for row in matrix:
        nonzero_values = row[row.nonzero()[0]]
        if len(nonzero_values):
            total += nonzero_values.max()
    return total

def diff_bbs(bbs1, bbs2):
    """Distance between two lists of text boxes: 0 - alike, 1 - not alike.

    A score matrix is filled with diff_iou for every pair of boxes. The
    matrix is then summarised once along its rows and once along its columns
    (the transposed matrix, normalised by the column count) and the larger
    of the two distances is returned. Previously the column value was a copy
    of the row value.

    Returns 1.0 when either list is empty, since there is nothing to match.
    """
    if len(bbs1) == 0 or len(bbs2) == 0:
        return 1.0

    matrix = np.zeros((len(bbs1), len(bbs2)))

    for i, bb1 in enumerate(bbs1):
        for j, bb2 in enumerate(bbs2):
            matrix[i][j] = diff_iou(bb1, bb2)

    similarity_rows = 1 - find_longer_dist(matrix) / matrix.shape[0]
    similarity_columnes = 1 - find_longer_dist(matrix.T) / matrix.shape[1]
    similarity = max(similarity_rows, similarity_columnes)
    return similarity

clusters_tmp = []
def load_clusters():
    """Return every known cluster as a list of {'id', 'bbs'} dicts.

    The Cluster table is queried and converted, but the returned list is
    built from the in-memory `clusters_tmp`, using the list position as id.
    """
    items = Cluster.query.all()

    # Converted database rows; built but not part of the return value.
    clusters = [{'bbs': row['bbs'], 'id': row['id']}
                for row in json.loads(json.dumps(items, cls=AlchemyEncoder))]

    return [{'id': position, 'bbs': stored_bbs}
            for position, stored_bbs in enumerate(clusters_tmp)]
    

def create_cluster(bbs):
    """Register a new cluster and return its id.

    The boxes are appended to the in-memory `clusters_tmp` list and a
    Cluster row is written to the database. The returned id is the position
    of the new entry in `clusters_tmp`; it is taken from the list length
    because `list.index` would return an earlier entry with equal boxes.

    A database failure is printed and the session is rolled back so that it
    stays usable; the in-memory cluster is kept either way.
    """
    clusters_tmp.append(bbs)
    new_cluster_id = len(clusters_tmp) - 1

    cluster = Cluster(bbs=bbs)
    try:
        db.session.add(cluster)
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        print(str(e))

    return new_cluster_id
        
def add_certificate(cert, session_id=-1):
    """Store one certificate in the database.

    Parameters
    ----------
    cert : dict
        Must contain 'cluster_id', 'image_url', 'text_from_image', 'bbs',
        'user_id' and 'post_id'.
    session_id : int
        Stored in the row's `from_id_session` column.

    A database failure is printed, not raised. The session is rolled back
    after a failure and closed in every case, so a failed commit does not
    leave it in an unusable state.
    """
    certificate = Сertificate(
                          cluster_id = cert['cluster_id'],
                          image_url = cert['image_url'],
                          text_from_image = cert['text_from_image'],
                          bbs = cert['bbs'],
                          user_id = cert['user_id'],
                          post_id = cert['post_id'],
                          from_id_session = session_id)

    try:
        db.session.add(certificate)
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        print(str(e))
    finally:
        db.session.close()
    
def update_cluster_centroids(bbs):
    """Placeholder for refreshing a cluster's "mean" boxes.

    Nothing is updated yet; the function ignores `bbs` and returns 0.
    """
    return 0

def clusterize(cert_data, max_distance=0.8, max_boxes=20):
    """Assign a certificate to the closest cluster, creating one if needed.

    Parameters
    ----------
    cert_data : dict
        Certificate with a 'bbs' list of text boxes.
    max_distance : float
        A cluster is accepted only when its distance is strictly below this.
    max_boxes : int
        Only the first `max_boxes` boxes of each side are compared.

    Returns
    -------
    The id of the chosen or newly created cluster.

    diff_bbs is a distance (0 - alike), so the cluster with the *smallest*
    distance wins. The previous code kept the largest distance below the
    threshold and could never match a distance of exactly 0.
    """
    cert_bbs = cert_data['bbs'][:max_boxes]

    best_cluster_id = None
    best_distance = max_distance
    for cluster in load_clusters():  # [{'id': 2, 'bbs': []}, {...}, {...}]
        cluster_distance = diff_bbs(cluster['bbs'][:max_boxes], cert_bbs)
        # A later cell redefines diff_bbs to return (distance, matrix).
        if isinstance(cluster_distance, tuple):
            cluster_distance = cluster_distance[0]

        if cluster_distance < best_distance:
            best_distance = cluster_distance
            best_cluster_id = cluster['id']

    if best_cluster_id is None:
        return create_cluster(bbs=cert_bbs)

    update_cluster_centroids(bbs=cert_bbs)
    return best_cluster_id

# Assign a cluster to every certificate and remember the id on the dict.
all_certs_data = []
for cert_data in all_certs:
    cluster_id = clusterize(cert_data)
    cert_data['cluster_id'] = cluster_id
    #add_certificate(cert_data)
    all_certs_data.append(cert_data)

# Number of clusters created so far.
print(len(clusters_tmp))

In [None]:
def diff_bbs(bbs1, bbs2):
    """Distance between two lists of text boxes, plus the score matrix.

    Returns a tuple (distance, matrix): distance is 0 for alike layouts and
    1 for unrelated ones; matrix holds diff_iou for every pair of boxes.
    The matrix is summarised along rows and along columns (transposed,
    normalised by the column count) and the larger distance is used.
    Previously the column value was a copy of the row value.

    When either list is empty the distance is 1.0.

    NOTE: this redefinition replaces the earlier diff_bbs, which returned
    only the distance.
    """
    matrix = np.zeros((len(bbs1), len(bbs2)))
    if len(bbs1) == 0 or len(bbs2) == 0:
        return 1.0, matrix

    for i, bb1 in enumerate(bbs1):
        for j, bb2 in enumerate(bbs2):
            matrix[i][j] = diff_iou(bb1, bb2)

    similarity_rows = 1 - find_longer_dist(matrix) / matrix.shape[0]
    similarity_columnes = 1 - find_longer_dist(matrix.T) / matrix.shape[1]
    similarity = max(similarity_rows, similarity_columnes)
    return similarity, matrix

# Try the metric on the first and third collected certificates
# (requires all_certs to hold at least three entries).
bbs1 = all_certs[0]['bbs']
bbs2 = all_certs[2]['bbs']

# Last expression of the cell: shows the (distance, matrix) tuple.
diff_bbs(bbs1, bbs2)

In [None]:
# Count the clusters currently stored in the database.
items = Cluster.query.all() # .order_by(Item.user_id)
len(json.loads(json.dumps(items, cls=AlchemyEncoder)))

In [None]:
from collections import defaultdict

def load_images_data():
    """Group the images of `all_certs_data` by cluster id.

    Returns a plain dict mapping each 'cluster_id' to the list of 'image'
    values, in the order the certificates appear. The certificate table is
    queried as well, but the query result is not used.
    """
    items = Сertificate.query.all()

    grouped = {}
    for cert_data in all_certs_data:
        image = cert_data['image']
        grouped.setdefault(cert_data['cluster_id'], []).append(image)

    return grouped

# pyplot is imported here because this cell runs before the later cell
# that imports it; without it a fresh kernel fails with NameError.
import matplotlib.pyplot as plt

clusters = load_images_data()

max_posts_in_cluster = max([len(cluster_posts) for _, cluster_posts in clusters.items()])
print(len(clusters))
print(max_posts_in_cluster)
# squeeze=False keeps `axis` two-dimensional even when there is a single
# cluster or a single post per cluster, so axis[row][col] always works.
fig, axis = plt.subplots(len(clusters), max_posts_in_cluster, squeeze=False,
                         figsize=(10, 10*((len(clusters)//max_posts_in_cluster)+1)))

# One row per cluster, one column per image in that cluster.
for cluster_id, (_, cluster_posts) in enumerate(clusters.items()):
    for index_post, cluster_post in enumerate(cluster_posts):
        image = cluster_post
        if image is None:
            continue

        axis[cluster_id][index_post].imshow(image)

plt.savefig('clusters_of_users.png')

In [None]:
from matplotlib import pyplot as plt
from IPython.display import clear_output

# Show the third certificate image of `certs_of_user`, the variable left
# over from the last iteration of an earlier loop.
image = download_image(certs_of_user[2]['image_url'])

fig, ax = plt.subplots()
ax.imshow(image)
plt.show()

In [None]:
%%time
import os

def get_cert_bbs(image):
    """Return the text boxes Tesseract finds on `image`, in relative units.

    The tab-separated output of `pytesseract.image_to_data` (lang='rus') is
    parsed row by row. Rows whose text, spaces removed, is longer than 2
    characters become dicts with 'text' and 'x', 'y', 'w', 'h' divided by
    the image width/height. Rows that cannot be parsed are skipped.

    NOTE: this is a second definition of get_cert_bbs in the notebook; the
    one executed last is the one in effect.
    """
    tsv_lines = pytesseract.image_to_data(image, lang='rus').split('\n')
    header, *records = [line.split('\t') for line in tsv_lines]

    bbs = []
    for record in records:
        fields = dict(zip(header, record))
        try:
            text = fields['text']
            if len(text.replace(' ', '')) <= 2:
                continue
            img_w = image.size[0]
            img_h = image.size[1]
            bbs.append({'text': text,
                        'y': int(fields['top'])/img_h,
                        'x': int(fields['left'])/img_w,
                        'w': int(fields['width'])/img_w,
                        'h': int(fields['height'])/img_h})
        except Exception:
            # Best effort: short or malformed rows are ignored.
            pass

    return bbs


# Rebuild the list of certificate posts from the per-user .txt dumps:
# every listed post is re-downloaded and stored together with its image.
certs_path = 'certificat_detect/saved_data/'
users_has_cert = []
for file_name in os.listdir(certs_path):

    # Only files with '.txt' in their name are user dumps.
    if '.txt' not in file_name:
        continue

    with open(certs_path + file_name) as f:
        user_data = json.load(f)

    user_id = user_data['user_id']
    posts_url_and_text = user_data['posts_url_and_text']

    for post_url_and_text in posts_url_and_text:
        try:
            post_url = post_url_and_text['post_url']
            text_from_image = post_url_and_text['text_from_image']
            image_url = post_url_and_text['image_url']

            image = Image.open(BytesIO(requests.get(image_url).content))
            k = 1 # 256 / max(image.size)
            w, h = int(image.size[0] * k), int(image.size[1] * k)

            # NOTE(review): get_cert_coord is not defined in the visible part
            # of the notebook; if it is missing, the NameError is caught
            # here and cert_coords stays None - confirm whether get_cert_bbs
            # was intended.
            try:
                cert_coords = get_cert_coord(image)
            except Exception as e:
                cert_coords = None
                print(str(e))

            record = {'user_id': user_id,
                      'post_url': post_url,
                      'image_url': image_url,
                      'text_from_image': text_from_image,
                      'image': image,
                      'cert_coords': cert_coords,
                      'aspect_ratio': w/h}
            users_has_cert.append(record)
        except Exception as e:
            # A post that cannot be processed is reported and skipped.
            print(str(e))

len(users_has_cert)

In [None]:
def diff_iou(bb1, bb2):
    """Score how well two text boxes match, from 0.0 (no match) to 1.0.

    The score is the intersection-over-union of the two rectangles
    multiplied by a text factor
    1 - levenshtein(text1, text2) / (len(text1) + len(text2)).

    Parameters
    ----------
    bb1, bb2 : dict or None
        Boxes with keys 'x', 'y' (top-left corner), 'w', 'h' (size) and
        'text'.

    Returns
    -------
    float
        0.0 when either box is None, when the boxes do not overlap, or when
        their union has no area; otherwise the weighted IoU.

    NOTE: diff_iou is defined more than once in this notebook; the
    definition executed last is the one in effect.
    """
    if bb1 is None or bb2 is None:
        return 0.0

    # Convert (x, y, w, h) boxes to corner form.
    bb1 = {'x1': bb1['x'], 'x2': bb1['x'] + bb1['w'], 'y1': bb1['y'], 'y2': bb1['y'] + bb1['h'], 'text': bb1['text']}
    bb2 = {'x1': bb2['x'], 'x2': bb2['x'] + bb2['w'], 'y1': bb2['y'], 'y2': bb2['y'] + bb2['h'], 'text': bb2['text']}

    # Corners of the intersection rectangle.
    x_left = max(bb1['x1'], bb2['x1'])
    y_top = max(bb1['y1'], bb2['y1'])
    x_right = min(bb1['x2'], bb2['x2'])
    y_bottom = min(bb1['y2'], bb2['y2'])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
    bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

    union_area = float(bb1_area + bb2_area - intersection_area)
    if union_area <= 0.0:
        # Degenerate zero-area boxes: avoid dividing by zero.
        return 0.0
    iou = intersection_area / union_area

    total_len = len(bb1['text']) + len(bb2['text'])
    if total_len == 0:
        # Two empty texts are identical; avoid dividing by zero.
        text_factor = 1.0
    else:
        text_factor = 1 - distance(bb1['text'], bb2['text']) / total_len
    iou = iou * text_factor

    assert iou >= 0.0
    assert iou <= 1.0
    return iou


In [None]:
def find_longer_dist(matrix):
    """Return the largest sum obtainable by taking one cell from every row.

    Each row contributes its largest non-zero entry, or nothing when the
    row holds only zeros. The rows are walked once, so the cost is linear in
    the number of cells.

    The previous recursive version added the *last* non-zero entry of a row
    (a leaked loop variable) instead of the maximum, and recomputed the same
    sub-result once per non-zero entry, which made it exponential in the
    number of rows.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array of scores.

    Returns
    -------
    float
        The maximal sum; 0 for a matrix with no rows or only zeros.

    NOTE: find_longer_dist is defined more than once in this notebook; the
    definition executed last is the one in effect.
    """
    matrix = np.asarray(matrix)
    total = 0.0
    for row in matrix:
        nonzero_values = row[row.nonzero()[0]]
        if len(nonzero_values):
            total += nonzero_values.max()
    return total


In [None]:
def diff_bbs(bbs1, bbs2):
    """Distance between two lists of text boxes, plus the score matrix.

    Returns a tuple (distance, matrix): distance is 0 for alike layouts and
    1 for unrelated ones; matrix holds diff_iou for every pair of boxes.
    The matrix is summarised along rows and along columns (transposed,
    normalised by the column count) and the larger distance is used.
    Previously the column value was a copy of the row value.

    When either list is empty the distance is 1.0.

    NOTE: diff_bbs is defined more than once in this notebook; the
    definition executed last is the one in effect.
    """
    matrix = np.zeros((len(bbs1), len(bbs2)))
    if len(bbs1) == 0 or len(bbs2) == 0:
        return 1.0, matrix

    for i, bb1 in enumerate(bbs1):
        for j, bb2 in enumerate(bbs2):
            matrix[i][j] = diff_iou(bb1, bb2)

    similarity_rows = 1 - find_longer_dist(matrix) / matrix.shape[0]
    similarity_columnes = 1 - find_longer_dist(matrix.T) / matrix.shape[1]
    similarity = max(similarity_rows, similarity_columnes)
    return similarity, matrix

# Compare the first ten boxes of the first and third loaded posts.
# NOTE(review): 'cert_coords' is None for posts whose box extraction failed,
# in which case the slicing below raises TypeError.
bbs1 = users_has_cert[0]['cert_coords'][:10]
bbs2 = users_has_cert[2]['cert_coords'][:10]

s, m = diff_bbs(bbs1, bbs2)
# Last expression of the cell: shows the score matrix.
m

In [None]:
%%time
import numpy as np
from IPython.display import clear_output


def sort_biggest_BB(bbs):
    """Return a new list of the boxes ordered by area ('w' * 'h'), largest first."""
    def area(bb):
        return bb['w'] * bb['h']

    ordered = list(bbs)
    ordered.sort(key=area, reverse=True)
    return ordered


# Pairwise distance matrix over the first 80 loaded posts. Only the upper
# triangle is computed; the value is mirrored into the lower triangle.
cert_subset = users_has_cert[:80]
subset_size = len(cert_subset)
dist_matrix = np.zeros((subset_size, subset_size))

for i in range(subset_size):

    clear_output()
    print('i = {}\n'.format(i))

    for j in range(i, subset_size):

        if i == j:
            dist_matrix[i][j] = 0
        else:
            # Compare the ten largest boxes of each post.
            cert_coords1 = sort_biggest_BB(users_has_cert[i]['cert_coords'])[:10]
            cert_coords2 = sort_biggest_BB(users_has_cert[j]['cert_coords'])[:10]
            dist5, _ = diff_bbs(cert_coords1, cert_coords2)

            dist_matrix[i][j] = dist5
            dist_matrix[j][i] = dist5

print(dist_matrix.max())
dist_matrix

In [None]:
# find the mean footprint

dist_matrix

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
# Cluster the posts with DBSCAN; min_samples=1 is passed explicitly.
# NOTE(review): dist_matrix holds pairwise distances, but DBSCAN is called
# with its default metric, so each row is treated as a feature vector. If the
# distances were meant to be used directly, metric='precomputed' would be
# needed (and eps retuned) - confirm the intent.
eps = 1.2
clustering = DBSCAN(eps=eps, min_samples=1).fit(dist_matrix) # 01
cluster_inds = clustering.labels_
# Number of posts labelled 1, then the largest label.
print((cluster_inds==1).sum())
print(cluster_inds.max())
cluster_inds

In [None]:
# Group the loaded posts by the DBSCAN label assigned to them.
clusters = {cluster_ind: [] for cluster_ind in cluster_inds}

for cluster_ind, user_has_cert in zip(cluster_inds, users_has_cert):
    clusters[cluster_ind].append(user_has_cert)


import matplotlib.pyplot as plt

max_posts_in_cluster = max([len(cluster_posts) for _, cluster_posts in clusters.items()])
# The integer division can yield 0, which is not a valid figure height.
fig_height = max(1, 10*len(clusters)//max_posts_in_cluster)
# squeeze=False keeps `axis` two-dimensional even when there is a single
# cluster or a single post per cluster, so axis[row][col] always works.
fig, axis = plt.subplots(len(clusters), max_posts_in_cluster, squeeze=False,
                         figsize=(10, fig_height))

# One row per cluster label, one column per post of that cluster.
for cluster_id, cluster_posts in clusters.items():
    for index_post, cluster_post in enumerate(cluster_posts):
        image = cluster_post['image']
        if image is None:
            continue

        axis[cluster_id][index_post].imshow(image)
        axis[cluster_id][index_post].annotate(cluster_post['post_url'].split('wall')[1], xy=(0.5, 10), xytext=(0, 10))

plt.savefig(str(eps) + 'clusters_of_users.png')

In [None]:
# find the mean footprint of the cluster
clusters[0]

In [None]:
# visualise one of the clusters together with its bounding boxes
cluster_id, cluster_posts = 0, clusters[0]

# squeeze=False plus taking row 0 gives a 1-D array of axes even when the
# cluster holds a single post (a bare Axes object is not indexable).
fig, axis = plt.subplots(1, len(cluster_posts), squeeze=False, figsize=(20, 10))
axis = axis[0]

for index_post, cluster_post in enumerate(cluster_posts):
    image = cluster_post['image']
    if image is None:
        continue

    axis[index_post].imshow(image)
    axis[index_post].annotate(cluster_post['post_url'].split('wall')[1], xy=(0.5, 10), xytext=(0, 10))

    # Boxes are stored relative to the image size; scale them to pixels.
    all_coords = cluster_post['cert_coords']
    for coords in all_coords:
        rect = patches.Rectangle((coords['x']*image.size[0],coords['y']*image.size[1]),coords['w']*image.size[0],coords['h']*image.size[1],linewidth=1,edgecolor='r',facecolor='none')
        axis[index_post].add_patch(rect)

In [None]:
# Feed the first five posts of the selected cluster into a MyCluster.
# NOTE(review): MyCluster is defined in the cell *below* this one, so on a
# fresh kernel run from top to bottom this cell raises NameError.
my_cluster = MyCluster()

for post in cluster_posts[:5]:
    my_cluster.add_post(post)

In [None]:
class MyCluster:
    """A group of posts whose certificates share a similar box layout.

    Attributes:
        posts: every post added so far.
        mean_bbs: the reference box list ("centroid") that posts are
            compared against; None until the first post is added.

    Fixes over the previous version: add_post no longer reads an undefined
    name and now stores the centroid in `mean_bbs`; analyze_post uses the
    class's own diff_bbs; the row/column summary and the division-by-zero
    cases of the metric helpers are corrected.
    """

    # Number of largest boxes of a post that take part in comparisons.
    MAX_BOXES = 10

    def __init__(self):
        self.posts = []
        self.mean_bbs = None

    def analyze_post(self, post):
        """Return the distance (0 - alike, 1 - not alike) of `post` to the centroid.

        Raises ValueError when no post has been added yet.
        """
        if self.mean_bbs is None:
            raise ValueError('cluster has no centroid yet; add a post first')
        post_bbs = self.sort_biggest_BB(post['cert_coords'])[:self.MAX_BOXES]
        return self.diff_bbs(self.mean_bbs, post_bbs)

    def add_post(self, post):
        """Add `post` to the cluster.

        The first post added becomes the centroid (its largest boxes).
        """
        self.posts.append(post)

        if self.mean_bbs is None:
            self.mean_bbs = self.sort_biggest_BB(post['cert_coords'])[:self.MAX_BOXES]
        # TODO: the original code intended to refresh the centroid while the
        # cluster holds at most 5 posts, but never implemented the update.

    def sort_biggest_BB(self, bbs):
        """Return the boxes ordered by area ('w' * 'h'), largest first."""
        return sorted(bbs, key=lambda bb: (bb['w']*bb['h']), reverse=True)

    def diff_bbs(self, bbs1, bbs2):
        """Distance between two box lists: 0 - alike, 1 - not alike.

        The pairwise diff_iou matrix is summarised along rows and along
        columns (transposed, normalised by the column count); the larger
        distance is returned. Returns 1.0 when either list is empty.
        """
        if len(bbs1) == 0 or len(bbs2) == 0:
            return 1.0

        matrix = np.zeros((len(bbs1), len(bbs2)))

        for i, bb1 in enumerate(bbs1):
            for j, bb2 in enumerate(bbs2):
                matrix[i][j] = self.diff_iou(bb1, bb2)

        similarity_rows = 1 - self.find_longer_dist(matrix) / matrix.shape[0]
        similarity_columnes = 1 - self.find_longer_dist(matrix.T) / matrix.shape[1]
        similarity = max(similarity_rows, similarity_columnes)
        return similarity

    def find_longer_dist(self, matrix):
        """Return the largest sum obtainable by taking one cell from every row.

        Each row contributes its largest non-zero entry (nothing for an
        all-zero row); a matrix without rows gives 0.
        """
        matrix = np.asarray(matrix)
        total = 0.0
        for row in matrix:
            nonzero_values = row[row.nonzero()[0]]
            if len(nonzero_values):
                total += nonzero_values.max()
        return total

    def diff_iou(self, bb1, bb2):
        """Score how well two text boxes match, from 0.0 to 1.0.

        bb1, bb2 : dict or None
            Boxes with keys 'x', 'y' (top-left corner), 'w', 'h' (size) and
            'text'.

        The intersection-over-union of the rectangles is multiplied by
        1 - levenshtein(text1, text2) / (len(text1) + len(text2)).
        Returns 0.0 when either box is None, when the boxes do not overlap
        or when their union has no area.
        """
        if bb1 is None or bb2 is None:
            return 0.0

        # Convert (x, y, w, h) boxes to corner form.
        bb1 = {'x1': bb1['x'], 'x2': bb1['x'] + bb1['w'], 'y1': bb1['y'], 'y2': bb1['y'] + bb1['h'], 'text': bb1['text']}
        bb2 = {'x1': bb2['x'], 'x2': bb2['x'] + bb2['w'], 'y1': bb2['y'], 'y2': bb2['y'] + bb2['h'], 'text': bb2['text']}

        x_left = max(bb1['x1'], bb2['x1'])
        y_top = max(bb1['y1'], bb2['y1'])
        x_right = min(bb1['x2'], bb2['x2'])
        y_bottom = min(bb1['y2'], bb2['y2'])

        if x_right < x_left or y_bottom < y_top:
            return 0.0

        intersection_area = (x_right - x_left) * (y_bottom - y_top)
        bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
        bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

        union_area = float(bb1_area + bb2_area - intersection_area)
        if union_area <= 0.0:
            # Degenerate zero-area boxes: avoid dividing by zero.
            return 0.0
        iou = intersection_area / union_area

        total_len = len(bb1['text']) + len(bb2['text'])
        if total_len == 0:
            # Two empty texts are identical; avoid dividing by zero.
            text_factor = 1.0
        else:
            text_factor = 1 - distance(bb1['text'], bb2['text']) / total_len
        iou = iou * text_factor

        assert iou >= 0.0
        assert iou <= 1.0
        return iou

In [None]:
class MyClusters:
    """Container for a list of clusters; starts out empty."""

    def __init__(self):
        # Clusters collected so far.
        self.clusters = list()
        

In [None]:
%%time
import xlsxwriter

# Create the output workbook 'posts.xlsx' with a single worksheet.
workbook = xlsxwriter.Workbook('posts.xlsx')
worksheet = workbook.add_worksheet()
max_posts_in_cluster = max([len(cluster_posts) for _, cluster_posts in clusters.items()])
# Set the width of columns 0..max_posts_in_cluster to 54.
worksheet.set_column(0, max_posts_in_cluster, 54)

def int_to_char(i):
    """Convert a zero-based column index to spreadsheet column letters.

    0 -> 'A', 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', 701 -> 'ZZ', 702 -> 'AAA'.
    The previous version produced 'ZA' for 26, which addresses the wrong
    column.

    Raises ValueError for a negative index.
    """
    if i < 0:
        raise ValueError('column index must be non-negative')

    chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    char_int = ''
    # Column letters form a bijective base-26 number (there is no zero digit).
    remaining = i + 1
    while remaining > 0:
        remaining, digit = divmod(remaining - 1, len(chars))
        char_int = chars[digit] + char_int
    return char_int
        
import os

# The resized images are written to 'result/'; make sure it exists.
os.makedirs('result', exist_ok=True)

# Each cluster takes two worksheet rows: one with the images (linked to the
# post URL) and one with the user ids.
for cluster_id, cluster in clusters.items():
    for post_ind, post in enumerate(cluster):
        image = post['image']
        # Scale so that the longer side becomes 380 pixels.
        k = 380 / max(image.size)
        image = image.resize((int(image.size[0]*k), int(image.size[1]*k)))

        image_name = 'cl_{}_p_{}'.format(cluster_id, post_ind)
        image_path = 'result/{}.png'.format(image_name)
        image.save(image_path)

        row_ind = cluster_id * 2

        cell_ind = '{}{}'.format(int_to_char(post_ind), row_ind + 1)
        worksheet.insert_image(cell_ind, image_path)
        worksheet.write_url(cell_ind, post['post_url'])

        worksheet.set_row(row_ind, 300)

        worksheet.write(row_ind + 1, post_ind, post['user_id'])


workbook.close()

In [None]:
# Show every post of cluster 25, one image per row.
current_cluster = clusters[25]
# squeeze=False plus taking column 0 gives a 1-D array of axes even when the
# cluster holds a single post (a bare Axes object is not indexable).
fig, axis = plt.subplots(len(current_cluster), squeeze=False, figsize=(10,10*len(current_cluster)))
axis = axis[:, 0]

for index_post, cluster_post in enumerate(current_cluster):
    image = cluster_post['image']
    if image is None:
        continue

    axis[index_post].imshow(image)
    axis[index_post].annotate(cluster_post['post_url'].split('wall')[1],
        xy=(0.5, 10), xytext=(0, 10))

plt.show()

In [None]:
# 10 101 138
def get_cert_img(image):
    """Crop `image` to the box returned by get_cert_coord and resize it to 224x32.

    The crop box is [x, y, x + w, y + h] taken from the 'x', 'y', 'w', 'h'
    entries of the returned coordinates.
    """
    coords = get_cert_coord(image)
    left, top = coords['x'], coords['y']
    crop_box = [left, top, coords['w'] + left, coords['h'] + top]
    return image.crop(crop_box).resize((224, 32))

i = 3
j = 138

# Find the positions of the posts whose URL contains '_3426' and '_1033';
# i and j keep the last index when no post matches.
for i, user_has_cert in enumerate(users_has_cert):
    if  '_' + '3426' in user_has_cert['post_url']:
        break

for j, user_has_cert in enumerate(users_has_cert):
    if  '_' + '1033' in user_has_cert['post_url']:
        break


# Take the first post of clusters 29 and 35 with their 20 largest boxes.
# The helper is sort_biggest_BB; the previous spelling `sord_biggest_BB`
# raised NameError.
image1 = clusters[29][0]['image']
coords1 = sort_biggest_BB(clusters[29][0]['cert_coords'])[:20]
image2 = clusters[35][0]['image']
coords2 = sort_biggest_BB(clusters[35][0]['cert_coords'])[:20]


import matplotlib.patches as patches

# Draw both images side by side with their boxes scaled to pixels.
fig, ax = plt.subplots(1, 2, figsize=(20,10))
ax[0].imshow(image1)
for coord in coords1:
    rect = patches.Rectangle((coord['x']*image1.size[0],coord['y']*image1.size[1]),coord['w']*image1.size[0],coord['h']*image1.size[1],linewidth=1,edgecolor='r',facecolor='none')
    ax[0].add_patch(rect)

ax[1].imshow(image2)
for coord in coords2:
    rect = patches.Rectangle((coord['x']*image2.size[0],coord['y']*image2.size[1]),coord['w']*image2.size[0],coord['h']*image2.size[1],linewidth=1,edgecolor='r',facecolor='none')
    ax[1].add_patch(rect)

plt.show()

In [None]:
# Bounding boxes stored for the post at index `i`, displayed in ascending order of area (w*h).
bbs = users_has_cert[i]['cert_coords']
sorted(bbs, key=lambda bb: (bb['w']*bb['h']))

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt

# Neighbourhood radius for DBSCAN; also encoded in the output file name below.
eps = 0.5
# With min_samples=1 every sample is its own core point, so no sample gets the noise
# label -1 and the labels can be used directly as row indices of the subplot grid.
# NOTE(review): DBSCAN is fitted without metric='precomputed', so the rows of
# `dist_matrix` are treated as feature vectors - confirm this is intended.
clustering = DBSCAN(eps=eps, min_samples=1).fit(dist_matrix) # 01
cluster_inds = clustering.labels_

# Group the posts by cluster label. This must happen before any statistics are
# printed, otherwise they describe `clusters` left over from a previous run.
clusters = {cluster_ind: [] for cluster_ind in cluster_inds}
for cluster_ind, user_has_cert in zip(cluster_inds, users_has_cert):
    clusters[cluster_ind].append(user_has_cert)

max_posts_in_cluster = max(len(cluster_posts) for cluster_posts in clusters.values())
print(max_posts_in_cluster)
print((cluster_inds==0).sum())

# One row per cluster, one column per post. squeeze=False keeps `axis` 2-D even when
# there is a single cluster or every cluster holds a single post.
fig, axis = plt.subplots(len(clusters), max_posts_in_cluster,
                         figsize=(20, 20*len(clusters)//max_posts_in_cluster),
                         squeeze=False)

for cluster_id, cluster_posts in clusters.items():
    for index_post, cluster_post in enumerate(cluster_posts):
        image = cluster_post['image'] # image cert_img
        if image is None:
            continue

        cell = axis[cluster_id][index_post]
        cell.imshow(image)
        # Label the subplot with the part of the post URL that follows 'wall'.
        cell.annotate(cluster_post['post_url'].split('wall')[1],
            xy=(0.5, 10), xytext=(0, 10))

        # 'cert_coords' is used as a list of boxes elsewhere in the notebook; a single
        # box dict is accepted too. Box values are scaled by the image size.
        cert_coords = cluster_post['cert_coords']
        if cert_coords is None:
            boxes = []
        elif isinstance(cert_coords, dict):
            boxes = [cert_coords]
        else:
            boxes = cert_coords

        img_w, img_h = image.size
        for bb in boxes:
            rect = patches.Rectangle(
                (bb['x']*img_w, bb['y']*img_h),
                bb['w']*img_w,
                bb['h']*img_h,
                linewidth=1, edgecolor='r', facecolor='none')
            cell.add_patch(rect)

#plt.show()
plt.savefig(str(int(eps*10)) + 'my_fig.png')

In [None]:
from PIL import Image
import pytesseract

from Levenshtein import distance

In [None]:
%%time
# Manual check of the service on 127.0.0.1:5000: POST one user id to /analyze_users
# and display the decoded JSON reply.
analyze_url = 'http://127.0.0.1:5000/analyze_users'
request_body = json.dumps({'users_ids': ['72092127']})
headers = {
    'Content-type': 'application/json',
    'Accept': 'text/plain',
    'Content-Encoding': 'utf-8',
}
res = requests.post(analyze_url, headers=headers, data=request_body)
res.json()

In [None]:
def get_members(group_id, offset, count, access_token=None):
    """Fetch one page of members of a VK group via ``groups.getMembers``.

    Args:
        group_id: Id of the group.
        offset: Offset of the first member to return.
        count: Maximum number of members to return in this page.
        access_token: VK API token. When omitted it is read from the
            ``VK_ACCESS_TOKEN`` environment variable, so the secret does not
            have to be stored in the notebook.
            NOTE(review): the token previously committed here should be revoked.

    Returns:
        The ``response.items`` list of the API reply.

    Raises:
        RuntimeError: if no access token is available.
        KeyError: if the reply contains no ``response`` key (e.g. an API error).
    """
    if access_token is None:
        access_token = os.environ.get('VK_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError('VK access token is missing: pass access_token or set VK_ACCESS_TOKEN')

    version = '5.37'
    # Let requests build and escape the query string instead of concatenating it by hand.
    params = {
        'access_token': access_token,
        'group_id': group_id,
        'v': version,
        'count': count,
        'offset': offset,
    }
    res = requests.get("https://api.vk.com/method/groups.getMembers", params=params)
    payload = res.json()
    if 'response' not in payload:
        # Keep the exception type of the plain lookup, but say what the API answered.
        raise KeyError('groups.getMembers returned no "response": {!r}'.format(payload.get('error', payload)))
    return payload['response']['items']

def get_all_members(group_id):
    """Collect all members of a group by requesting pages of 1000 from ``get_members``.

    Paging stops after the first page whose length is not exactly 1000.

    Args:
        group_id: Group id, passed through to ``get_members``.

    Returns:
        A list with the items of all fetched pages, in request order.
    """
    page_size = 1000
    collected = []
    page_no = 0
    while True:
        # Offset and count are passed as strings.
        batch = get_members(group_id, str(page_no * page_size), str(page_size))
        page_no += 1
        collected += batch
        if len(batch) != page_size:
            return collected

# Fetch all members of the group with this id and display how many were returned.
res = get_all_members('168297642')
len(res)

In [None]:
%%time
# If you don't have tesseract executable in your PATH, include the following:
#pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# Simple image to string
def find_in_text(text, target_words=['сертификат']):
    """Tell whether ``text`` contains a word close to one of ``target_words``.

    The text is split on newlines and then on single spaces. Only words longer
    than 5 characters are considered; each is lower-cased and matches when its
    Levenshtein ``distance`` to a target word is below 4.

    Args:
        text: Text to scan (e.g. OCR output).
        target_words: Words to look for; compared as given (not lower-cased).

    Returns:
        True on the first match, otherwise False.
    """
    return any(
        distance(word.lower(), target_word) < 4
        for target_word in target_words
        for line in text.split('\n')
        for word in line.split(' ')
        if len(word) > 5
    )

# OCR the local file '2.jpg' with lang='rus', then check the recognised text for the target word.
text = pytesseract.image_to_string(Image.open('2.jpg'), lang='rus')
find_in_text(text, target_words=['сертификат'])

In [None]:
import json
import requests


In [None]:
def posts_from_wall(ownerId, count, offset, access_token=None):
    """Fetch a page of wall posts via the VK ``wall.get`` method.

    Only posts written by the wall owner are requested (``filter=owner``) and
    the reply is not extended (``extended=0``).

    Args:
        ownerId: Id of the wall owner.
        count: Number of posts to request.
        offset: Offset of the first post.
        access_token: VK API token. When omitted it is read from the
            ``VK_ACCESS_TOKEN`` environment variable, so the secret does not
            have to be stored in the notebook.
            NOTE(review): the token previously committed here should be revoked.

    Returns:
        The decoded JSON reply as returned by the API (not unwrapped).

    Raises:
        RuntimeError: if no access token is available.
    """
    if access_token is None:
        access_token = os.environ.get('VK_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError('VK access token is missing: pass access_token or set VK_ACCESS_TOKEN')

    # Let requests build and escape the query string instead of concatenating it by hand.
    params = {
        'access_token': access_token,
        'owner_id': ownerId,
        'v': '5.37',
        'extended': '0',
        'filter': 'owner',
        'count': count,
        'offset': offset,
    }
    res = requests.get("https://api.vk.com/method/wall.get", params=params)
    return res.json()

# Request 30 wall posts of this owner id, starting at offset 0.
res = posts_from_wall('2736290', '30', '0')

In [None]:
def extract_images_data_from_res(res):
    """Collect the largest version of every photo attached to the posts in ``res``.

    Extraction is best effort: posts without attachments (or without the ids
    needed to build a link), non-photo attachments and photos without any
    ``photo_<size>`` entry are skipped individually. A non-photo attachment no
    longer discards the remaining photos of the same post.

    Args:
        res: Decoded ``wall.get`` reply; ``res['response']['items']`` is the
            list of posts.

    Returns:
        A list of ``{'photo_url': ..., 'post_url': ...}`` dicts, one per photo
        attachment, in the order posts and attachments appear.

    Raises:
        KeyError: if ``res`` has no ``response``/``items``.
    """
    images_paths = []
    for item in res['response']['items']:
        if 'from_id' not in item or 'id' not in item:
            continue
        post_url = 'https://vk.com/id{}?w=wall{}_{}'.format(item['from_id'], item['from_id'], item['id'])

        for attachment in item.get('attachments') or []:
            if attachment.get('type') != 'photo':
                continue

            # Keys look like 'photo_604'; the numeric suffix is used as the size rank.
            path_by_size = {}
            for key, path in (attachment.get('photo') or {}).items():
                suffix = key[len('photo_'):]
                if key.startswith('photo_') and suffix.isdigit():
                    path_by_size[int(suffix)] = path
            if not path_by_size:
                continue

            best_photo_path = path_by_size[max(path_by_size)]
            images_paths.append({'photo_url': best_photo_path, 'post_url': post_url})

    return images_paths
    
# Extract photo/post URL pairs from the wall reply stored in `res` and display them.
images_paths = extract_images_data_from_res(res)
images_paths

In [None]:
from io import BytesIO

def load_images(images_paths):
    """Download every photo listed in ``images_paths`` and open it as an image.

    Args:
        images_paths: Iterable of dicts with the keys ``'photo_url'`` and
            ``'post_url'``.

    Returns:
        A list of ``{'image': <opened image>, 'post_url': <post url>}`` dicts in
        input order.
    """
    def fetch(entry):
        # Read both keys up front so a malformed entry fails before any download.
        source_url, origin_post = entry['photo_url'], entry['post_url']
        raw_bytes = requests.get(source_url).content
        return {'image': Image.open(BytesIO(raw_bytes)), 'post_url': origin_post}

    return [fetch(entry) for entry in images_paths]
        
# Download and open every photo found in the previous step.
images = load_images(images_paths)

In [None]:
def find_in_text(text, target_words=['сертификат']):
    """Fuzzy search for any of ``target_words`` inside ``text``.

    Words are obtained by splitting on newlines and then on single spaces.
    Words of 5 characters or fewer are ignored; longer ones are lower-cased and
    accepted when their Levenshtein ``distance`` to a target word is below 4.

    Args:
        text: Text to scan.
        target_words: Words to look for; compared as given (not lower-cased).

    Returns:
        True as soon as one word matches, otherwise False.
    """
    for target_word in target_words:
        for line in text.split('\n'):
            for token in line.split(' '):
                if len(token) <= 5:
                    continue
                if distance(token.lower(), target_word) < 4:
                    return True
    return False

def get_posts_with_certs(images):
    """Return the post URLs whose image text mentions the target word.

    Each image is OCR-ed with ``pytesseract`` (``lang='rus'``) and the
    recognised text is checked with ``find_in_text``.

    Args:
        images: Iterable of dicts with the keys ``'image'`` and ``'post_url'``.

    Returns:
        A list of the ``'post_url'`` values of matching entries, in input order.
    """
    def mentions_certificate(picture):
        recognized = pytesseract.image_to_string(picture, lang='rus')
        return find_in_text(recognized, target_words=['сертификат'])

    matching_urls = []
    for entry in images:
        picture, url = entry['image'], entry['post_url']
        if mentions_certificate(picture):
            matching_urls.append(url)
    return matching_urls
            
# Run the OCR check over the downloaded images and display the matching post URLs.
certs = get_posts_with_certs(images)
certs

In [None]:
import json
import requests

class TextFromVK ():
    def downloadPostsFromWall (self, ownerId, count, offset):
        access_token = '25527794e79a323559f47c29b1df2c3b6f1eb91d1f818a6c02867d4bf12c57fb7a8e3dc6830bc046ba482'
        version = '5.37'

        filter = "owner"
        extended = '0'

        link = "https://api.vk.com/method/wall.get?access_token=" + access_token + "&owner_id=" + ownerId + "&v=" + version + "&extended=" + extended + "&filter=" + filter + "&count=" + count + "&offset=" + offset
        f = urllib.request.urlopen(link)
        return json.loads(f.read())
    
    def deEmojify(self, inputString):
        allchars = [str for str in inputString]
        emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
        clean_text = ' '.join([str for str in inputString.split() if not any(i in str for i in emoji_list)])
        return clean_text
    
    def getArrOfTexts (self, groupId, count, offset):
        jsonRes = self.downloadPostsFromWall (groupId, count, offset)
        arrOfText = []
        #reg = re.compile(r'^0-9A-Za-zА-Яа-я ')
        for item in jsonRes['response']['items']:
            if (item['text'] != ''):
                arrOfText.append(self.deEmojify(item['text']).replace('\n', ''))
        return arrOfText