In [1]:
import pandas as pd
import numpy as np
import os
import json
from rq import Queue
from redis import Redis

import time
import parse_utils
from extract_info_from_input_8 import extract_info_from_df
import copy
import db_utils
import search_utils
from search_theme_color import theme_compare
import match_utils



In [2]:
# Column names used for the per-text style table.
text_info_columns = [
    'file_name',
    'name',
    'info_key',
    'font_size',
    'color',
    'margin_right',
    'letter_spacing',
    'text_position',
    'punctuation_wrap',
    'margin_left',
    'font_name',
    'font_style',
    'margin_bottom',
    'content',
    'font_variant',
    'font_weight',
    'font_size_asian',
    'text_underline_style',
    'text_indent',
    'text_shadow',
    'text_line_through_style',
    'text_align',
    'writing_mode',
    'font_name_asian',
    'language',
    'margin_top',
    'text_transform',
    'line_height',
]

# Columns shared by every kind of element.
common_columns = [
    'subject',
    'class',
    'type',
    'name',
    'width',
    'height',
    'file_name',
    'type_unique',
    'key',
    'estimate_title',
    'font_size_estimate_title',
]

# Here "shape" refers to an element.
shape_columns = [
    'shape_height',
    'shape_width',
    'shape_x',
    'shape_y',
    'shape_area',
    'shape_num',
    'text_num',
    'over_shape_flag',
    'over_shape_num',
    'over_shape_area',
    'over_shape_area_rate',
    'over_text_flag',
    'over_text_num',
    'over_text_area',
    'over_text_area_rate',
]

# Here "text" refers to a text box.
# 'language' through 'font-size' refer to the first entry inside the text box.
text_columns = [
    'content_size',
    'content',
    'line',
    'slope',
    'language',
    'font-size-asian',
    'font-name-asian',
    'font-weight',
    'font-name',
    'font-style',
    'font-size',
]

# Full column set: common, then shape, then text columns (order matters).
all_columns = [*common_columns, *shape_columns, *text_columns]


##################################################
#                  Load the database
##################################################
version = 'v4'
# db_path = '/home/nd/ppt_server/ppt_auto_layout/datasets/data_ppt'
db_path =  r'E:\Dataset\datasets\datasets\data_ppt'
file1_db = '29_work_template'

# Every table is read from <db_path>/<version>/<kind>/<file1_db>/<csv name>.
image_db = pd.read_csv(
    os.path.join(db_path, version, 'image', file1_db, 'ppt_color.csv')
)

# Six layout tables: ppt_scd_0.csv ... ppt_scd_5.csv, kept in index order.
layout_db_list = [
    pd.read_csv(
        os.path.join(db_path, version, 'db', file1_db, f'ppt_scd_{table_idx}.csv')
    )
    for table_idx in range(6)
]

all_db = pd.read_csv(
    os.path.join(db_path, version, 'all', file1_db, 'all.csv')
)
text_db = pd.read_csv(
    os.path.join(db_path, version, 'all', file1_db, 'text.csv')
)


In [3]:
# Print the (rows, columns) shape of each layout table, one per line.
for layout_shape in (layout_table.shape for layout_table in layout_db_list):
    print(layout_shape)

(2435, 39)
(163, 39)
(159, 39)
(123, 39)
(36, 39)
(10, 39)


In [4]:
# List the column names of the first table in layout_db_list.
layout_db_list[0].columns

Index(['avg_graphic_area', 'avg_shape_area', 'avg_text_area',
       'avg_text_content', 'chart_num', 'estimate_title_num', 'file_name',
       'font_size_estimate_title_num', 'graphic_num', 'height',
       'max_font_group', 'max_font_group_num', 'max_graphic_area',
       'max_graphic_group', 'max_graphic_group_num', 'max_shape_area',
       'max_shape_group', 'max_text_area', 'max_text_area_group',
       'max_text_area_group_avg_content_size', 'max_text_content',
       'max_text_font_group_avg_content_size', 'max_text_group',
       'max_text_group_num', 'new_title_num', 'outline_num', 'shape_num',
       'table_num', 'text_num', 'title_num', 'width', 'page_type', 'text_type',
       'chart_type', 'tuwen_num', 'tuwen_layout', 'tuwen_color', 'rgraph_type',
       'rgpaph_num'],
      dtype='object')

In [5]:
# Use fully-qualified option names. pd.set_option treats the key as a pattern
# that must match exactly one option; the abbreviated 'max_columns' also matches
# 'styler.render.max_columns' on newer pandas and raises OptionError there.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)
# Preview the first rows of the first layout table.
layout_db_list[0].head()

Unnamed: 0,avg_graphic_area,avg_shape_area,avg_text_area,avg_text_content,chart_num,estimate_title_num,file_name,font_size_estimate_title_num,graphic_num,height,max_font_group,max_font_group_num,max_graphic_area,max_graphic_group,max_graphic_group_num,max_shape_area,max_shape_group,max_text_area,max_text_area_group,max_text_area_group_avg_content_size,max_text_content,max_text_font_group_avg_content_size,max_text_group,max_text_group_num,new_title_num,outline_num,shape_num,table_num,text_num,title_num,width,page_type,text_type,chart_type,tuwen_num,tuwen_layout,tuwen_color,rgraph_type,rgpaph_num
0,236.87685,417.385675,66.622979,53.0,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo1.xml.json,1,1,19.05,1,0,236.87685,1,0,701.605476,1,111.12496,0.0,0.0,102.0,0.0,1,0,0,0,5,0,2,0,33.867,0,0,0,1,7,1,0,0
1,329.775462,324.81121,35.583905,27.25,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,1,1,19.05,2,1,329.775462,1,0,645.113434,1,58.821308,58.821308,52.0,52.0,52.0,2,1,0,0,5,0,4,0,33.867,0,0,0,1,8,1,0,0
2,283.525461,319.908225,44.476255,32.0,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo3.xml.json,1,2,19.05,3,1,357.525306,1,0,645.82462,1,61.671701,61.671701,52.0,52.0,41.333333,2,1,0,0,6,0,4,0,33.867,0,0,0,1,9,1,0,0
3,539.715317,320.171884,34.226064,26.0,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo4.xml.json,1,1,19.05,2,1,539.715317,1,0,645.82462,2,55.122067,0.0,0.0,52.0,52.0,1,0,0,0,6,0,5,0,33.867,0,0,0,1,8,1,0,0
4,0.0,348.651571,72.309172,60.0,1,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo5.xml.json,1,0,19.05,2,1,0.0,0,0,719.860743,1,162.3659,0.0,0.0,156.0,88.0,1,0,0,0,7,0,3,0,33.867,0,0,1,0,0,0,0,0


In [6]:
# Display the first layout table; the rendered output elides the middle rows.
layout_db_list[0]

Unnamed: 0,avg_graphic_area,avg_shape_area,avg_text_area,avg_text_content,chart_num,estimate_title_num,file_name,font_size_estimate_title_num,graphic_num,height,max_font_group,max_font_group_num,max_graphic_area,max_graphic_group,max_graphic_group_num,max_shape_area,max_shape_group,max_text_area,max_text_area_group,max_text_area_group_avg_content_size,max_text_content,max_text_font_group_avg_content_size,max_text_group,max_text_group_num,new_title_num,outline_num,shape_num,table_num,text_num,title_num,width,page_type,text_type,chart_type,tuwen_num,tuwen_layout,tuwen_color,rgraph_type,rgpaph_num
0,236.876850,417.385675,66.622979,53.000000,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo1.xml.json,1,1,19.05,1,0,236.876850,1,0,701.605476,1,111.124960,0.000000,0.0,102.0,0.000000,1,0,0,0,5,0,2,0,33.867,0,0,0,1,7,1,0,0
1,329.775462,324.811210,35.583905,27.250000,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,1,1,19.05,2,1,329.775462,1,0,645.113434,1,58.821308,58.821308,52.0,52.0,52.000000,2,1,0,0,5,0,4,0,33.867,0,0,0,1,8,1,0,0
2,283.525461,319.908225,44.476255,32.000000,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo3.xml.json,1,2,19.05,3,1,357.525306,1,0,645.824620,1,61.671701,61.671701,52.0,52.0,41.333333,2,1,0,0,6,0,4,0,33.867,0,0,0,1,9,1,0,0
3,539.715317,320.171884,34.226064,26.000000,0,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo4.xml.json,1,1,19.05,2,1,539.715317,1,0,645.824620,2,55.122067,0.000000,0.0,52.0,52.000000,1,0,0,0,6,0,5,0,33.867,0,0,0,1,8,1,0,0
4,0.000000,348.651571,72.309172,60.000000,1,0,20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo5.xml.json,1,0,19.05,2,1,0.000000,0,0,719.860743,1,162.365900,0.000000,0.0,156.0,88.000000,1,0,0,0,7,0,3,0,33.867,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2430,0.000000,331.246842,53.403669,33.666667,0,0,20200714 【已筛选】简约通用风格-纯文本（99页）/逼格PPT-【NEW】black+for+free-（6页）/lo1.xml.json,1,0,19.05,1,0,0.000000,0,0,503.910036,1,88.693615,0.000000,0.0,87.0,0.000000,1,0,0,0,2,0,3,0,33.867,0,3,0,0,0,0,0,0
2431,0.000000,12.283110,18.228586,13.500000,0,0,20200714 【已筛选】简约通用风格-纯文本（99页）/逼格PPT-【NEW】black+for+free-（6页）/lo2.xml.json,1,0,19.05,3,3,0.000000,0,0,12.283110,1,69.839559,22.356044,35.0,35.0,4.000000,3,3,0,0,1,0,10,0,33.867,0,1,0,0,0,0,0,0
2432,0.000000,0.000000,328.600769,5.500000,0,0,20200714 【已筛选】简约通用风格-纯文本（99页）/逼格PPT-【NEW】black+for+free-（6页）/lo3.xml.json,0,0,19.05,1,0,0.000000,0,0,0.000000,0,367.756596,0.000000,0.0,11.0,0.000000,1,0,0,0,0,0,2,0,33.867,0,3,0,0,0,0,0,0
2433,0.000000,2.059601,55.918107,67.000000,0,0,20200714 【已筛选】简约通用风格-纯文本（99页）/逼格PPT-【NEW】black+for+free-（6页）/lo4.xml.json,0,0,19.05,2,2,0.000000,0,0,2.059601,14,112.307830,7.974498,6.0,235.0,6.000000,2,1,0,0,14,0,5,0,33.867,0,3,0,0,0,0,0,0


In [7]:
# Path to one file under the v4 'info' directory of the dataset.
a = r'E:\Dataset\datasets\datasets\data_ppt\v4\info\29_work_template\20200423 【已筛选】优秀PPT模板收集（112页）\动感欧美商务PPT模板（7页）\lo2.xml.json'

# NOTE: despite the '.json' suffix, this file is parsed as CSV below; the JSON
# loader is left commented out.
# with open(a,encoding='utf-8') as f:
#     file_json = json.loads(f.read())
# file_json
pd.read_csv(a)

Unnamed: 0,color,content,file_name,font_name,font_name_asian,font_size,font_size_asian,font_style,font_variant,font_weight,info_key,language,letter_spacing,line_height,margin_bottom,margin_left,margin_right,margin_top,name,punctuation_wrap,text_align,text_indent,text_line_through_style,text_position,text_shadow,text_transform,text_underline_style,writing_mode
0,#ffffff,CEO,29_work_template/20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,Myriad Pro,微软雅黑 Light2,18pt,18pt,normal,normal,normal,0,en,normal,100%,0cm,0cm,0cm,0cm,矩形 16,hanging,start,0cm,none,0% 100%,none,none,none,lr-tb
1,#1ebcad,TEXT HERE,29_work_template/20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,微软雅黑 Light,微软简1,44pt,44pt,normal,normal,bold,0,en,normal,100%,0cm,0cm,0cm,0cm,文本框 12,hanging,start,0cm,none,0% 100%,none,none,none,lr-tb
2,#ffffff,"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas porttitor congue massa. Fusce...",29_work_template/20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,Myriad Pro,微软简1,18pt,18pt,normal,normal,normal,0,en,normal,150%,0cm,0cm,0cm,0cm,文本框 13,hanging,start,0cm,none,0% 100%,none,none,none,lr-tb
3,#ffffff,"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas porttitor congue massa. Fusce...",29_work_template/20200423 【已筛选】优秀PPT模板收集（112页）/动感欧美商务PPT模板（7页）/lo2.xml.json,Myriad Pro,微软简1,18pt,18pt,normal,normal,normal,0,en,normal,150%,0cm,0cm,0cm,0cm,文本框 19,hanging,start,0cm,none,0% 100%,none,none,none,lr-tb


# 从原始json开始

In [9]:
# Raw string: the Windows path contains backslash sequences (\D, \p, \L) that are
# invalid escapes in a normal string literal and trigger a Deprecation/SyntaxWarning.
# The resulting string value is unchanged.
a = r'E:\Dataset\ppt_template\LYPPT333【蛊】散逸了永恒的星云游-（15页）.pptx_2364b53a-ebc2-4d0a-a3ae-422c707424d0\LYPPT333【蛊】散逸了永恒的星云游-（15页）.pptx_0.json'
with open(a, encoding='utf-8') as f:
    # Parse straight from the file handle instead of reading into a string first.
    file_json = json.load(f)
file_json

{'AssemblyName': 'UDMPlugin',
 'TypeName': 'UDMPlugin.UDMPage',
 'size': {'AssemblyName': 'UDMPlugin',
  'TypeName': 'UDMPlugin.UDMSize',
  'width': '960.0095',
  'height': '540'},
 'pageIndex': '0',
 'fillProperties': {'AssemblyName': 'UDMPlugin',
  'TypeName': 'UDMPlugin.UDMFillProperties',
  'fillType': 'Solid',
  'solidFill': {'AssemblyName': 'UDMPlugin',
   'TypeName': 'UDMPlugin.UDMSolidFill',
   'color': {'AssemblyName': 'UDMPlugin',
    'TypeName': 'UDMPlugin.UDMColor',
    'r': '13',
    'g': '13',
    'b': '13',
    'a': '255'}},
  'dontRotateWithShape': 'False'},
 'itemList': [{'AssemblyName': 'UDMPlugin',
   'TypeName': 'UDMPlugin.UDMImage',
   'luminanceEffect': {'AssemblyName': 'UDMPlugin',
    'TypeName': 'UDMPlugin.UDMLuminanceEffect',
    'brightness': '0',
    'contrast': '0'},
   'imagePath': 'lobin.xmlbase64export2040080564.bin      ',
   'rectangle': {'AssemblyName': 'UDMPlugin',
    'TypeName': 'UDMPlugin.UDMRectangle',
    'point': {'AssemblyName': 'UDMPlugin',
 

In [13]:
# Scale factor between the raw-JSON page width (960.0095) and the layout-DB
# page width (33.867).
t = 960.0095 / 33.867

# Convert a sample raw (width, height) pair with that factor.
(959.9811 / t, 539.9717 / t)

(33.86599811116452, 19.049000623327167)

In [10]:
# Ratio between raw-JSON coordinates and the coordinates used in the output:
# a page that is 960.0095 wide in the raw JSON becomes 33.867 wide.
_RAW_UNITS_PER_OUTPUT_UNIT = 960.0095 / 33.867


def _scaled(value, rate):
    """Parse ``value`` as a float, divide it by ``rate`` and return it as a string."""
    return str(float(value) / rate)


def _base_element(item, rate, class_name):
    """Build the fields shared by every output element: scaled geometry, name, class."""
    rectangle = item['rectangle']
    point = rectangle['point']
    size = rectangle['size']
    return {
        'x': _scaled(point['X'], rate),
        'y': _scaled(point['Y'], rate),
        'height': _scaled(size['height'], rate),
        'width': _scaled(size['width'], rate),
        'name': item['itemName'],
        'class': class_name,
    }


def _flatten_font(one_text, source_key, prefix, name_key):
    """Flatten the optional font dict ``one_text[source_key]`` into prefixed keys.

    When ``source_key`` is absent every produced value is the empty string.
    """
    present = source_key in one_text
    font = one_text[source_key] if present else None

    def pick(attr):
        return font[attr] if present else ''

    return {
        prefix + '_AssemblyName': pick('AssemblyName'),
        prefix + '_TypeName': pick('TypeName'),
        prefix + '_fontFamily': pick('fontFamily'),
        name_key: pick('fontFamily'),
    }


def _text_run_info(one_text):
    """Convert one entry of a paragraph's ``textList`` into a flat info dict."""
    info = {
        'language': one_text['language'],
        'font-size': one_text['fontSize'],
        # The raw run carries a single size; it is reported under both keys.
        'font-size-asian': one_text['fontSize'],
        'bold': one_text['bold'],
        # 'font-weight' mirrors the raw 'bold' flag.
        'font-weight': one_text['bold'],
        'alternativeLanguage': one_text['alternativeLanguage'],
        'textUnderline': one_text['textUnderline'],
        'textStrike': one_text['textStrike'],
        'capital': one_text['capital'],
        'kumimoji': one_text['kumimoji'],
        'italic': one_text['italic'],
        'content': one_text['content'],
        'spacing': one_text['spacing'],
        'baseLine': one_text['baseLine'],
        'font-style': '',
    }
    # NOTE: the output prefix for 'eaFont' is 'esFont' (sic); it is kept unchanged
    # so the produced keys stay the same.
    info.update(_flatten_font(one_text, 'eaFont', 'esFont', 'font-name-asian'))
    info.update(_flatten_font(one_text, 'latinFont', 'latinFont', 'font-name'))
    return info


def _collect_text_runs(item):
    """Return the info dicts for every text run found under ``item['textBody']``."""
    infos = []
    if 'textBody' not in item:
        return infos
    text_body = item['textBody']
    if 'paragraphList' not in text_body:
        return infos
    for one_paragraph in text_body['paragraphList']:
        if 'textList' not in one_paragraph:
            continue
        for one_text in one_paragraph['textList']:
            infos.append(_text_run_info(one_text))
    return infos


def get_settle_json(json_data):
    """Convert a raw page JSON dict into a flat dict of texts and shapes.

    Args:
        json_data: dict with ``size`` (``width``/``height``), ``uniqueId`` and
            ``itemList``. Each item needs ``type``; converted items also need
            ``rectangle`` (``point`` with ``X``/``Y``, ``size`` with
            ``width``/``height``), ``itemName`` and ``uniqueId``.

    Returns:
        dict with keys ``width``, ``height``, ``Texts``, ``Shapes`` and
        ``all_uniqueId``. All coordinates and sizes are divided by a fixed ratio
        (960.0095 / 33.867) and stored as strings.
        Items whose ``type`` is ``'TextBox'`` go to ``Texts``; other items that
        have an ``imagePath`` key, or whose ``type`` is ``'Shape'``, go to
        ``Shapes``. Any other item is skipped.

    Raises:
        KeyError: if a required key is missing from the page, an item, or a
            text run.
    """
    rate = _RAW_UNITS_PER_OUTPUT_UNIT

    json_map = {
        'width': _scaled(json_data['size']['width'], rate),
        'height': _scaled(json_data['size']['height'], rate),
        'Texts': [],
        'Shapes': [],
        'all_uniqueId': json_data['uniqueId'],
    }

    for item in json_data['itemList']:
        item_type = item['type']
        if item_type == 'TextBox':
            text_one = _base_element(item, rate, 'TextShape')
            text_one['Infos'] = _collect_text_runs(item)
            text_one['uniqueId'] = item['uniqueId']
            json_map['Texts'].append(text_one)
        elif 'imagePath' in item:
            # Image check comes before the 'Shape' check, so an item carrying an
            # imagePath is always emitted as a GraphicObjectShape.
            shape_one = _base_element(item, rate, 'GraphicObjectShape')
            # Keep only the file name: last backslash-separated part, stripped.
            image_name = item['imagePath'].split('\\')[-1].strip()
            shape_one['BinName'] = [image_name]
            shape_one['uniqueId'] = item['uniqueId']
            json_map['Shapes'].append(shape_one)
        elif item_type == 'Shape':
            shape_one = _base_element(item, rate, 'Shape')
            shape_one['uniqueId'] = item['uniqueId']
            json_map['Shapes'].append(shape_one)

    return json_map


# Run the conversion on the page held in file_json and display the result.
get_settle_json(file_json)

{'width': '33.867',
 'height': '19.04999898438505',
 'Texts': [{'x': '18.866999049696904',
   'y': '7.654000314163557',
   'height': '3.0769999331777442',
   'width': '9.382999388651884',
   'name': '文本框 17',
   'class': 'TextShape',
   'Infos': [{'language': 'zh-CN',
     'font-size': '18',
     'font-size-asian': '18',
     'bold': 'True',
     'font-weight': 'True',
     'alternativeLanguage': 'en-US',
     'textUnderline': 'None',
     'textStrike': 'NoStrike',
     'capital': 'None',
     'kumimoji': 'False',
     'italic': 'False',
     'content': 'CLOUD',
     'spacing': '0',
     'baseLine': '0',
     'font-style': '',
     'esFont_AssemblyName': '',
     'esFont_TypeName': '',
     'esFont_fontFamily': '',
     'font-name-asian': '',
     'latinFont_AssemblyName': 'UDMPlugin',
     'latinFont_TypeName': 'UDMPlugin.UDMFont',
     'latinFont_fontFamily': '微软雅黑',
     'font-name': '微软雅黑'}],
   'uniqueId': '5d9414b0-2c55-441c-aba2-8d15f9d29258'},
  {'x': '19.280997860958667',
   '