In [1]:
import pandas as pd

In [2]:
inv = pd.read_csv("../data/opendata/samples/inventions_sample.csv")
mod = pd.read_csv("../data/opendata/samples/models_sample.csv")
des = pd.read_csv("../data/opendata/samples/designs_sample.csv")

In [32]:
inv_res = pd.read_excel("inv_test_result_v2.xlsx", index_col=0, dtype=str)
mod_res = pd.read_excel("mod_test_result_v2.xlsx", index_col=0, dtype=str)
des_res = pd.read_excel("des_test_result_v2.xlsx", index_col=0, dtype=str)

In [33]:
mod_res.head(2)

Unnamed: 0,patent_number,cor_address,name,tax_number,name_from_patent,individual,country,has_tn
0,120365,"660133, г.Красноярск, ул. Авиаторов, 1, стр.1,...","ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АРНИКА""",2460018787,"Общество с ограниченной ответственностью ""Арника""",False,RU,True
1,32373,"198328, Санкт-Петербург, пр-т маршала Захарова...",БЕЛЯЕВ АЛЕКСАНДР ГЕННАДЬЕВИЧ,780708385189,Беляев Александр Геннадьевич,True,RU,True


In [38]:
def prepare_part(d, d_with_res, kind):
    d_with_res["patent_number"] = d_with_res["patent_number"].astype(int)
    df = d_with_res.merge(
        d, how="left", left_on="patent_number", right_on="registration number"
    )
    df["reg_number"] = df["patent_number"]
    df["reg_date"] = pd.to_datetime(df["registration date"], format="%Y%m%d")
    df["appl_date"] = pd.to_datetime(df["application date"], format="%Y%m%d")
    df["author_raw"] = df["authors"].str[:100]
    df["owner_raw"] = df["patent holders"].str[:100]
    df["address"] = df["cor_address"]
    df["full_name"] = df["name"].str[:100]
    df["name"] = df[[c for c in df.columns if " name" in c][0]]
    df["category"] = "A"
    df["subcategory"] = "A"
    df["kind"] = kind
    df["region"] = "Moscow"
    df["city"] = "Moscow"
    df["author_count"] = 1
    df["active"] = True
    
    return df[[
        "reg_number", "reg_date", "appl_date",
        "author_raw", "owner_raw", "address",
        "name", "actual", "category", "subcategory", "kind",
        "region", "city", "author_count", 
        "individual", "full_name", "tax_number", "active"
    ]]

In [39]:
joint = pd.concat((
    prepare_part(inv, inv_res, 1),
    prepare_part(mod, mod_res, 2),
    prepare_part(des, des_res, 3),
))

In [40]:
joint.tail(2)

Unnamed: 0,reg_number,reg_date,appl_date,author_raw,owner_raw,address,name,actual,category,subcategory,kind,region,city,author_count,individual,full_name,tax_number,active
1065,78299,2011-05-16,2010-03-22,Желез Наталья Ильинична (RU)\r\nЖелез Сергей Б...,Желез Наталья Ильинична (RU),"440058, г. Пенза, ул. Тепличная, 8, кв. 205, Н...",ДИВАН УГЛОВОЙ СЕКЦИОННЫЙ,False,A,A,3,Moscow,Moscow,1,True,ЖЕЛЕЗ НАТАЛЬЯ ИЛЬИНИЧНА,583407642001,True
1066,59842,2006-08-16,2005-03-21,Черногорцев Дмитрий Викторович (RU)\r\nБурнашо...,"Общество с ограниченной ответственностью ""Бабк...",,УПАКОВКА ДЛЯ СЕМЕЧЕК (два варианта),False,A,A,3,Moscow,Moscow,1,False,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ТОРГ...",7725776379,True


In [41]:
reg_numbers = joint["reg_number"].unique()
reg_number_ids = pd.DataFrame(dict(reg_number=reg_numbers, patent_id=range(1, len(reg_numbers) + 1)))
reg_number_ids.head(2)

Unnamed: 0,reg_number,patent_id
0,2137261,1
1,2631279,2


In [42]:
tax_numbers = joint["tax_number"].unique()
tax_number_ids = pd.DataFrame(dict(tax_number=tax_numbers, person_id=range(1, len(tax_numbers) + 1)))
tax_number_ids.head(2)

Unnamed: 0,tax_number,person_id
0,772637744816.0,1
1,,2


In [43]:
full = joint.merge(reg_number_ids).merge(tax_number_ids)
full.head(2)

Unnamed: 0,reg_number,reg_date,appl_date,author_raw,owner_raw,address,name,actual,category,subcategory,kind,region,city,author_count,individual,full_name,tax_number,active,patent_id,person_id
0,2137261,1999-09-10,1998-08-04,Демидов Юрий Михайлович,Демидов Юрий \n\nМихайлович,"127560, Москва, ул.Коненкова, 5, кв.16, Демидо...",АКТИВНЫЙ МАТЕРИАЛ ЭЛЕКТРОДА ХИМИЧЕСКОГО ИСТОЧН...,False,A,A,1,Moscow,Moscow,1,True,ДЕМИДОВ ЮРИЙ МИХАЙЛОВИЧ,772637744816.0,True,1,1
1,2631279,2017-09-20,2016-03-18,Кочетов Олег Савельевич (RU),Кочетов Олег Савельевич (RU),"141191, Московская обл., г. Фрязино, ул. Горьк...",ШИРОКОФАКЕЛЬНАЯ ЦЕНТРОБЕЖНАЯ ФОРСУНКА,True,A,A,1,Moscow,Moscow,1,True,Кочетов Олег Савельевич,,True,2,2


In [76]:
patents = full[[
    "reg_number", "reg_date", "appl_date",
    "author_raw", "owner_raw", "address",
    "name", "actual", "category", "subcategory", "kind",
    "region", "city", "author_count",
    "patent_id"
]].drop_duplicates(subset=["kind", "patent_id"])
patents.drop(columns=["patent_id"], inplace=True)
patents.head(2)

Unnamed: 0,reg_number,reg_date,appl_date,author_raw,owner_raw,address,name,actual,category,subcategory,kind,region,city,author_count
0,2137261,1999-09-10,1998-08-04,Демидов Юрий Михайлович,Демидов Юрий \n\nМихайлович,"127560, Москва, ул.Коненкова, 5, кв.16, Демидо...",АКТИВНЫЙ МАТЕРИАЛ ЭЛЕКТРОДА ХИМИЧЕСКОГО ИСТОЧН...,False,A,A,1,Moscow,Moscow,1
1,2631279,2017-09-20,2016-03-18,Кочетов Олег Савельевич (RU),Кочетов Олег Савельевич (RU),"141191, Московская обл., г. Фрязино, ул. Горьк...",ШИРОКОФАКЕЛЬНАЯ ЦЕНТРОБЕЖНАЯ ФОРСУНКА,True,A,A,1,Moscow,Moscow,1


In [77]:
persons = full[[
    "individual", "tax_number", "full_name", "active", "person_id"
]].drop_duplicates("person_id")
persons.rename(columns={"individual": "kind"}, inplace=True)
persons.drop(columns=["person_id"], inplace=True)
persons["kind"] = persons["kind"].astype(bool).astype(int)
persons.dropna(subset=["tax_number"], inplace=True)
persons.head(2)

Unnamed: 0,kind,tax_number,full_name,active
0,1,772637744816,ДЕМИДОВ ЮРИЙ МИХАЙЛОВИЧ,True
1355,1,7728016351,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ УЧРЕЖДЕН...,True


In [78]:
ownership = full[["kind", "reg_number", "tax_number"]]
ownership.dropna(subset=["tax_number"], inplace=True)
ownership.drop_duplicates(inplace=True)
ownership.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ownership.dropna(subset=["tax_number"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ownership.drop_duplicates(inplace=True)


Unnamed: 0,kind,reg_number,tax_number
0,1,2137261,772637744816
1355,1,2731963,7728016351


In [80]:
patents.to_csv("patents-demo-v2.csv", index=False)
persons.to_csv("persons-demo-v2.csv", index=False)
ownership.to_csv("ownership-demo-v2.csv", index=False)