In [1]:
import numpy as np
import pandas as pd
import tqdm

import catboost as cat
from catboost import CatBoostClassifier

## Загрузим данные

In [2]:
%%time
edges = pd.read_csv('./edges.csv')
ids = pd.read_csv('./ids.csv')
vertices = pd.read_csv('./vertices.csv')

CPU times: user 2.85 s, sys: 160 ms, total: 3.01 s
Wall time: 1.35 s


In [3]:
vertices['main_okved'] = vertices['main_okved'].astype(str)

In [4]:
np.random.seed(3333)

In [32]:
counts = edges.id_1.append(edges.id_2).reset_index(drop=True).value_counts()
counts = pd.DataFrame(counts.rename("id"))
merges = counts.merge(ids, left_index=True, right_on="id", how="left", indicator=True)
merges.index = merges.id
counts = counts[merges._merge == "both"]
counts /= counts.sum() / 100
counts

Unnamed: 0,id
524354,3.529384
61537,2.668124
1142564,2.292733
300432,2.030198
58408,1.932786
...,...
1392199,0.605852
912470,0.605852
1526265,0.604664
1523148,0.602288


In [56]:
counts_div = np.ceil(counts * 1000).astype("int")
ceil_sum = counts_div.sum()
v = ids.id.iloc[0]
counts_div.id.loc[v] = int(counts_div.id.loc[v] + 100000 - ceil_sum)
counts_div, counts_div.sum()

(           id
 524354   3484
 61537    2669
 1142564  2293
 300432   2031
 58408    1933
 ...       ...
 1392199   606
 912470    606
 1526265   605
 1523148   603
 373737    600
 
 [100 rows x 1 columns], id    100000
 dtype: int64)

## Обучим модель

In [57]:
result = pd.DataFrame(columns=['id_1', 'id_2'])

In [61]:
# для каждой вершины из ids с помощью catboost найдем 1000 самых вероятных ребер
for i, (v,) in tqdm.tqdm(counts_div.iterrows()):
    # соберем датасет из всех возможных вершин
    # вершины имеющие в исходных данных ребро с i обозначим 1, остальные 0
    # учтем то, что вершина i может быть как среди id_1, так и среди id_2
    df1 = edges[edges['id_1'] == i].reset_index()
    df2 = edges[edges['id_2'] == i].reset_index()

    df = df1[['id_2', 'id_1']].rename(columns={'id_1':'id_2', 'id_2':'id_1'}).append(df2[['id_1', 'id_2']])
    df['target'] = 1
    
    df = vertices.set_index('id').join(df.set_index('id_1')['target']).fillna(0)
    
    
    X = df[['main_okved', 'region_code', 'company_type']]
    y = df['target']
    
    model = CatBoostClassifier(iterations=500, task_type="GPU", gpu_ram_part=0.5, random_seed=3333, verbose=False)
    cat_features = [0,1,2] # все признаки категориальные
    
    model.fit(X, y, cat_features)

    preds = model.predict_proba(X)[:,1]

    df['preds'] = preds
    df['id_2'] = i
    
    # возьмем первую 1000 предсказанных ребер, исключив те, про которые мы уже знали
    res = df[df['target'] != 1].sort_values(by='preds', ascending=False).iloc[:v].reset_index()[['id', 'id_2']]
    res.columns = ['id_1', 'id_2']
    
    result = result.append(res, ignore_index=True, sort=False)











































































































































































































100it [1:00:40, 36.41s/it][A[A


## Результат готов к отправке

In [62]:
result.to_csv('submission.csv', index = False)

In [64]:
result.drop_duplicates(inplace=False)

Unnamed: 0,id_1,id_2
0,622334,524354
1,160171,524354
2,230859,524354
3,474353,524354
4,54938,524354
...,...,...
103479,1060892,373737
103480,944705,373737
103481,859978,373737
103482,867749,373737


In [65]:
result

Unnamed: 0,id_1,id_2
0,622334,524354
1,160171,524354
2,230859,524354
3,474353,524354
4,54938,524354
...,...,...
103479,1060892,373737
103480,944705,373737
103481,859978,373737
103482,867749,373737


In [69]:
result.drop_duplicates(inplace=False)[:100000].to_csv('submission.csv', index = False)