In [1]:
import numpy as np
import pandas as pd
import tqdm

import catboost as cat
from catboost import CatBoostClassifier

## Загрузим данные

In [2]:
%%time
edges = pd.read_csv('./edges.csv')
ids = pd.read_csv('./ids.csv')
vertices = pd.read_csv('./vertices.csv')

CPU times: user 2.79 s, sys: 201 ms, total: 2.99 s
Wall time: 1.33 s


In [3]:
vertices['main_okved'] = vertices['main_okved'].astype(str)

In [4]:
np.random.seed(3333)

In [5]:
counts = edges.id_1.append(edges.id_2).reset_index(drop=True).value_counts()
counts = pd.DataFrame(counts.rename("id"))
merges = counts.merge(ids, left_index=True, right_on="id", how="left", indicator=True)
merges.index = merges.id
counts = counts[merges._merge == "both"]
counts /= counts.sum() / 100
counts = np.ceil(counts * 1000).astype("int")
counts

Unnamed: 0,id
524354,3530
61537,2669
1142564,2293
300432,2031
58408,1933
...,...
1392199,606
912470,606
1526265,605
1523148,603


In [6]:
def normalize(df, col, by="value"):
    dist = df.sort_values(by=[col, by])[col].reset_index()
    return (dist["index"].rename(col) + 1) / len(df)

## Обучим модель

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.regularizers import l2

model = Sequential([
    Dense(100, input_shape=(len(x.columns),)),
    Dropout(0.5),
    Activation("relu"),
    Dense(50, kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)),
    Dropout(0.3),
    Activation("relu"),
    Dense(4, activation="softmax")
])
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
w_save = model.get_weights()

NameError: name 'Sequential' is not defined

In [8]:
result = pd.DataFrame(columns=["id_1", "id_2"])
# для каждой вершины из ids с помощью catboost найдем 1000 самых вероятных ребер
for i, (v,) in tqdm.tqdm(counts.iterrows()):
    # соберем датасет из всех возможных вершин
    # вершины имеющие в исходных данных ребро с i обозначим 1, остальные 0
    # учтем то, что вершина i может быть как среди id_1, так и среди id_2
    df1 = edges[edges['id_1'] == i].reset_index()
    df2 = edges[edges['id_2'] == i].reset_index()

    df = df1[['id_2', 'id_1', "n_transactions", "value"]].rename(columns={'id_1':'id_2', 'id_2':'id_1'}).append(df2[['id_1', 'id_2', "n_transactions", "value"]])
    df['target'] = 1
    df["weight"] = normalize(df, "value") * normalize(df, "n_transactions") * 2 + 1
    
    df = vertices.set_index('id').join(df.set_index('id_1')[['target', "weight"]]).fillna({"target": 0, "weight": 1})
    
    X = df[['main_okved', 'region_code', 'company_type']]
    w = df["weight"]
    y = df['target']
    

    model = CatBoostClassifier(iterations=500, task_type="GPU", gpu_ram_part=0.8, random_seed=3333, verbose=False)
    cat_features = [0,1,2] # все признаки категориальные
    
    model.fit(X, y, cat_features)

    preds = model.predict_proba(X)[:,1]

    df['preds'] = preds
    df['id_2'] = i
    
    # возьмем первую 1000 предсказанных ребер, исключив те, про которые мы уже знали
    res = df[df['target'] != 1].sort_values(by='preds', ascending=False).iloc[:v].reset_index()[['id', 'id_2']]
    res.columns = ['id_1', 'id_2']
    
    result = result.append(res, ignore_index=True, sort=False)

100it [1:00:50, 36.50s/it]


## Результат готов к отправке

In [9]:
result.drop_duplicates(inplace=False)[:100000].to_csv('submission.csv', index=False)