In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [15]:
# Load the test edges. Later cells read a frame called `test` that is never
# assigned anywhere else in the notebook, so bind it here.
# NOTE(review): presumably `test` is meant to be this file - confirm.
test = pd.read_csv("data/test.csv")
# Keep the original short name as an alias (same object, no extra memory).
dt = test

In [46]:
# Shape of the subset of test rows whose `t` value is missing (NaN).
dt[dt['t'].isna()].shape

(6247722, 7)

# Загрузка данных

В бейзлайне реализована простейшая модель на эго-графе.

Для каждой пары вершин u и v эго-графа попытаемся найти всех общих "друзей" w. Силой связи между вершинами u и v будем считать среднюю силу связи между ними и общими друзьями.

Поскольку обучение для такой модели не требуется, будем пользоваться только тестовой выборкой.

In [10]:
%%time

# Read the three input tables into notebook-level frames:
#   data       - edges from data/train.csv
#   attributes - per-vertex attributes from data/attr.csv
#   submission - the (ego_id, u, v) pairs to predict, from data/submission.csv
# NOTE(review): the markdown above says only the test set is needed, yet this
# cell loads train.csv - confirm which file the later cells are meant to use.
data = pd.read_csv("data/train.csv")
attributes = pd.read_csv("data/attr.csv")
submission = pd.read_csv("data/submission.csv")

CPU times: total: 49.5 s
Wall time: 49.5 s


In [71]:
# Preview the first rows of the attribute table.
attributes.head()

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
0,0,227,68,-1,1,778293348,-1
1,0,45,38,237065842,1,82803468,238500268
2,0,142,60,237065842,1,196560139,-1
3,0,280,66,-1,2,963209731,720783270
4,0,41,18,-1,2,308862409,-1


In [72]:
# generate cpp file
# Collect the first 100 attribute rows as plain tuples, one value per column,
# echoing each row so it can be inspected in the cell output.
result = []
for row in attributes.head(100).itertuples(index=False, name=None):
    print(row)
    result.append(row)

# Write the collected rows to `cpp_attributes`; the context manager closes the
# file even if a write fails.
# NOTE(review): the unfinished cell did not say what to write - one row per
# line with space-separated values is an assumption; adjust to whatever the
# C++ reader expects.
with open('cpp_attributes', 'w') as cpp_file:
    for row in result:
        cpp_file.write(' '.join(map(str, row)) + '\n')
    

(0, 227, 68, -1, 1, 778293348, -1)
(0, 45, 38, 237065842, 1, 82803468, 238500268)
(0, 142, 60, 237065842, 1, 196560139, -1)
(0, 280, 66, -1, 2, 963209731, 720783270)
(0, 41, 18, -1, 2, 308862409, -1)
(0, 284, 30, 515825832, 2, 523273865, 219762299)
(0, 39, 107, -1, 2, -1, -1)
(0, 204, 21, 298413605, 1, -1, 70724144)
(0, 224, 21, 298413605, 1, -1, -1)
(0, 138, 43, 238321946, 1, 551175147, 359213661)
(0, 24, 42, -1, 1, 305311702, 720787027)
(0, 271, 52, -1, 2, -1, -1)
(0, 102, 30, 515825832, 1, 249333954, 894086074)
(0, 129, 62, 237065842, 2, 217222933, -1)
(0, 188, 21, 237065842, 2, -1, 149014508)
(0, 218, 35, 237065842, 2, 532789710, 66999397)
(0, 270, 25, 720792568, 2, -1, -1)
(0, 52, 49, 238321946, 1, 829727092, 991369526)
(0, 68, 50, 237065842, 1, 682233918, 532275706)
(0, 287, 78, 237065842, 2, 497266803, -1)
(0, 209, 22, 237065842, 2, 75365905, 149014508)
(0, 51, 43, 237065842, 1, 829727092, 991369526)
(0, 30, 30, 237065842, 1, 639658522, 310894832)
(0, 144, 32, 237065842, 1, -1, 

In [60]:
def generate_cpp_files():
    """Unfinished stub for exporting the attribute table.

    Currently it only takes a copy of the notebook-level ``attributes`` frame;
    nothing is written to disk and nothing is returned.
    """
    attributes_for_cpp = attributes.copy()
    # Bare expression: inside a function this neither displays nor returns
    # the frame.
    attributes_for_cpp


generate_cpp_files()

In [59]:
# Select the attribute rows whose `university` value is NaN.
# NOTE(review): the preview of this table shows -1 in this column, presumably
# a missing-value sentinel rather than NaN - confirm.
attributes[attributes['university'].isna()]

Unnamed: 0,ego_id,u,age,city_id,sex,school,university


In [23]:
# Preview the first rows of the submission table.
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


**Разобьем на обучающие и тестовые данные**

In [22]:
# Look for submission rows that belong to ego-net 0.
submission[submission['ego_id'] == 0].head()

Unnamed: 0,ego_id,u,v,x1


In [5]:
# Hold-out split by ego-net id: ego-nets below `sep` go to the training part,
# the rest to the evaluation part.
# NOTE(review): `sep` is a magic number - document how it was chosen.
sep = 854698492346
data_train = data.loc[data['ego_id'] < sep]
data_test = data.loc[data['ego_id'] >= sep]

# Features are every column except the last three; the target is the
# third-from-last column.
X_train = data_train.iloc[:, :-3]
y_train = data_train.iloc[:, -3]
X_test = data_test.iloc[:, :-3]
y_test = data_test.iloc[:, -3]

In [14]:
# Inspect the `t` column of the training split.
data_train['t']

0           148.0
1           396.7
2             NaN
3           594.5
4            45.5
            ...  
61517082      0.3
61517083      0.3
61517084      0.2
61517085     51.7
61517086      0.3
Name: t, Length: 61517087, dtype: float64

**Для визуализации графа**

In [19]:
def draw_graph(ego_id=1709396984692, frame=None):
    """Print the edges of one ego-net as ``u-v`` lines, one edge per line.

    The output can be pasted into an external graph-drawing tool.

    Args:
        ego_id: Value of the ``ego_id`` column that selects the ego-net.
            Defaults to the id that used to be hard-coded here.
        frame: DataFrame with ``ego_id``, ``u`` and ``v`` columns. When
            omitted, the notebook-level ``data`` frame is used.

    Returns:
        None. The edges are written to standard output.
    """
    if frame is None:
        frame = data
    ego_net = frame[frame['ego_id'] == ego_id]
    for u, v in zip(ego_net['u'], ego_net['v']):
        print(u, '-', v, sep='')
# Print the ego-net edge list as `u-v` lines.
draw_graph()

5-1
14-3
7-5
5-7
6-5
16-11
2-15
11-16
0-12
7-8
5-8
15-4
3-8
12-9
2-3
1-4
8-4
0-2
0-3
2-4
4-15
11-12
8-11
7-6
11-0
4-5
0-1
4-16
0-9
2-8
6-11
3-4
7-1
4-2
11-8
12-8
11-9
5-4
0-6
12-2
7-11
11-6
4-11
0-7
9-10
0-18
7-3
10-0
10-11
14-6
2-6
5-16
16-5
0-11
2-12
16-7
5-6
18-0
14-7
15-5
10-9
3-6
10-8
8-12
14-0
5-0
16-15
8-5
5-3
15-0
16-6
1-6
11-4
15-3
0-15
2-14
15-2
7-4
4-7
1-8
8-1
11-7
9-0
0-10
0-8
2-5
0-14
3-0
8-0
0-4
3-5
1-5
1-7
5-11
3-1


In [24]:
# Preview the first 100 rows of the submission table.
submission.head(100)

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.000000
1,8,0,143,0.000000
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857
...,...,...,...,...
95,28,122,229,0.525159
96,28,123,110,0.009399
97,28,126,116,0.117795
98,28,130,92,0.070024


In [None]:
# get graph
# NOTE(review): this cell was left unfinished - it ended on a dangling `if`
# (a SyntaxError) and read `submission_ego_net`, which is only assigned inside
# the baseline loop further down. The completion below is an assumption:
# build an adjacency map for a single ego-net. Adjust or delete as needed.
first_ego_id = submission["ego_id"].iloc[0]
submission_ego_net = submission[submission["ego_id"] == first_ego_id]

# Map each source vertex u to the list of its target vertices v.
graph = {}
for u, v in zip(submission_ego_net["u"], submission_ego_net["v"]):
    if u not in graph:
        graph[u] = []
    graph[u].append(v)

Проверочные данные недоступны участникам и используются для подсчёта метрики:

Таблицы control и submission отличаются только последним столбцом x1. В таблице control в этом столбце содержатся истинные значения связей x1.

In [8]:
# Check that control and submission contain identical (ego_id, u, v) keys.
# NOTE(review): `control` is not assigned in any visible cell, and the text
# above says it is unavailable to participants - this cell fails on a fresh
# kernel unless `control` is loaded elsewhere.
control[["ego_id", "u", "v"]].equals(submission[["ego_id", "u", "v"]])

True

Таблица submission отсортирована по возрастанию ego_id, u, v:

In [9]:
# True when submission is already ordered by (ego_id, u, v): sorting it
# yields an identical frame, index included.
submission.equals(submission.sort_values(["ego_id", "u", "v"]))

True

In [10]:
# Preview the submission table before the model overwrites `x1`.
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


# Модель

In [11]:
# Number of submission rows (pairs to predict) per ego-net, largest first.
submission["ego_id"].value_counts()

ego_id
901943132599     349
721554506143     328
1400159338751    298
1039382085802    270
575525618423     262
                ... 
1142461301166      1
515396076193       1
515396076373       1
51539608193        1
1245540516719      1
Name: count, Length: 20586, dtype: int64

Константное предсказание:

In [12]:
%%time

submission_dummy = submission.copy()
submission_dummy["x1"] = np.nanmean(test["x1"].values)

CPU times: total: 281 ms
Wall time: 290 ms


In [13]:
# Toy check of the merge-and-average step used by the baseline model.
df = pd.DataFrame({
    'u': np.array([1, 2, 3, 4]),
    'x1': np.array([10, 10, 10, 10]),
})
df1 = pd.DataFrame({
    'u': np.array([2, 3, 4]),
    'x1': np.array([19, 19, 19]),
})

# The inner join on `u` keeps only the shared keys; the two `x1` columns come
# back as `x1_x` (from df1) and `x1_y` (from df).
t = df1.merge(df, on='u')

# Average over every remaining cell, i.e. both `x1` columns together.
t.drop(columns='u').to_numpy().mean()




14.5

In [14]:
%%time

from tqdm import tqdm


ego_id_list = submission["ego_id"].drop_duplicates().values
for ego_id in tqdm(ego_id_list):
    submission_ego_net = submission[submission["ego_id"] == ego_id]
    test_ego_net = test[test["ego_id"] == ego_id]
    friendship = np.zeros_like(submission_ego_net["x1"].values)
    for i, (u, v) in enumerate(zip(submission_ego_net["u"], submission_ego_net["v"])):
        u_x1 = test_ego_net.loc[test_ego_net["u"] == u, ["v", "x1"]].dropna()
        v_x1 = test_ego_net.loc[test_ego_net["u"] == v, ["v", "x1"]].dropna()
        common_friends = u_x1.merge(v_x1, on="v")
        if common_friends.shape[0] > 0:
            friendship[i] = np.mean(common_friends.drop("v", axis=1).values)
    submission.loc[submission["ego_id"] == ego_id, "x1"] = friendship

100%|████████████████████████████████████████████████████████████████████████████| 20586/20586 [40:35<00:00,  8.45it/s]

CPU times: total: 40min 38s
Wall time: 40min 35s





# Подсчёт метрик

In [15]:
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        The square root of scikit-learn's ``mean_squared_error``.
    """
    # The `squared=False` keyword is deprecated in recent scikit-learn
    # releases and later removed; taking the root explicitly gives the same
    # value for 1-D targets and works on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [16]:
# Score the constant prediction against the ground truth.
print(f"Dummy model RMSE: {RMSE(control['x1'], submission_dummy['x1'])}")

Dummy model RMSE: 0.9073352538252178


In [17]:
# Score the common-neighbour baseline against the ground truth.
print(f"Baseline model RMSE: {RMSE(control['x1'], submission['x1'])}")

Baseline model RMSE: 1.440427694215907e-07


In [18]:
# Preview the submission table after the baseline has filled in `x1`.
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


In [19]:
# Save the predictions. `index=False` keeps the row index out of the file so
# the columns match those of the loaded submission table; without it the file
# gains an extra unnamed leading column.
submission.to_csv('nmbits.csv', index=False)