# L. Wi-Fi

In [18]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import GroupShuffleSplit
from collections import defaultdict
from unidecode import unidecode

In [2]:
# calculates distance between 2 points
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

# computes how many ngrams string s1 ans s2 have in common
def compare_ngrams(s1, s2, n):
    ngrams1 = [s1[i:i + n] for i in range(len(s1) - n)]
    ngrams2 = [s2[i:i + n] for i in range(len(s2) - n)]
    count = 0
    for ngram1 in ngrams1:
        for ngram2 in ngrams2:
            if ngram1 == ngram2:
                count += 1
    return count / max(len(s1), len(s2))

def compute_score(y_hat, y_true):
    groups = df.iloc[train_idx].group_num if y_true.params['train'] else df.iloc[test_idx].group_num
    groups = groups.values
    targets = y_true.get_label().values
    max_pred = defaultdict(lambda: (-1, -1))
    n = len(y_hat)
    for i in range(n):
        if max_pred[groups[i]][0] < y_hat[i]:
            max_pred[groups[i]] = (y_hat[i], i)
    acc = 0
    for _, i in max_pred.values():
        acc += targets[i]
    acc /= len(max_pred)
    return 'score', 1 + 6 * (acc - 0.5), True

In [3]:
df_train = pd.read_csv('input/l_train.tsv', sep='\t', index_col=0)
df_test = pd.read_csv('input/l_test.tsv', sep='\t', index_col=0)
df_train.head()

Unnamed: 0,address,candidate_num,group_num,has_wifi,lat,lon,names,publishing_status,rubrics,ssid,target,urls,user_lat,user_lon
0,"Россия, Новосибирская область, Обь, проспект М...",0,0,,55.007759,82.667023,"[""Сбербанк России, банкомат"", ""Sberbank Rossii...",publish,[30336],Tolmachevo-MTS-Free,0,"[{""type"": ""main"", ""value"": ""http://www.sberban...",55.007579,82.666723
1,"Россия, Новосибирская область, аэропорт Новоси...",1,0,,55.007879,82.665401,"[""Ил-86""]",publish,[3481524327],Tolmachevo-MTS-Free,0,[],55.007579,82.666723
2,"Россия, Новосибирская область, Обь, проспект М...",2,0,,55.00757,82.667069,"[""Чашка кофе"", ""Чашка Кофе""]",publish,[31495],Tolmachevo-MTS-Free,0,"[{""type"": ""mining"", ""value"": ""https://2gis.ru/...",55.007579,82.666723
3,"Россия, Новосибирская область, Обь, проспект М...",3,0,True,55.007337,82.667066,"[""Чашка кофе"", ""Chashka kofe"", ""Чашка кофе"", ""...",publish,[31495],Tolmachevo-MTS-Free,0,"[{""type"": ""main"", ""value"": ""http://chashkacoff...",55.007579,82.666723
4,"Россия, Новосибирская область, Обь, проспект М...",4,0,,55.007623,82.666341,"[""Телефон доверия""]",obsolete,[30078],Tolmachevo-MTS-Free,0,"[{""type"": ""mining"", ""value"": ""https://2gis.ru/...",55.007579,82.666723


Let's filter rows with **target==1** and look at **ssid** and **names**

In [4]:
df_train[df_train.target == 1][['names', 'ssid']].head()

Unnamed: 0,names,ssid
18,"[""Аэропорт Толмачево, бухгалтерия"", ""Толмачево""]",Tolmachevo-MTS-Free
38,"[""Kontrolmatik""]",Kontrolmatik_Staff
49,"[""ПКВ Моторс"", ""Pkw Motors"", ""Pkw Motors"", ""Те...",PKW Guests
77,"[""Техцентр Юста"", ""Tekhtsentr Yusta"", ""Юста"", ...",YUSTA
94,"[""Респект Авто"", ""Автосервис""]",RespectAuto


You might notice, that almost always they have a lot in common. Though sometimes they are written in different
languages/cases. A simple heuristic to meausure how much strings have in common would be to transliterate
cyrillyc and accented letters and to calculate how many char ngrams they have in common for different **n**. I used **n=1..8**. That are very strong features.

Let's compute features. It might take up to 7 minutes and several GB of RAM. The longset part is one-hot encoding rubrics and calculating number of matching ngrams.

In [5]:
%%time
df = pd.concat((df_train, df_test))
df['test'] = df.target.isnull()
df['publishing_status'] = df['publishing_status'].astype('category')
df['has_wifi'] = df['has_wifi'].astype(float)
df['distance_km'] = haversine_np(df.lon, df.lat, df.user_lon, df.user_lat)
df['distance_lat'] = df.user_lat - df.lat
df['distance_lon'] = df.user_lon - df.lon
df['abs_distance_lat'] = abs(df.user_lat - df.lat)
df['abs_distance_lon'] = abs(df.user_lon - df.lon)
df = pd.concat((df, df.rubrics.str.replace('[\[\]\s]', '').str.get_dummies(',').add_prefix('rubric_')), axis=1)
for i in range(1, 9):
    df[f'ngrams_match_names_{i}'] = df.apply(lambda x: compare_ngrams(unidecode(x.names.lower()), unidecode(x.ssid.lower()), i), axis=1)
    df[f'ngrams_match_urls_{i}'] = df.apply(lambda x: compare_ngrams(unidecode(x.urls.lower()), unidecode(x.ssid.lower()), i), axis=1)

print(df.shape)

(109016, 1210)
CPU times: user 6min 24s, sys: 27.3 s, total: 6min 51s
Wall time: 6min 52s


We should use group-aware validation, so I used `GroupShuffleSplit` which will place all samples from one group
to either train or test set, so that test and train sets will have non-intersecting groups.

In [7]:
columns = ['lon', 'lat', 'has_wifi', 'user_lon', 'user_lat', 'publishing_status', 'distance_km',
          'distance_lat', 'distance_lon', 'abs_distance_lat', 'abs_distance_lon'] + \
          [c for c in df.columns if c.startswith('rubric_')] + \
          [c for c in df.columns if c.startswith('ngrams_match_')]
train_idx, test_idx = next(GroupShuffleSplit(1, test_size=0.20, random_state=2)
                           .split(df[~df.test].index, groups=df[~df.test].group_num))
train_ds = lgb.Dataset(df.iloc[train_idx][columns],
                       df.iloc[train_idx].target, params={'train': 1})
valid_ds = lgb.Dataset(df.iloc[test_idx][columns],
                       df.iloc[test_idx].target, params={'train': 0})

Let's train!

In [9]:
params = { 'objective': 'binary' }
model = lgb.train(params, train_ds, valid_sets=[train_ds, valid_ds], verbose_eval=20, num_boost_round=300, feval=compute_score)

[20]	training's binary_logloss: 0.110857	training's score: 3.54714	valid_1's binary_logloss: 0.112376	valid_1's score: 3.48571
[40]	training's binary_logloss: 0.0525294	training's score: 3.63571	valid_1's binary_logloss: 0.0575634	valid_1's score: 3.54286
[60]	training's binary_logloss: 0.0402884	training's score: 3.72714	valid_1's binary_logloss: 0.0493212	valid_1's score: 3.58286
[80]	training's binary_logloss: 0.0344945	training's score: 3.77429	valid_1's binary_logloss: 0.0472072	valid_1's score: 3.58286
[100]	training's binary_logloss: 0.0304829	training's score: 3.82571	valid_1's binary_logloss: 0.0463274	valid_1's score: 3.59429
[120]	training's binary_logloss: 0.0276586	training's score: 3.85429	valid_1's binary_logloss: 0.0456935	valid_1's score: 3.6
[140]	training's binary_logloss: 0.0251468	training's score: 3.89	valid_1's binary_logloss: 0.0454254	valid_1's score: 3.61714
[160]	training's binary_logloss: 0.0228951	training's score: 3.90714	valid_1's binary_logloss: 0.045294

Okay, we see that validation score stops increasing quite soon after **140** iterations reaching **3.61** out of 4 which is not so bad 😎

Now, when we know how much rounds we need, we can feed the whole dataset.

In [14]:
train_ds = lgb.Dataset(df[~df.test][columns], df[~df.test].target, params={'train': 1})
params = { 'objective': 'binary' }
train_idx = df[~df.test].index

In [15]:
model = lgb.train(params, train_ds, valid_sets=train_ds, verbose_eval=10, num_boost_round=170, feval=compute_score)

[10]	training's binary_logloss: 0.232019	training's score: 3.43657
[20]	training's binary_logloss: 0.111026	training's score: 3.54971
[30]	training's binary_logloss: 0.0690424	training's score: 3.61486
[40]	training's binary_logloss: 0.0528524	training's score: 3.64114
[50]	training's binary_logloss: 0.0454717	training's score: 3.67543
[60]	training's binary_logloss: 0.0413087	training's score: 3.70514
[70]	training's binary_logloss: 0.0382407	training's score: 3.73829
[80]	training's binary_logloss: 0.036007	training's score: 3.74857
[90]	training's binary_logloss: 0.0341142	training's score: 3.77143
[100]	training's binary_logloss: 0.0324071	training's score: 3.79543
[110]	training's binary_logloss: 0.0310263	training's score: 3.80686
[120]	training's binary_logloss: 0.0297982	training's score: 3.824
[130]	training's binary_logloss: 0.0286451	training's score: 3.83657
[140]	training's binary_logloss: 0.0276137	training's score: 3.84343
[150]	training's binary_logloss: 0.0266454	train

Predict, save and submit! 🎉

In [17]:
predictions = model.predict(df[df.test][columns])
predictions = pd.concat((df[df.test], pd.Series(predictions, name='pred')), axis=1)[['group_num', 'pred']]
predictions['target'] = 0
predictions.loc[predictions[['group_num', 'pred']].groupby('group_num').idxmax().values[:, 0], 'target'] = 1
predictions.target.to_csv('output/l.out', index=False)