In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error

try:
    # scikit-learn >= 0.18 ships cross-validation helpers in model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only provide the (since removed) cross_validation module.
    from sklearn.cross_validation import cross_val_score

## Идея
Поскольку doc2vec не помог (см. второй ipynb), я сделала улучшенную версию с линейной регрессией.
Здесь вместо HashingVectorizer используется TfidfVectorizer с размерностью 1000 (пробовались также 1500 и 2000, но они дали результаты хуже), и вместо первых 30000 урлов используются все; за счёт большего количества данных скор увеличивается.

## Подготовка данных (urls)

In [2]:
# Load the tab-separated train log (no header row) and keep only the
# user id and the visited domain; the third column ('count') is not used.
urls_train_df = pd.read_csv(
    'kaggle_data/url_domain_train',
    sep='\t',
    header=None,
    names=['id', 'url', 'count'],
).loc[:, ['id', 'url']]

In [3]:
# Preview the raw train table: one row per (id, url) pair.
urls_train_df.head()

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [4]:
# Collapse the table to one row per user: 'urls' holds the list of all
# domains seen for that id, and the frame gets a fresh 0..n-1 index.
urls_train_df = (
    urls_train_df
    .groupby('id')['url']
    .apply(list)
    .reset_index()
    .rename(columns={'url': 'urls'})
    [['urls', 'id']]
)

In [5]:
# Preview the grouped train table: one row per id with its list of urls.
urls_train_df.head()

Unnamed: 0,urls,id
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01


In [6]:
# Load the regression targets: a header-less, tab-separated file with the
# user id and the user's age.
age_train_df = pd.read_csv(
    'kaggle_data/age_profile_train',
    sep='\t',
    header=None,
    names=['id', 'age'],
)

In [7]:
# Preview the target table (columns: id, age).
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [8]:
# Attach the age label to every user that has url data. A left join keeps
# all rows of urls_train_df; an id without a label would get a missing age.
train_df = pd.merge(urls_train_df, age_train_df, how='left', on='id')

In [9]:
# Preview the merged training table (columns: urls, id, age).
train_df.head()

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


## Снижение размерности

In [14]:
# Pull the raw inputs (one list of domains per user) and the age targets
# out of the training frame as numpy arrays, then show the inputs.
X = train_df['urls'].values
y = train_df['age'].values
print(X)

[['id.rambler.ru', 'mail.rambler.ru', 'r0.ru']
 ['1prime.ru', 'autorambler.ru', 'chellak.ru', 'docs.cntd.ru', 'echo.msk.ru', 'expert.ru', 'finance.rambler.ru', 'forbes.ru', 'forum.ixbt.com', 'garant.ru', 'govoritmoskva.ru', 'kommersant.ru', 'kp.ru', 'lenta.ru', 'mait.ru', 'metronews.ru', 'mk.ru', 'news.rambler.ru', 'news.smi2.ru', 'norm-load.ru', 'pfr.kirov.ru', 'pfrf.ru', 'photography-on-the.ru', 'realty.rambler.ru', 'ren.tv', 'riafan.ru', 'rns.online', 'rossbanki.ru', 'secretmag.ru', 'tehnorma.ru', 'tiu.ru', 'top68.ru', 'tvc.ru', 'tvzvezda.ru', 'vesti.ru', 'video.rambler.ru', 'weekend.rambler.ru']
 ['bosch-korolev.ru'] ..., ['blog.partisani.ge', 'li.ru', 'tvrain.ru']
 ['doctorkirov.ru', 'drive.ru', 'extrim-park43.ru', 'm.regions.pulset.ru', 'mail-pda.rambler.ru', 'reso.ru', 'sberbank.ru']
 ['samara.drom.ru']]


In [15]:
# Turn every user's list of domains into a single space-separated
# "document" so that a text vectorizer can consume it.
X = np.array([' '.join(url_list) for url_list in X])

In [16]:
# Fit a TF-IDF vectorizer on the per-user "documents". The token pattern
# treats every whitespace-delimited domain as one token, and only the
# 1000 most frequent domains are kept as features.
hw = TfidfVectorizer(max_features=1000, token_pattern=r'[^\s]+')
# .toarray() yields a plain ndarray with the same values; the former
# .todense() returned np.matrix, which NumPy discourages and which newer
# scikit-learn versions refuse as estimator input.
# NOTE: X is overwritten here, so this cell must run exactly once after the
# cell that builds the joined strings.
X = hw.fit_transform(X).toarray()

## Обучение модели

In [17]:
reg = LinearRegression()
# Cross-validated mean squared error (positive, lower is better).
# make_scorer(mean_squared_error) returns the raw MSE directly, so no manual
# sign flip is needed and the call no longer depends on the
# 'mean_squared_error' scoring string, which was removed from newer
# scikit-learn versions. cv=3 pins the three folds shown in the output.
cross_val_score(reg, X, y, scoring=make_scorer(mean_squared_error), cv=3)

array([ 141.32086246,  143.85034492,  125.65129757])

## Отправка Решения

In [18]:
# Train the final model on the whole training set (X must still hold the
# train TF-IDF features at this point) and display the fitted estimator.
reg = LinearRegression().fit(X, y)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
# Load the tab-separated test log (no header row) and keep only the
# user id and the visited domain; the third column ('count') is not used.
urls_test_df = pd.read_csv(
    'kaggle_data/url_domain_test',
    sep='\t',
    header=None,
    names=['id', 'url', 'count'],
).loc[:, ['id', 'url']]

In [21]:
# Collapse the test table to one row per user, mirroring the train
# preparation: 'urls' holds the list of domains, index is reset to 0..n-1.
urls_test_df = (
    urls_test_df
    .groupby('id')['url']
    .apply(list)
    .reset_index()
    .rename(columns={'url': 'urls'})
    [['urls', 'id']]
)

In [22]:
# Preview the grouped test table: one row per id with its list of urls.
urls_test_df.head()

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [23]:
# Build the test feature matrix with the vectorizer fitted on the train
# data (transform only, so the vocabulary and IDF weights are reused).
test_docs = [' '.join(url_list) for url_list in urls_test_df.urls.values]
# .toarray() gives a plain ndarray instead of the np.matrix produced by
# .todense(), which newer scikit-learn versions reject.
# NOTE: from here on X holds TEST features; do not re-run the
# cross-validation or fit cells without rebuilding the train X first.
X = hw.transform(test_docs).toarray()

In [24]:
# Predict ages for the test users; X holds the test features at this point.
y_pred = reg.predict(X)

In [25]:
# Inspect the predicted ages.
y_pred

array([ 46.97975861,  40.39896017,  35.88288362, ...,  33.52078877,
        36.24274797,  38.37330992])

In [26]:
# Store the predictions next to the ids; y_pred was computed from
# urls_test_df.urls, so the rows are in the same order.
urls_test_df['age'] = y_pred

In [27]:
# Keep only the submission columns and rename 'id' to 'Id', the column
# name used by the sample solution file.
urls_test_df = urls_test_df[['id', 'age']].rename(columns={'id': 'Id'})

In [28]:
# Preview the submission frame (columns: Id, age).
urls_test_df.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,46.979759
1,000000014A10EA183BF8594A0B2AB201,40.39896
2,000000014A4FE5C33A929D4C26943601,35.882884
3,000000014B7BB9957784A9BC0AC9F401,35.557749
4,000000014C7749F896D82C2B01E8B801,34.290456


In [32]:
# The sample solution lists every Id that must appear in the submission.
random_sol = pd.read_csv('random_solution.csv')
# Ids that have no url data and therefore received no model prediction.
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)
# Sort so the row order does not depend on set iteration order.
miss_ids = sorted(miss_idx)
# Fall back to the mean training age, the constant that minimises squared
# error, instead of the former placeholder age of 1.0.
fallback_age = age_train_df['age'].mean()
# Building the frame from named columns also works when nothing is missing;
# assigning two column names to an empty frame raised a ValueError.
miss_df = pd.DataFrame(
    {'Id': miss_ids, 'age': np.full(len(miss_ids), fallback_age)},
    columns=['Id', 'age'],
)

In [33]:
# Add the fallback rows to the predictions. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0. Dropping duplicate Ids
# keeps the cell idempotent: re-running it no longer appends the same
# rows a second time.
urls_test_df = (
    pd.concat([urls_test_df, miss_df], ignore_index=True)
    .drop_duplicates(subset='Id', keep='first')
    .reset_index(drop=True)
)

In [34]:
# Write the submission file (columns Id, age) without the DataFrame index.
urls_test_df.to_csv('solution.csv', index=False)

In [35]:
# Sanity check: count the lines of the written file (header row included).
!wc -l solution.csv

   19980 solution.csv
