In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer

## Подготовка данных (urls)

In [2]:
# Load the tab-separated, header-less training log, naming its three columns
# while reading, then keep only the user id and the visited domain.
urls_train_df = pd.read_csv(
    'url_domain_train',
    sep='\t',
    header=None,
    names=['id', 'url', 'count'],
)[['id', 'url']]

In [3]:
# Preview the first five (id, url) rows.
urls_train_df.head(n=5)

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [4]:
# Collapse the log to one row per user: 'urls' holds the list of that user's
# domains, 'id' the user id. The result gets a fresh 0..n-1 index.
train_url_lists = urls_train_df.groupby('id')['url'].apply(list)
urls_train_df = train_url_lists.to_frame('urls').reset_index()[['urls', 'id']]

In [5]:
# Preview the first five per-user rows (urls list, id).
urls_train_df.head(n=5)

Unnamed: 0,urls,id
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01


In [6]:
# Load the tab-separated, header-less target table: one (id, age) row per line.
age_train_df = pd.read_csv(
    'age_profile_train',
    sep='\t',
    header=None,
    names=['id', 'age'],
)

In [7]:
# Preview the first five (id, age) rows.
age_train_df.head(n=5)

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [8]:
# Attach the age target to every user that has URL history. This is a left
# join, so a user missing from age_train_df is kept with a NaN age.
train_df = pd.merge(urls_train_df, age_train_df, how='left', on='id')

In [9]:
# Preview the first five merged rows (urls, id, age).
train_df.head(n=5)

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


## Снижение размерности

In [10]:
# Train on the first `topk` users only.
topk = 30000
X = train_df['urls'].values[:topk]
y = train_df['age'].values[:topk]

In [11]:
# Turn each user's list of domains into one space-separated document.
# A list (not `map`) is used because the documents are passed to both `fit`
# and `transform`; on Python 3 `map` is a one-shot iterator.
X = [' '.join(domains) for domains in X]

# Hash the documents into a fixed 1000-column feature space. The fitted
# vectorizer `hw` is reused later for the test set.
hw = HashingVectorizer(n_features=1000).fit(X)

# `.toarray()` yields a plain ndarray with the same values `.todense()` gave,
# but avoids the discouraged `np.matrix` type that newer scikit-learn rejects.
X = hw.transform(X).toarray()

## Обучение модели

In [12]:
# Cross-validate a plain linear regression on the hashed features.
# The scores come back negated (the displayed values are positive only after
# the sign flip), so negate them to show the per-fold MSE.
reg = LinearRegression()
cv_scores = cross_val_score(reg, X, y, scoring='mean_squared_error')
-cv_scores

array([ 155.25232545,  144.37600499,  158.57268954])

## Отправка решения

In [13]:
# Fit the final model on the whole training subset; `fit` returns the
# estimator itself, so the trailing expression displays it.
reg = LinearRegression().fit(X, y)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Load the tab-separated, header-less test log, naming its three columns
# while reading, then keep only the user id and the visited domain.
urls_test_df = pd.read_csv(
    'url_domain_test',
    sep='\t',
    header=None,
    names=['id', 'url', 'count'],
)[['id', 'url']]

In [15]:
# Collapse the test log to one row per user: 'urls' holds the list of that
# user's domains, 'id' the user id. The result gets a fresh 0..n-1 index.
test_url_lists = urls_test_df.groupby('id')['url'].apply(list)
urls_test_df = test_url_lists.to_frame('urls').reset_index()[['urls', 'id']]

In [16]:
# Preview the first five per-user test rows (urls list, id).
urls_test_df.head(n=5)

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [17]:
# Build test features with the vectorizer fitted on the training documents.
# NOTE: this rebinds `X`, so the training feature matrix is no longer
# reachable under that name after this cell runs.
X = urls_test_df.urls.values
X = [' '.join(domains) for domains in X]

# `.toarray()` yields a plain ndarray with the same values `.todense()` gave,
# but avoids the discouraged `np.matrix` type that newer scikit-learn rejects.
X = hw.transform(X).toarray()

In [18]:
# Predict an age for every test user; `X` holds the test features here.
y_pred = reg.predict(X)

In [19]:
# Display the predicted ages (the bare expression renders the array).
y_pred

array([ 42.68803957,  39.76833715,  38.71313275, ...,  35.32366851,
        37.30954375,  49.12761871])

In [20]:
# Add the predictions as an 'age' column; rows of urls_test_df and entries of
# y_pred are in the same order because X was built from urls_test_df.urls.
urls_test_df = urls_test_df.assign(age=y_pred)

In [21]:
# Keep only the submission columns and rename the key column to 'Id'.
urls_test_df = urls_test_df[['id', 'age']].rename(columns={'id': 'Id'})

In [22]:
# Preview the first five submission rows (Id, age).
urls_test_df.head(n=5)

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,42.68804
1,000000014A10EA183BF8594A0B2AB201,39.768337
2,000000014A4FE5C33A929D4C26943601,38.713133
3,000000014B7BB9957784A9BC0AC9F401,32.49378
4,000000014C7749F896D82C2B01E8B801,33.155544


In [29]:
# Ids listed in random_solution.csv that have no row in urls_test_df
# (so no prediction was produced for them).
random_sol = pd.read_csv('random_solution.csv')
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)

# Build the filler rows with explicit columns so that an empty `miss_idx`
# (e.g. when this cell is re-run after the rows were already appended) gives
# an empty two-column frame instead of failing on the column assignment.
# Ids are sorted because set iteration order is arbitrary.
# NOTE(review): the filler age of 1.0 is kept from the original; it looks like
# a placeholder - consider a more plausible default such as the mean train age.
miss_df = pd.DataFrame(
    {'Id': sorted(miss_idx), 'age': np.ones(len(miss_idx))},
    columns=['Id', 'age'],
)

In [30]:
# Add the filler rows below the predicted ones and renumber the index.
# `pd.concat` replaces `DataFrame.append`, which is deprecated and no longer
# exists in pandas 2.x; the result is the same.
# NOTE: re-running this cell on its own appends the filler rows again.
urls_test_df = pd.concat([urls_test_df, miss_df], ignore_index=True)

In [31]:
# Write the submission file (header row included, index column omitted).
urls_test_df.to_csv(path_or_buf='solution.csv', index=False)

In [32]:
# Sanity check: count the lines in solution.csv (the header line is included).
!wc -l solution.csv

   19980 solution.csv
