In [1]:
# Used to read the Parquet data
import pandas as pandas
from os import walk
import pyarrow.parquet as parquet

# Used to train document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Used to train the baseline model
from sklearn.linear_model import LogisticRegression
import numpy as numpy

# Where the downloaded data are
input_path = './'
# Where to store results
output_path = './'

In [2]:
# Get the test texts
test_texts = parquet.read_table(input_path + '/texts/textsTest/', \
                                columns = ['objectId','preprocessed']).to_pandas()
test_texts.head(10)

  labels, = index.labels


Unnamed: 0,objectId,preprocessed
0,517288,"[квартирник, нтв, маргулис, групп, пилот]"
1,9501964,"[родител, очен, трогательн, песн, артур, халат]"
2,23007371,"[сух, суперджет, откажет, западн, комплект, те..."
3,38353886,"[сгорел, сара, гор, хат]"
4,21192138,"[живодёр, отруб, лап, собак]"
5,26415073,"[ажурн, маков, кулич, так, нежн, воздушн, мяки..."
6,36734526,"[друг, уснул, бар, теб, нужн, тащ, дом, очен, ..."
7,8699823,"[никифор, ден, дат, год, март, понедельник, др..."
8,12236843,[]
9,38393782,"[днр, лнр, новост, войск, берут, кольц, донбас..."


In [3]:
# Build document embeddings for text documents
doc2vec = Doc2Vec(\
                [TaggedDocument(lines,'tag') for lines in test_texts.preprocessed], \
                vector_size=5, window=2, min_count=1, workers=4)

In [4]:
# Read a single day to train model on as Pandas dataframe
data = parquet.read_table(input_path + '/textsTrain/date=2018-02-07', \
                          columns = ['instanceId_objectId','feedback']).to_pandas()
data.rename(columns = {'instanceId_objectId':'objectId'}, inplace = True)
data['label'] = data['feedback'].apply(lambda x: 1.0 if("Liked" in x) else 0.0).values
data = data[['objectId','label']]
data.head(10)

Unnamed: 0,objectId,label
0,22429313,0.0
1,14676953,0.0
2,11562101,0.0
3,20892119,0.0
4,16063005,0.0
5,14576490,0.0
6,11811947,0.0
7,1936012,1.0
8,20867189,0.0
9,28967185,0.0


In [5]:
parts = []

# Get unique object ids
ids = data.groupby('objectId').count()

# In order to save memory iterate part by part
for (dirpath, dirnames, filenames) in walk(input_path + '/texts/textsTrain/'):
    for name in filenames:
        if name.startswith('part'):
            # Read single part
            texts = parquet.read_table(input_path + '/texts/textsTrain/' + name, \
                                       columns = ['objectId','preprocessed']).to_pandas()            
            # Filter documents we need
            joined = ids.join(texts.set_index('objectId'), how='inner', on = 'objectId')
            # Evaluate embeddings
            joined['embedding'] = joined.preprocessed.apply(doc2vec.infer_vector)
            # Memorize
            parts.append(joined[['embedding']])
            print('Done with ' + name)

Done with part-00000-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00001-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00002-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00003-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00004-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00005-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00006-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00007-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00008-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00009-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00010-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00011-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00012-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00013-1b50c8f5-87db-4a5

In [6]:
# Combine all the parts
train = data.join(pandas.concat(parts), on = 'objectId')
train.head(10)

Unnamed: 0,objectId,label,embedding
0,22429313,0.0,"[1.7023556, -0.61280745, -2.3916838, 3.7633474..."
1,14676953,0.0,"[1.6586152, -0.36108485, -1.9833397, 2.0739386..."
2,11562101,0.0,"[0.15386434, -0.33638588, -0.5353734, 0.122175..."
3,20892119,0.0,"[0.9824828, -0.5868671, -0.98727924, 1.5039121..."
4,16063005,0.0,"[0.99467623, -0.548113, -3.524225, 3.7854316, ..."
5,14576490,0.0,"[0.021854596, -0.40157402, -0.023478763, 0.198..."
6,11811947,0.0,"[0.6359992, -0.75050163, -1.2051268, 0.7758712..."
7,1936012,1.0,"[-0.11006393, -0.64717454, -0.58839494, 0.2012..."
8,20867189,0.0,"[1.5456027, 0.36930153, -2.1795776, 2.1914325,..."
9,28967185,0.0,"[0.3342202, 0.13672762, -0.16420954, 0.2752888..."


In [7]:
# Construct the label (liked objects)
y = train['label'].values

# Extract the most interesting features
X = numpy.stack(train['embedding'].values)

In [8]:
# Fit the model and check the weights
model = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y)
model.coef_

array([[-0.26446775, -0.09334847,  0.06236364, -0.00363809,  0.18652154]])

In [9]:
# Weight the test documents
test_texts['weight'] = -model.predict_proba(numpy.stack(\
    test_texts.preprocessed.apply(lambda x : doc2vec.infer_vector(x))))[:, 1]
test_texts.head(10)

Unnamed: 0,objectId,preprocessed,weight
0,517288,"[квартирник, нтв, маргулис, групп, пилот]",-0.193637
1,9501964,"[родител, очен, трогательн, песн, артур, халат]",-0.181544
2,23007371,"[сух, суперджет, откажет, западн, комплект, те...",-0.170146
3,38353886,"[сгорел, сара, гор, хат]",-0.170694
4,21192138,"[живодёр, отруб, лап, собак]",-0.168873
5,26415073,"[ажурн, маков, кулич, так, нежн, воздушн, мяки...",-0.118487
6,36734526,"[друг, уснул, бар, теб, нужн, тащ, дом, очен, ...",-0.172113
7,8699823,"[никифор, ден, дат, год, март, понедельник, др...",-0.181164
8,12236843,[],-0.179489
9,38393782,"[днр, лнр, новост, войск, берут, кольц, донбас...",-0.181132


In [10]:
# Read the test data
test = parquet.read_table(input_path + '/textsTest', \
    columns = ['instanceId_userId','instanceId_objectId']).to_pandas()
test.rename(columns = {'instanceId_objectId':'objectId'}, inplace = True)
test.head(10)

  labels, = index.labels


Unnamed: 0,instanceId_userId,objectId
0,1006,34577503
1,1006,37520199
2,1618,546086
3,1618,546086
4,1618,546086
5,1618,35981492
6,1618,26764305
7,1810,958605
8,1810,20479574
9,1810,36254478


In [11]:
# Join test documents and elliminate possible duplicates
scores = test.join(\
    test_texts[['objectId','weight']].set_index('objectId'), \
    how = 'inner', \
    on = 'objectId') \
    .groupby(['instanceId_userId','objectId']).min()
scores.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
instanceId_userId,objectId,Unnamed: 2_level_1
316,17997084,-0.128333
316,37758420,-0.17275
631,15478935,-0.156306
631,30513650,-0.154663
631,38118098,-0.163259
742,10672856,-0.137634
742,24302446,-0.158201
742,28816291,-0.173662
742,34685448,-0.151551
868,11640701,-0.162864


In [12]:
#  Sort for each user
result = scores.sort_values(by=['instanceId_userId', 'weight']).reset_index()
result.head(10)

Unnamed: 0,instanceId_userId,objectId,weight
0,316,37758420,-0.17275
1,316,17997084,-0.128333
2,631,38118098,-0.163259
3,631,15478935,-0.156306
4,631,30513650,-0.154663
5,742,28816291,-0.173662
6,742,24302446,-0.158201
7,742,34685448,-0.151551
8,742,10672856,-0.137634
9,868,29193052,-0.197144


In [13]:
# Collect predictions for each user
submit = result.groupby("instanceId_userId")['objectId'].apply(list)
submit.head(10)

instanceId_userId
316                                  [37758420, 17997084]
631                        [38118098, 15478935, 30513650]
742              [28816291, 24302446, 34685448, 10672856]
868     [29193052, 22115500, 29650308, 30882080, 35655...
979                                   [37950972, 7996257]
1006                                 [37520199, 34577503]
1276                       [36856262, 22812401, 31000576]
1444                                 [36806487, 20963755]
1483                                 [34991228, 38036543]
1618                         [546086, 26764305, 35981492]
Name: objectId, dtype: object

In [14]:
# Persist the first submit
submit.to_csv(output_path + "/textSubmit.csv.gz", header = False, compression='gzip')