In [1]:
# Used to read the Parquet data
import pandas as pandas
from os import walk
import pyarrow.parquet as parquet

# Used to train document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Used to train the baseline model
from sklearn.linear_model import LogisticRegression
import numpy as numpy

In [2]:
# Get the test texts
testTexts = parquet.read_table('./texts/textsTest/', columns = ['objectId','preprocessed']).to_pandas()
testTexts

  labels, = index.labels


Unnamed: 0,objectId,preprocessed
0,517288,"[квартирник, нтв, маргулис, групп, пилот]"
1,9501964,"[родител, очен, трогательн, песн, артур, халат]"
2,23007371,"[сух, суперджет, откажет, западн, комплект, те..."
3,38353886,"[сгорел, сара, гор, хат]"
4,21192138,"[живодёр, отруб, лап, собак]"
5,26415073,"[ажурн, маков, кулич, так, нежн, воздушн, мяки..."
6,36734526,"[друг, уснул, бар, теб, нужн, тащ, дом, очен, ..."
7,8699823,"[никифор, ден, дат, год, март, понедельник, др..."
8,12236843,[]
9,38393782,"[днр, лнр, новост, войск, берут, кольц, донбас..."


In [3]:
# Build document embeddings for text documents
doc2vec = Doc2Vec(\
                [TaggedDocument(lines,'tag') for lines in testTexts.preprocessed], \
                vector_size=5, window=2, min_count=1, workers=4)

In [4]:
# Read a single day to train model on as Pandas dataframe
data = parquet.read_table('./textsTrain/date=2018-02-07', columns = ['instanceId_objectId','feedback']).to_pandas()
data = data.rename(columns = {'instanceId_objectId':'objectId'})
data['label'] = data['feedback'].apply(lambda x: 1.0 if("Liked" in x) else 0.0).values
data = data[['objectId','label']]
data

Unnamed: 0,objectId,label
0,22429313,0.0
1,14676953,0.0
2,11562101,0.0
3,20892119,0.0
4,16063005,0.0
5,14576490,0.0
6,11811947,0.0
7,1936012,1.0
8,20867189,0.0
9,28967185,0.0


In [5]:
parts = []

# Get unique object ids
idsOnly = data.groupby('objectId').count()

# In order to save memory iterate part by part
for (dirpath, dirnames, filenames) in walk('./texts/textsTrain/'):
    for name in filenames:
        if name.startswith('part'):
            # Read single part
            texts = parquet.read_table('./texts/textsTrain/' + name, columns = ['objectId','preprocessed']).to_pandas()            
            # Filter documents we need
            joined = idsOnly.join(texts.set_index('objectId'), how='inner', on = 'objectId')
            # Evaluate embeddings
            joined['embedding'] = joined.preprocessed.apply(doc2vec.infer_vector)
            # Memorize
            parts.append(joined[['embedding']])
            print('Done with ' + name)

Done with part-00000-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00001-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00002-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00003-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00004-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00005-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00006-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00007-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00008-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00009-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00010-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00011-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00012-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00013-1b50c8f5-87db-4a5

In [6]:
# Combine all the parts
train = data.join(pandas.concat(parts), on = 'objectId')
train

Unnamed: 0,objectId,label,embedding
0,22429313,0.0,"[-0.24245523, -0.8489216, 3.655461, 1.6661627,..."
1,14676953,0.0,"[-0.03079521, -1.0657399, 2.7699192, 1.3978746..."
2,11562101,0.0,"[-0.5163906, -0.17590548, 0.73324764, 0.335610..."
3,20892119,0.0,"[0.054822613, -1.1415586, 1.321706, 0.628749, ..."
4,16063005,0.0,"[-1.1584523, -0.9548521, 3.7516215, 1.412659, ..."
5,14576490,0.0,"[0.40271765, 0.0048528328, 0.19786699, 0.34216..."
6,11811947,0.0,"[-0.94024473, -0.22606388, 1.0989925, -0.13632..."
7,1936012,1.0,"[-0.33184797, -0.65457475, 0.6996024, -0.09389..."
8,20867189,0.0,"[-0.06507489, -1.6912346, 3.1151216, 0.2608389..."
9,28967185,0.0,"[-0.13529241, 0.06033241, 0.24128644, 0.114184..."


In [7]:
# Construct the label (liked objects)
y = train['label'].values

# Extract the most interesting features
X = numpy.stack(train['embedding'].values)

In [8]:
# Fit the model and check the weights
model = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y)
model.coef_

array([[-0.12087508, -0.04141897, -0.07407981, -0.34242641, -0.06773843]])

In [22]:
# Weight the test documents
testTexts['weight'] = -model.predict_proba(numpy.stack(\
    testTexts.preprocessed.apply(lambda x : doc2vec.infer_vector(x))))[:, 1]
testTexts

Unnamed: 0,objectId,preprocessed,weight
0,517288,"[квартирник, нтв, маргулис, групп, пилот]",-0.173736
1,9501964,"[родител, очен, трогательн, песн, артур, халат]",-0.178630
2,23007371,"[сух, суперджет, откажет, западн, комплект, те...",-0.185714
3,38353886,"[сгорел, сара, гор, хат]",-0.185369
4,21192138,"[живодёр, отруб, лап, собак]",-0.189675
5,26415073,"[ажурн, маков, кулич, так, нежн, воздушн, мяки...",-0.116070
6,36734526,"[друг, уснул, бар, теб, нужн, тащ, дом, очен, ...",-0.162549
7,8699823,"[никифор, ден, дат, год, март, понедельник, др...",-0.169230
8,12236843,[],-0.177919
9,38393782,"[днр, лнр, новост, войск, берут, кольц, донбас...",-0.173095


In [30]:
# Read the test data
testData = parquet.read_table('./textsTest', columns = ['instanceId_userId','instanceId_objectId']).to_pandas() \
    .rename(columns = {'instanceId_objectId':'objectId'})
testData

Unnamed: 0,instanceId_userId,objectId
0,1006,34577503
1,1006,37520199
2,1618,546086
3,1618,546086
4,1618,546086
5,1618,35981492
6,1618,26764305
7,1810,958605
8,1810,20479574
9,1810,36254478


In [37]:
# Join test documents and elliminate possible duplicates
scores = testData.join(\
    testTexts[['objectId','weight']].set_index('objectId'), \
    how = 'inner', \
    on = 'objectId') \
    .groupby(['instanceId_userId','objectId']).max()
scores

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
instanceId_userId,objectId,Unnamed: 2_level_1
316,17997084,-0.120488
316,37758420,-0.169907
631,15478935,-0.148856
631,30513650,-0.192046
631,38118098,-0.172844
742,10672856,-0.132754
742,24302446,-0.169474
742,28816291,-0.181429
742,34685448,-0.176274
868,11640701,-0.172317


In [41]:
#  Sort for each user
result = scores.sort_values(by=['instanceId_userId', 'weight']).reset_index()
result

Unnamed: 0,instanceId_userId,objectId,weight
0,316,37758420,-0.169907
1,316,17997084,-0.120488
2,631,30513650,-0.192046
3,631,38118098,-0.172844
4,631,15478935,-0.148856
5,742,28816291,-0.181429
6,742,34685448,-0.176274
7,742,24302446,-0.169474
8,742,10672856,-0.132754
9,868,29650308,-0.173345


In [43]:
# Collect predictions for each user
submit = result.groupby("instanceId_userId")['objectId'].apply(list)
submit

instanceId_userId
316                                      [37758420, 17997084]
631                            [30513650, 38118098, 15478935]
742                  [28816291, 34685448, 24302446, 10672856]
868         [29650308, 35655697, 30143153, 11640701, 22115...
979                                       [37950972, 7996257]
1006                                     [37520199, 34577503]
1276                           [36856262, 31000576, 22812401]
1444                                     [36806487, 20963755]
1483                                     [34991228, 38036543]
1618                             [546086, 26764305, 35981492]
1651                           [34663244, 15380765, 23689616]
1810        [36704452, 10683958, 19146137, 20479574, 95860...
2215                             [37172748, 23658101, 951067]
2245                                     [12703014, 18355935]
2308                                      [12609961, 7642313]
2326                                     [29452603, 

In [44]:
# Persist the first submit
submit.to_csv("./textSubmit.csv.gz", header = False, compression='gzip')