In [1]:
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

load_dotenv()
# Read connection config from .env
connection = os.environ.get("POSTGRES_ENGINE")

engine = create_engine(connection)

# Data processing

In [33]:
# Read user_data and post_text tables
user_data = pd.read_sql('SELECT * FROM public.user_data', con=engine)
post_info = pd.read_sql('SELECT * FROM public.post_text_df', con=engine)

In [3]:
# Funtion to perform sql query in batches in order to avoid memory limits of server
def batch_load_sql(query: str) -> pd.DataFrame:
    CHUNKSIZE = 200000
    engine = create_engine(connection)
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [None]:
# Load feed_data (long wait time)
feed_data = batch_load_sql('SELECT * FROM public.feed_data limit 10000000')
feed_data.shape

In [None]:
# Save to csv to save time for future data loading
feed_data.to_csv('feed_data', index=False)

In [23]:
# Load from csv if saved previously
feed_data = pd.read_csv('feed_data')

In [34]:
feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-11-11 15:26:25,115699,5891,view,0
1,2021-11-11 15:29:04,115699,3722,view,0
2,2021-11-11 15:29:16,115699,1806,view,0
3,2021-11-11 15:30:14,115699,34,view,0
4,2021-11-11 15:31:12,115699,3335,view,0


In [35]:
feed_data = feed_data.drop(feed_data[feed_data.target == 1].index)
feed_data.loc[feed_data['action'] == 'like', 'target'] = 1
feed_data = feed_data.drop('action', axis=1)
feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,target
0,2021-11-11 15:26:25,115699,5891,0
1,2021-11-11 15:29:04,115699,3722,0
2,2021-11-11 15:29:16,115699,1806,0
3,2021-11-11 15:30:14,115699,34,0
4,2021-11-11 15:31:12,115699,3335,0


In [36]:
# Post's text processing with nltk and TfIdf
import re
import string

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

wnl = WordNetLemmatizer()


def preprocessing(line, token=wnl):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    line = ' '.join([token.lemmatize(x) for x in line.split(' ')])
    return line


tfidf = TfidfVectorizer(
    stop_words='english',
    preprocessor=preprocessing
)

In [37]:
tfidf_data = (
    tfidf
    .fit_transform(post_info['text'])
    .toarray()
)

tfidf_data = pd.DataFrame(
    tfidf_data,
    index=post_info.post_id,
    columns=tfidf.get_feature_names_out()
)



In [38]:
# Create features based on TfIdf
post_info['TotalTfIdf'] = tfidf_data.sum(axis=1).reset_index()[0]
post_info['MaxTfIdf'] = tfidf_data.max(axis=1).reset_index()[0]
post_info['MeanTfIdf'] = tfidf_data.mean(axis=1).reset_index()[0]

post_info.head()

Unnamed: 0,post_id,text,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf
0,1,UK economy facing major risks\n\nThe UK manufa...,business,8.748129,0.495805,0.00019
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,11.878472,0.308003,0.000258
2,3,Asian quake hits European shares\n\nShares in ...,business,12.67553,0.261799,0.000276
3,4,India power shares jump on debut\n\nShares in ...,business,6.622786,0.537713,0.000144
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6.352096,0.420251,0.000138


In [39]:
#Create clusters

from sklearn.decomposition import PCA

centered = tfidf_data - tfidf_data.mean()

pca = PCA(n_components=20)
pca_decomp = pca.fit_transform(centered)

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=0).fit(pca_decomp)

post_info['TextCluster'] = kmeans.labels_

dists_columns = [f"DistanceTo{ith}thCluster" for ith in range(1, 11)]

dists_df = pd.DataFrame(data=kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,0.4432,0.478396,0.451612,0.435837,0.498823,0.437494,0.521054,0.330863,0.527461,0.537637
1,0.296991,0.339678,0.308105,0.273176,0.370485,0.281348,0.252152,0.155099,0.409704,0.434167
2,0.329922,0.361063,0.334318,0.318718,0.384941,0.308293,0.462507,0.175753,0.431099,0.468838
3,0.290443,0.318269,0.272464,0.261157,0.346719,0.254609,0.4103,0.115072,0.381911,0.431798
4,0.223669,0.257254,0.197249,0.220332,0.289619,0.164602,0.37176,0.121962,0.341579,0.38858


In [40]:
post_info = pd.concat((post_info, dists_df), axis=1)
post_info = post_info.drop('text', axis=1)

post_info.head()

Unnamed: 0,post_id,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,1,business,8.748129,0.495805,0.00019,7,0.4432,0.478396,0.451612,0.435837,0.498823,0.437494,0.521054,0.330863,0.527461,0.537637
1,2,business,11.878472,0.308003,0.000258,7,0.296991,0.339678,0.308105,0.273176,0.370485,0.281348,0.252152,0.155099,0.409704,0.434167
2,3,business,12.67553,0.261799,0.000276,7,0.329922,0.361063,0.334318,0.318718,0.384941,0.308293,0.462507,0.175753,0.431099,0.468838
3,4,business,6.622786,0.537713,0.000144,7,0.290443,0.318269,0.272464,0.261157,0.346719,0.254609,0.4103,0.115072,0.381911,0.431798
4,5,business,6.352096,0.420251,0.000138,7,0.223669,0.257254,0.197249,0.220332,0.289619,0.164602,0.37176,0.121962,0.341579,0.38858


In [41]:
data = pd.merge(feed_data,
                post_info,
                on='post_id',
                how='left')

data = pd.merge(data,
                user_data,
                on='user_id',
                how='left')

data = data.set_index(['user_id', 'post_id'])

data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,target,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,...,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,gender,age,country,city,exp_group,os,source
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
115699,5891,2021-11-11 15:26:25,0,movie,9.295552,0.37235,0.000202,4,0.273415,0.186306,0.29469,...,0.295488,0.407293,0.424216,0,17,Russia,Yaroslavl,4,Android,organic
115699,3722,2021-11-11 15:29:04,0,covid,3.334907,0.377945,7.2e-05,2,0.252464,0.280438,0.052103,...,0.24621,0.298523,0.410573,0,17,Russia,Yaroslavl,4,Android,organic
115699,1806,2021-11-11 15:29:16,0,sport,11.597768,0.333857,0.000252,0,0.106828,0.260252,0.238966,...,0.239119,0.368559,0.388291,0,17,Russia,Yaroslavl,4,Android,organic
115699,34,2021-11-11 15:30:14,0,business,9.813704,0.274861,0.000213,7,0.322322,0.342413,0.314194,...,0.181363,0.421279,0.431956,0,17,Russia,Yaroslavl,4,Android,organic
115699,3335,2021-11-11 15:31:12,0,covid,2.729815,0.501286,5.9e-05,2,0.320389,0.326815,0.099675,...,0.301411,0.320111,0.446397,0,17,Russia,Yaroslavl,4,Android,organic


In [42]:
data['weekend'] = 0
data.loc[pd.to_datetime(data.timestamp).dt.dayofweek > 5, 'weekend'] = 1
data['hour'] = pd.to_datetime(data.timestamp).dt.hour
data.insert(len(data.columns) - 1, 'target', data.pop('target'))
data

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,...,gender,age,country,city,exp_group,os,source,weekend,hour,target
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
115699,5891,2021-11-11 15:26:25,movie,9.295552,0.372350,0.000202,4,0.273415,0.186306,0.294690,0.323835,...,0,17,Russia,Yaroslavl,4,Android,organic,0,15,0
115699,3722,2021-11-11 15:29:04,covid,3.334907,0.377945,0.000072,2,0.252464,0.280438,0.052103,0.300293,...,0,17,Russia,Yaroslavl,4,Android,organic,0,15,0
115699,1806,2021-11-11 15:29:16,sport,11.597768,0.333857,0.000252,0,0.106828,0.260252,0.238966,0.292623,...,0,17,Russia,Yaroslavl,4,Android,organic,0,15,0
115699,34,2021-11-11 15:30:14,business,9.813704,0.274861,0.000213,7,0.322322,0.342413,0.314194,0.301182,...,0,17,Russia,Yaroslavl,4,Android,organic,0,15,0
115699,3335,2021-11-11 15:31:12,covid,2.729815,0.501286,0.000059,2,0.320389,0.326815,0.099675,0.332633,...,0,17,Russia,Yaroslavl,4,Android,organic,0,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108908,1049,2021-10-21 20:10:09,politics,9.356786,0.354618,0.000203,6,0.381299,0.416364,0.401152,0.383118,...,0,25,Russia,Moscow,2,iOS,organic,0,20,0
108908,4724,2021-10-21 20:15:52,movie,5.893446,0.401411,0.000128,4,0.342960,0.278021,0.356239,0.382379,...,0,25,Russia,Moscow,2,iOS,organic,0,20,1
108908,1091,2021-10-21 20:15:54,politics,9.732873,0.604797,0.000212,6,0.332922,0.367044,0.333849,0.358410,...,0,25,Russia,Moscow,2,iOS,organic,0,20,0
108908,3086,2021-10-21 20:17:38,covid,3.238184,0.452428,0.000070,2,0.293016,0.306961,0.056760,0.324757,...,0,25,Russia,Moscow,2,iOS,organic,0,20,0


In [None]:
max(data.timestamp), min(data.timestamp)

# Catboost training

In [44]:
# Train/test split based on date
df_train = data[data.timestamp < '2021-12-12']
df_test = data[data.timestamp >= '2021-12-12']

df_train = df_train.drop('timestamp', axis=1)
df_test = df_test.drop('timestamp', axis=1)

X_train = df_train.drop('target', axis=1)
X_test = df_test.drop('target', axis=1)

y_train = df_train['target']
y_test = df_test['target']

y_test.shape[0] / feed_data.shape[0]

0.1970230544907325

In [45]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,...,DistanceTo10thCluster,gender,age,country,city,exp_group,os,source,weekend,hour
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
115699,5891,movie,9.295552,0.372350,0.000202,4,0.273415,0.186306,0.294690,0.323835,0.123434,...,0.424216,0,17,Russia,Yaroslavl,4,Android,organic,0,15
115699,3722,covid,3.334907,0.377945,0.000072,2,0.252464,0.280438,0.052103,0.300293,0.303800,...,0.410573,0,17,Russia,Yaroslavl,4,Android,organic,0,15
115699,1806,sport,11.597768,0.333857,0.000252,0,0.106828,0.260252,0.238966,0.292623,0.289698,...,0.388291,0,17,Russia,Yaroslavl,4,Android,organic,0,15
115699,34,business,9.813704,0.274861,0.000213,7,0.322322,0.342413,0.314194,0.301182,0.367904,...,0.431956,0,17,Russia,Yaroslavl,4,Android,organic,0,15
115699,3335,covid,2.729815,0.501286,0.000059,2,0.320389,0.326815,0.099675,0.332633,0.349299,...,0.446397,0,17,Russia,Yaroslavl,4,Android,organic,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108908,1049,politics,9.356786,0.354618,0.000203,6,0.381299,0.416364,0.401152,0.383118,0.436848,...,0.495987,0,25,Russia,Moscow,2,iOS,organic,0,20
108908,4724,movie,5.893446,0.401411,0.000128,4,0.342960,0.278021,0.356239,0.382379,0.177550,...,0.463016,0,25,Russia,Moscow,2,iOS,organic,0,20
108908,1091,politics,9.732873,0.604797,0.000212,6,0.332922,0.367044,0.333849,0.358410,0.392495,...,0.463956,0,25,Russia,Moscow,2,iOS,organic,0,20
108908,3086,covid,3.238184,0.452428,0.000070,2,0.293016,0.306961,0.056760,0.324757,0.334839,...,0.431764,0,25,Russia,Moscow,2,iOS,organic,0,20


In [46]:
object_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'os', 'source',
    'weekend', 'hour'
]

In [48]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(iterations=1000,
                              learning_rate=1,
                              depth=2,
                              task_type='GPU')

catboost.fit(X_train, y_train, object_cols)

0:	learn: 0.3526883	total: 164ms	remaining: 2m 43s
1:	learn: 0.3469343	total: 309ms	remaining: 2m 34s
2:	learn: 0.3460925	total: 427ms	remaining: 2m 21s
3:	learn: 0.3455764	total: 553ms	remaining: 2m 17s
4:	learn: 0.3453323	total: 679ms	remaining: 2m 15s
5:	learn: 0.3448874	total: 816ms	remaining: 2m 15s
6:	learn: 0.3448091	total: 945ms	remaining: 2m 14s
7:	learn: 0.3442971	total: 1.07s	remaining: 2m 13s
8:	learn: 0.3442447	total: 1.21s	remaining: 2m 13s
9:	learn: 0.3441287	total: 1.35s	remaining: 2m 14s
10:	learn: 0.3438032	total: 1.48s	remaining: 2m 13s
11:	learn: 0.3431573	total: 1.61s	remaining: 2m 12s
12:	learn: 0.3430767	total: 1.74s	remaining: 2m 11s
13:	learn: 0.3430015	total: 1.89s	remaining: 2m 13s
14:	learn: 0.3429569	total: 2.06s	remaining: 2m 15s
15:	learn: 0.3420545	total: 2.18s	remaining: 2m 14s
16:	learn: 0.3419288	total: 2.31s	remaining: 2m 13s
17:	learn: 0.3415843	total: 2.43s	remaining: 2m 12s
18:	learn: 0.3414100	total: 2.56s	remaining: 2m 12s
19:	learn: 0.3411519	t

<catboost.core.CatBoostClassifier at 0x26e9c399be0>

In [49]:
from sklearn.metrics import roc_auc_score

print(f"Качество на трейне: {roc_auc_score(y_train, catboost.predict_proba(X_train)[:, 1])}")
print(f"Качество на тесте: {roc_auc_score(y_test, catboost.predict_proba(X_test)[:, 1])}")

Качество на трейне: 0.6843964344177648
Качество на тесте: 0.6628233802755337


In [50]:
catboost.save_model('catboost')

In [None]:
# Write processed data to sql
post_info.to_sql('m_mishin_features_lesson_22_post', con=engine, if_exists='replace', index=False)

# Prod workflow example

In [71]:
post_test = pd.read_sql('m_mishin_features_lesson_22_post', con=engine)
post_test.head()

Unnamed: 0,post_id,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,1,business,8.748129,0.495805,0.00019,7,0.4432,0.478396,0.451612,0.435837,0.498823,0.437494,0.521054,0.330863,0.527461,0.537637
1,2,business,11.878472,0.308003,0.000258,7,0.296991,0.339678,0.308105,0.273176,0.370485,0.281348,0.252152,0.155099,0.409704,0.434167
2,3,business,12.67553,0.261799,0.000276,7,0.329922,0.361063,0.334318,0.318718,0.384941,0.308293,0.462507,0.175753,0.431099,0.468838
3,4,business,6.622786,0.537713,0.000144,7,0.290443,0.318269,0.272464,0.261157,0.346719,0.254609,0.4103,0.115072,0.381911,0.431798
4,5,business,6.352096,0.420251,0.000138,7,0.223669,0.257254,0.197249,0.220332,0.289619,0.164602,0.37176,0.121962,0.341579,0.38858


In [72]:
exmp_user_id = 322
single_user_data = user_data.loc[user_data['user_id'] == exmp_user_id]

In [73]:
# Add user features
df = pd.merge(post_test, single_user_data, how='cross')
df = df.set_index(['user_id', 'post_id'])

# Add time features
time = pd.Timestamp.today()
df['weekend'] = time.weekday()

df['hour'] = time.hour

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,...,DistanceTo10thCluster,gender,age,country,city,exp_group,os,source,weekend,hour
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
322,1,business,8.748129,0.495805,0.00019,7,0.4432,0.478396,0.451612,0.435837,0.498823,...,0.537637,0,51,Russia,Tomsk,2,Android,ads,0,10
322,2,business,11.878472,0.308003,0.000258,7,0.296991,0.339678,0.308105,0.273176,0.370485,...,0.434167,0,51,Russia,Tomsk,2,Android,ads,0,10
322,3,business,12.67553,0.261799,0.000276,7,0.329922,0.361063,0.334318,0.318718,0.384941,...,0.468838,0,51,Russia,Tomsk,2,Android,ads,0,10
322,4,business,6.622786,0.537713,0.000144,7,0.290443,0.318269,0.272464,0.261157,0.346719,...,0.431798,0,51,Russia,Tomsk,2,Android,ads,0,10
322,5,business,6.352096,0.420251,0.000138,7,0.223669,0.257254,0.197249,0.220332,0.289619,...,0.38858,0,51,Russia,Tomsk,2,Android,ads,0,10


In [74]:
# Predict chance of post getting a like
df['preds'] = catboost.predict_proba(df)[:, 1]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic,TotalTfIdf,MaxTfIdf,MeanTfIdf,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,...,gender,age,country,city,exp_group,os,source,weekend,hour,preds
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
322,1,business,8.748129,0.495805,0.00019,7,0.4432,0.478396,0.451612,0.435837,0.498823,...,0,51,Russia,Tomsk,2,Android,ads,0,10,0.179489
322,2,business,11.878472,0.308003,0.000258,7,0.296991,0.339678,0.308105,0.273176,0.370485,...,0,51,Russia,Tomsk,2,Android,ads,0,10,0.170375
322,3,business,12.67553,0.261799,0.000276,7,0.329922,0.361063,0.334318,0.318718,0.384941,...,0,51,Russia,Tomsk,2,Android,ads,0,10,0.241669
322,4,business,6.622786,0.537713,0.000144,7,0.290443,0.318269,0.272464,0.261157,0.346719,...,0,51,Russia,Tomsk,2,Android,ads,0,10,0.206758
322,5,business,6.352096,0.420251,0.000138,7,0.223669,0.257254,0.197249,0.220332,0.289619,...,0,51,Russia,Tomsk,2,Android,ads,0,10,0.109281


In [78]:
# Get ids of top 5 recommended posts
post_ids = list(df.sort_values('preds', ascending=False).head().reset_index().post_id)
post_ids

[342, 6306, 147, 4962, 532]

In [83]:
# Read post texts and topics from sql
post_texts = pd.read_sql('SELECT * FROM public.post_text_df', con=engine)

In [94]:
# Package recommendations into json format
import json

recs = post_texts.rename(columns={'post_id': 'id'}).set_index('id').loc[post_ids].reset_index()
recs = recs.to_json(orient='records')
recs = json.loads(recs)
recs

[{'id': 342,
  'text': 'Huge rush for Jet Airways shares\n\nIndian airline Jet Airways initial public offering was oversubscribed 16.2 times, bankers said on Friday.\n\nOver 85% of the bids were at the higher end of the price range of 1,050-1,125 rupees ($24-$26). Jet Airways, a low-fare airline, was founded by London-based ex-travel agent Naresh Goya, and controls 45% of the Indian domestic airline market. It sold 20% of its equity or 17.2 million shares in a bid to raise up to $443m (£230.8m). The price at which its shares will begin trading will be agreed over the weekend, bankers said. The demand for the IPO was impressive. We believe that over the next two years, the domestic aviation sector promises strong growth, even though fuel prices could be high, said Hiten Mehta, manager of merchant banking firm, Fortune Financial Services. India began to open up its domestic airline market - previously dominated by state-run carrier Indian Airlines - in the 1990s. Jet began flying in 1993