In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

pd.options.display.max_columns = 500

In [4]:
import os
from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session, sessionmaker, relationship
from sqlalchemy import create_engine, func, Text, Integer, Column, ForeignKey, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base
from typing import List, Optional
import datetime
from pydantic import BaseModel
from loguru import logger

In [5]:
# Create a URL object to connect to DB
SQLALCHEMY_DATABASE_URL = "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml"
# Create engine and link it to the URL
engine = create_engine(SQLALCHEMY_DATABASE_URL)
# Instantiate a Session maker object used to create sessions with required parameters
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Create a parental class Base from which other ORM classes will inherit
Base = declarative_base()

In [6]:
# Create function that facilitates downloading huge datasets to pandas dataframes by bathes/chunks
def batch_load_sql(query: str):
    engine = create_engine("postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml")
    conn = engine.connect().execution_options(
        stream_results=True)
    chunks = []
    for chunk_df in pd.read_sql(query, conn, chunksize=200000):
        chunks.append(chunk_df)
        logger.info(f'Got chunk: {len(chunk_df)}')
    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [7]:
posts_df = batch_load_sql(
    """
    SELECT *
    FROM public.post_text_df
    """)
    

2022-10-02 20:01:56.163 | INFO     | __main__:batch_load_sql:9 - Got chunk: 7023


In [8]:
users_df = batch_load_sql(
    """
    SELECT *
    FROM public.user_data
    """
)

2022-10-02 20:02:04.617 | INFO     | __main__:batch_load_sql:9 - Got chunk: 163205


In [9]:
users_df.describe()

Unnamed: 0,user_id,gender,age,exp_group
count,163205.0,163205.0,163205.0,163205.0
mean,85070.371759,0.551331,27.195405,1.997598
std,48971.63995,0.49736,10.239158,1.413644
min,200.0,0.0,14.0,0.0
25%,41030.0,0.0,19.0,1.0
50%,85511.0,1.0,24.0,2.0
75%,127733.0,1.0,33.0,3.0
max,168552.0,1.0,95.0,4.0


In [11]:
feed_df = batch_load_sql(
        """
        SELECT timestamp, user_id, post_id, action
        FROM (
                SELECT *,
                        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY user_id) as row_num
                FROM feed_data
            ) AS s
         WHERE row_num <= 50
        """
)

2022-10-02 20:05:01.809 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:05.252 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:08.376 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:11.317 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:14.013 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:16.748 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:19.380 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:22.141 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:24.726 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:27.376 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:30.311 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022-10-02 20:05:32.658 | INFO     | __main__:batch_load_sql:9 - Got chunk: 200000
2022

In [21]:
posts_features = posts_df.copy()

In [22]:
import nltk

In [23]:
from nltk.stem import WordNetLemmatizer

In [25]:
## Text preprocessing
import re
import string

from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer

wnl = WordNetLemmatizer()

def preprocessing(text, token=wnl):
    text = text.lower()
    text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    
    while "  " in text:
        text = text.replace("  ", " ")
    
    text = text.strip()
        
    text = ' '.join([token.lemmatize(x) for x in text.split(' ')])
    return text


vectorizer = TfidfVectorizer(
    stop_words='english',
    preprocessor=preprocessing
)

In [26]:
tfidf_data = (
    vectorizer.fit_transform(posts_features['text']).toarray()
)

tfidf_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
tfidf_data.shape

(7023, 42044)

In [28]:
tfidf_data = pd.DataFrame(
    tfidf_data,
    index=posts_features.post_id,
    columns=vectorizer.get_feature_names_out()
)

tfidf_data

Unnamed: 0_level_0,aa,aaa,aaaaah,aaaahhhhhhh,aaaggghhhhhhh,aaah,aac,aadc,aahing,aai,aaila,aaja,aaliyah,aaliyahs,aaltra,aames,aamir,aanes,aankhen,aapl,aara,aaran,aardman,aarhus,aaron,aatish,aavjo,ab,aba,abacus,abandon,abandoned,abandoning,abandonment,abate,abatement,abating,abba,abbas,abbasi,abbey,abbot,abbott,abbotts,abbreviated,abbu,abby,abc,abd,abdellatif,abdicated,abdication,abdomen,abdominal,abduct,abducted,abduction,abductor,abdul,abdullah,abdullatif,abe,abebe,abedded,abel,abensur,aberavon,aberdeen,abernethie,aberration,aberystwyth,abetted,abetting,abeyance,abeyie,abeyies,abhijeetmonet,abhishe,abhishek,abhorrent,abi,abide,abided,abiding,abigail,ability,abish,abisheks,abiyote,abject,ablank,able,ablinken,ably,abn,abnegation,abnormal,abnormality,abo,aboard,aboards,abolish,abolished,abolishing,abolition,abominable,abomination,aboriginal,abort,aborted,abortion,abortionist,abortive,abou,abound,abounds,aboutbalasore,aboutroughly,abraham,abramovich,abramovichs,abridged,abroad,abrupt,abruptly,absa,absas,absconded,absence,absent,absentee,absolute,absolutely,absolution,absolutley,absorb,absorbed,absorbing,absorbs,abstain,abstention,abstract,abstraction,absurd,absurder,absurdist,absurdity,absurdly,absurdness,abt,abtahi,abu,abundance,abundantly,aburizal,abuse,abused,abuser,abusing,abusive,abut,abuzz,abysmal,abysmally,abyss,ac,acacia,acadamy,academia,academic,academically,academictwitter,academie,academy,acapulco,acasuso,acc,acce,accelerate,accelerated,accelerating,acceleration,accelerator,accent,accented,accentuate,accentuates,accentuating,accept,acceptable,acceptably,acceptance,accepted,accepting,accepts,access,accessed,accessibility,accessible,accessing,accession,accessory,accident,accidental,accidentally,accidently,acclaim,acclaimed,acclimate,acclimatise,accolade,accommodate,accommodated,accommodating,accommodation,accomodate,accompanied,accompanies,accompaniment,accompany,accompanying,accomplice,accomplish,accomplished,accomplishes,accomplishment,accord,accordance,according,accordingly,accordion,accorsi,accosiation,account,accountability,accountable,accountancy,accountant,accounted,accounting,accoustic,accra,accredited,accretive,accrued,accumulate,accumulated,accumulates,accumulating,accumulation,...,yodelling,yoffi,yoga,yojiro,yokel,yoko,yolande,yomiuri,yomuri,yon,yonca,yoo,yootha,yoran,yorans,yore,york,yorker,yorkregion,yorkshire,yosemite,yoshi,yoshiaki,yoshifumi,yoshino,yoshinoya,yosync,youcrave,youd,youe,yougov,youll,young,youngblood,younger,youngest,youngmans,youngster,younguns,youre,yourtv,youself,youssou,youth,youthful,youtube,youve,youwecan,youzhny,yoxall,yr,yself,yu,yuan,yuck,yudhoyonos,yue,yuen,yuganksneftegas,yugansk,yuganskneftegas,yuganskneftegaz,yuganskneftgas,yugansks,yugoslavia,yugoslavian,yuji,yuko,yukon,yukos,yukoss,yul,yule,yuletide,yulia,yum,yumiko,yumminot,yun,yungmin,yup,yuppie,yuppy,yuri,yury,yushchenko,yushchenkos,yusuf,yuvstrong,yvaine,yvette,yvonne,zabaleta,zabalza,zabriskie,zach,zack,zadie,zafi,zag,zahn,zaidi,zaire,zajec,zajecs,zakuani,zale,zalman,zambia,zambian,zander,zane,zang,zantaras,zanuck,zanussi,zany,zapata,zapped,zar,zara,zardari,zardine,zaslow,zatoichi,zaz,zazzle,zb,zdf,zdnet,zdravstvomk,ze,zeal,zealand,zealander,zealot,zealous,zealousness,zecchin,zechs,zed,zee,zeeuw,zeffirelli,zeh,zeitgeist,zelah,zelda,zeldas,zellweger,zellwegers,zelwegger,zemeckis,zen,zenden,zenia,zenith,zentropa,zep,zephaniah,zephyr,zeppelin,zerneck,zero,zeroni,zest,zesty,zeta,zetterqvist,zeus,zhang,zhaoxing,zheng,zib,zidane,zidanes,ziegfeld,ziers,zig,zigzagged,zika,zilch,zillion,zillonlife,zima,zimbabwe,zinc,zindulka,zine,zineb,zinedine,zing,zinger,zinneman,zionism,zip,zipless,zither,zithromax,ziyi,zmed,zodiac,zoe,zoellick,zoey,zoheb,zola,zomba,zombi,zombic,zombie,zombieapocalypseready,zomcom,zon,zone,zonealarm,zoned,zoo,zoolander,zoom,zooropa,zoot,zorba,zorina,zornotza,zorro,zosch,zsigmond,zuari,zubair,zucco,zuccos,zucker,zuckerman,zuckers,zues,zuf,zula,zully,zuluaga,zungia,zuniga,zurers,zurich,zurlini,zurlinis,zutons,zvonareva,zvyagintsev,zzzzzzzzz
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1,Unnamed: 353_level_1,Unnamed: 354_level_1,Unnamed: 355_level_1,Unnamed: 356_level_1,Unnamed: 357_level_1,Unnamed: 358_level_1,Unnamed: 359_level_1,Unnamed: 360_level_1,Unnamed: 361_level_1,Unnamed: 362_level_1,Unnamed: 363_level_1,Unnamed: 364_level_1,Unnamed: 365_level_1,Unnamed: 366_level_1,Unnamed: 367_level_1,Unnamed: 368_level_1,Unnamed: 369_level_1,Unnamed: 370_level_1,Unnamed: 371_level_1,Unnamed: 372_level_1,Unnamed: 373_level_1,Unnamed: 374_level_1,Unnamed: 375_level_1,Unnamed: 376_level_1,Unnamed: 377_level_1,Unnamed: 378_level_1,Unnamed: 379_level_1,Unnamed: 380_level_1,Unnamed: 381_level_1,Unnamed: 382_level_1,Unnamed: 383_level_1,Unnamed: 384_level_1,Unnamed: 385_level_1,Unnamed: 386_level_1,Unnamed: 387_level_1,Unnamed: 388_level_1,Unnamed: 389_level_1,Unnamed: 390_level_1,Unnamed: 391_level_1,Unnamed: 392_level_1,Unnamed: 393_level_1,Unnamed: 394_level_1,Unnamed: 395_level_1,Unnamed: 396_level_1,Unnamed: 397_level_1,Unnamed: 398_level_1,Unnamed: 399_level_1,Unnamed: 400_level_1,Unnamed: 401_level_1,Unnamed: 402_level_1,Unnamed: 403_level_1,Unnamed: 404_level_1,Unnamed: 405_level_1,Unnamed: 406_level_1,Unnamed: 407_level_1,Unnamed: 408_level_1,Unnamed: 409_level_1,Unnamed: 410_level_1,Unnamed: 411_level_1,Unnamed: 412_level_1,Unnamed: 413_level_1,Unnamed: 414_level_1,Unnamed: 415_level_1,Unnamed: 416_level_1,Unnamed: 417_level_1,Unnamed: 418_level_1,Unnamed: 419_level_1,Unnamed: 420_level_1,Unnamed: 421_level_1,Unnamed: 422_level_1,Unnamed: 423_level_1,Unnamed: 424_level_1,Unnamed: 425_level_1,Unnamed: 426_level_1,Unnamed: 427_level_1,Unnamed: 428_level_1,Unnamed: 429_level_1,Unnamed: 430_level_1,Unnamed: 431_level_1,Unnamed: 432_level_1,Unnamed: 433_level_1,Unnamed: 434_level_1,Unnamed: 435_level_1,Unnamed: 436_level_1,Unnamed: 437_level_1,Unnamed: 438_level_1,Unnamed: 439_level_1,Unnamed: 440_level_1,Unnamed: 441_level_1,Unnamed: 442_level_1,Unnamed: 443_level_1,Unnamed: 444_level_1,Unnamed: 445_level_1,Unnamed: 446_level_1,Unnamed: 447_level_1,Unnamed: 448_level_1,Unnamed: 449_level_1,Unnamed: 450_level_1,Unnamed: 451_level_1,Unnamed: 452_level_1,Unnamed: 453_level_1,Unnamed: 454_level_1,Unnamed: 455_level_1,Unnamed: 456_level_1,Unnamed: 457_level_1,Unnamed: 458_level_1,Unnamed: 459_level_1,Unnamed: 460_level_1,Unnamed: 461_level_1,Unnamed: 462_level_1,Unnamed: 463_level_1,Unnamed: 464_level_1,Unnamed: 465_level_1,Unnamed: 466_level_1,Unnamed: 467_level_1,Unnamed: 468_level_1,Unnamed: 469_level_1,Unnamed: 470_level_1,Unnamed: 471_level_1,Unnamed: 472_level_1,Unnamed: 473_level_1,Unnamed: 474_level_1,Unnamed: 475_level_1,Unnamed: 476_level_1,Unnamed: 477_level_1,Unnamed: 478_level_1,Unnamed: 479_level_1,Unnamed: 480_level_1,Unnamed: 481_level_1,Unnamed: 482_level_1,Unnamed: 483_level_1,Unnamed: 484_level_1,Unnamed: 485_level_1,Unnamed: 486_level_1,Unnamed: 487_level_1,Unnamed: 488_level_1,Unnamed: 489_level_1,Unnamed: 490_level_1,Unnamed: 491_level_1,Unnamed: 492_level_1,Unnamed: 493_level_1,Unnamed: 494_level_1,Unnamed: 495_level_1,Unnamed: 496_level_1,Unnamed: 497_level_1,Unnamed: 498_level_1,Unnamed: 499_level_1,Unnamed: 500_level_1,Unnamed: 501_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052302,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
[f"DistanceTo{ith}thCluster" for ith in range(1, 11)]

['DistanceTo1thCluster',
 'DistanceTo2thCluster',
 'DistanceTo3thCluster',
 'DistanceTo4thCluster',
 'DistanceTo5thCluster',
 'DistanceTo6thCluster',
 'DistanceTo7thCluster',
 'DistanceTo8thCluster',
 'DistanceTo9thCluster',
 'DistanceTo10thCluster']

In [30]:
### Пытаемся кластеризовать тексты

from sklearn.decomposition import PCA

centered = tfidf_data - tfidf_data.mean()

pca = PCA(n_components=15)
pca_decomp = pca.fit_transform(centered)

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=0).fit(pca_decomp)

posts_features['TextCluster'] = kmeans.labels_

dists_columns = ['DistanceTo1thCluster',
                 'DistanceTo2thCluster',
                 'DistanceTo3thCluster',
                 'DistanceTo4thCluster',
                 'DistanceTo5thCluster',
                 'DistanceTo6thCluster',
                 'DistanceTo7thCluster',
                 'DistanceTo8thCluster',
                 'DistanceTo9thCluster',
                 'DistanceTo10thCluster']

dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    columns=dists_columns
)

dists_df.head()

Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,0.426586,0.444366,0.452641,0.419371,0.407254,0.480055,0.569855,0.284186,0.53109,0.475146
1,0.291237,0.31099,0.316231,0.264601,0.242727,0.18772,0.476287,0.166949,0.422299,0.348385
2,0.331665,0.346235,0.34927,0.316535,0.289395,0.424072,0.498222,0.113295,0.457312,0.373276
3,0.330233,0.321524,0.339407,0.299711,0.274803,0.392143,0.45014,0.119455,0.433026,0.366285
4,0.223714,0.218939,0.237063,0.210986,0.118696,0.323611,0.420762,0.153735,0.361846,0.271969


In [31]:
posts_features = pd.concat((posts_features, dists_df), axis=1)

In [32]:
posts_features.head()

Unnamed: 0,post_id,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,1,UK economy facing major risks\n\nThe UK manufa...,business,7,0.426586,0.444366,0.452641,0.419371,0.407254,0.480055,0.569855,0.284186,0.53109,0.475146
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,7,0.291237,0.31099,0.316231,0.264601,0.242727,0.18772,0.476287,0.166949,0.422299,0.348385
2,3,Asian quake hits European shares\n\nShares in ...,business,7,0.331665,0.346235,0.34927,0.316535,0.289395,0.424072,0.498222,0.113295,0.457312,0.373276
3,4,India power shares jump on debut\n\nShares in ...,business,7,0.330233,0.321524,0.339407,0.299711,0.274803,0.392143,0.45014,0.119455,0.433026,0.366285
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,4,0.223714,0.218939,0.237063,0.210986,0.118696,0.323611,0.420762,0.153735,0.361846,0.271969


In [33]:
df = pd.merge(feed_df, posts_features,
              how='left', 
              on='post_id')
df.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster
0,2021-12-29 15:18:42,200,994,view,Lib Dems highlight problem debt\n\nPeople vuln...,politics,5,0.493958,0.49978,0.498378,0.481224,0.455998,0.163969,0.6143,0.444176,0.579048,0.516398
1,2021-12-29 15:16:48,200,1670,view,Federer claims Dubai crown\n\nWorld number one...,sport,0,0.222183,0.395421,0.348419,0.399323,0.320338,0.445638,0.544996,0.376538,0.493987,0.35239
2,2021-12-29 15:16:35,200,1649,view,Brentford v Southampton\n\nGriffin Park\n\nTue...,sport,0,0.080808,0.325031,0.31489,0.353406,0.244865,0.41455,0.495929,0.311505,0.432167,0.340326
3,2021-12-29 15:15:21,200,2680,view,Don’t be deceived by cases down 6% for the wee...,covid,4,0.280805,0.259116,0.292403,0.307807,0.202112,0.386911,0.256535,0.251379,0.369502,0.324738
4,2021-12-29 15:14:32,200,1335,view,UKIP could sue Veritas defectors\n\nThe UK Ind...,politics,5,0.310467,0.333349,0.329885,0.325281,0.262486,0.127745,0.505744,0.293465,0.43782,0.355303


In [34]:
### А еще научимся выделять признаки
### Из timestamp!
### Согласитесь, от времени просмотра может зависеть
### Склонность пользователей лайкать или игнорировать посты

df['hour'] = pd.to_datetime(df['timestamp']).apply(lambda x: x.hour)
df['month'] = pd.to_datetime(df['timestamp']).apply(lambda x: x.month)

df.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month
0,2021-12-29 15:18:42,200,994,view,Lib Dems highlight problem debt\n\nPeople vuln...,politics,5,0.493958,0.49978,0.498378,0.481224,0.455998,0.163969,0.6143,0.444176,0.579048,0.516398,15,12
1,2021-12-29 15:16:48,200,1670,view,Federer claims Dubai crown\n\nWorld number one...,sport,0,0.222183,0.395421,0.348419,0.399323,0.320338,0.445638,0.544996,0.376538,0.493987,0.35239,15,12
2,2021-12-29 15:16:35,200,1649,view,Brentford v Southampton\n\nGriffin Park\n\nTue...,sport,0,0.080808,0.325031,0.31489,0.353406,0.244865,0.41455,0.495929,0.311505,0.432167,0.340326,15,12
3,2021-12-29 15:15:21,200,2680,view,Don’t be deceived by cases down 6% for the wee...,covid,4,0.280805,0.259116,0.292403,0.307807,0.202112,0.386911,0.256535,0.251379,0.369502,0.324738,15,12
4,2021-12-29 15:14:32,200,1335,view,UKIP could sue Veritas defectors\n\nThe UK Ind...,politics,5,0.310467,0.333349,0.329885,0.325281,0.262486,0.127745,0.505744,0.293465,0.43782,0.355303,15,12


In [35]:
df_ = df.copy()

In [36]:
df_['id'] = df['user_id'].apply(str) + '_' + df['post_id'].apply(str)
df_.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,id
0,2021-12-29 15:18:42,200,994,view,Lib Dems highlight problem debt\n\nPeople vuln...,politics,5,0.493958,0.49978,0.498378,0.481224,0.455998,0.163969,0.6143,0.444176,0.579048,0.516398,15,12,200_994
1,2021-12-29 15:16:48,200,1670,view,Federer claims Dubai crown\n\nWorld number one...,sport,0,0.222183,0.395421,0.348419,0.399323,0.320338,0.445638,0.544996,0.376538,0.493987,0.35239,15,12,200_1670
2,2021-12-29 15:16:35,200,1649,view,Brentford v Southampton\n\nGriffin Park\n\nTue...,sport,0,0.080808,0.325031,0.31489,0.353406,0.244865,0.41455,0.495929,0.311505,0.432167,0.340326,15,12,200_1649
3,2021-12-29 15:15:21,200,2680,view,Don’t be deceived by cases down 6% for the wee...,covid,4,0.280805,0.259116,0.292403,0.307807,0.202112,0.386911,0.256535,0.251379,0.369502,0.324738,15,12,200_2680
4,2021-12-29 15:14:32,200,1335,view,UKIP could sue Veritas defectors\n\nThe UK Ind...,politics,5,0.310467,0.333349,0.329885,0.325281,0.262486,0.127745,0.505744,0.293465,0.43782,0.355303,15,12,200_1335


In [37]:
df_ = df_.sort_values('timestamp', ascending=True)
df_ = df_[~df_.index.duplicated(keep='last')]
df_.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,id
3143200,2021-10-01 06:01:40,66609,3270,view,Even the statues in japan wear masks so you ca...,covid,1,0.280066,0.065483,0.276124,0.305655,0.178358,0.397917,0.395493,0.266062,0.322408,0.305752,6,10,66609_3270
763050,2021-10-01 06:01:40,15471,2810,view,@MusicMX4 @MrAndyNgo I thought #MAGA was a 201...,covid,4,0.214402,0.182987,0.201694,0.267942,0.082693,0.356306,0.425873,0.235429,0.341294,0.224357,6,10,15471_2810
6543213,2021-10-01 06:01:40,136194,1205,view,Jack Cunningham to stand down\n\nVeteran Labou...,politics,5,0.397813,0.409351,0.409269,0.411037,0.355927,0.073197,0.553144,0.360786,0.505182,0.433291,6,10,136194_1205
3421889,2021-10-01 06:01:52,72228,1457,view,Navratilova hits out at critics\n\nMartina Nav...,sport,0,0.077552,0.249735,0.239572,0.289734,0.149401,0.364179,0.452194,0.255781,0.383354,0.267293,6,10,72228_1457
7841860,2021-10-01 06:01:52,162181,3096,view,U.S. ambassador to Iceland Jeffrey Ross Gunter...,covid,4,0.215663,0.174034,0.216308,0.257157,0.074156,0.350378,0.413446,0.216969,0.33925,0.253221,6,10,162181_3096


In [38]:
df_['action'] = np.where(df_['action'] == 'like', 1, 0)
df_.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,id
3143200,2021-10-01 06:01:40,66609,3270,0,Even the statues in japan wear masks so you ca...,covid,1,0.280066,0.065483,0.276124,0.305655,0.178358,0.397917,0.395493,0.266062,0.322408,0.305752,6,10,66609_3270
763050,2021-10-01 06:01:40,15471,2810,0,@MusicMX4 @MrAndyNgo I thought #MAGA was a 201...,covid,4,0.214402,0.182987,0.201694,0.267942,0.082693,0.356306,0.425873,0.235429,0.341294,0.224357,6,10,15471_2810
6543213,2021-10-01 06:01:40,136194,1205,0,Jack Cunningham to stand down\n\nVeteran Labou...,politics,5,0.397813,0.409351,0.409269,0.411037,0.355927,0.073197,0.553144,0.360786,0.505182,0.433291,6,10,136194_1205
3421889,2021-10-01 06:01:52,72228,1457,0,Navratilova hits out at critics\n\nMartina Nav...,sport,0,0.077552,0.249735,0.239572,0.289734,0.149401,0.364179,0.452194,0.255781,0.383354,0.267293,6,10,72228_1457
7841860,2021-10-01 06:01:52,162181,3096,0,U.S. ambassador to Iceland Jeffrey Ross Gunter...,covid,4,0.215663,0.174034,0.216308,0.257157,0.074156,0.350378,0.413446,0.216969,0.33925,0.253221,6,10,162181_3096


In [39]:
df_.shape

(8160250, 20)

In [40]:
df_ = pd.merge(df_, users_df, 
               how='left', 
               on='user_id')
df_.shape

(8160250, 27)

In [41]:
df_.head()

Unnamed: 0,timestamp,user_id,post_id,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,id,gender,age,country,city,exp_group,os,source
0,2021-10-01 06:01:40,66609,3270,0,Even the statues in japan wear masks so you ca...,covid,1,0.280066,0.065483,0.276124,0.305655,0.178358,0.397917,0.395493,0.266062,0.322408,0.305752,6,10,66609_3270,1,47,Russia,Volsk,4,Android,ads
1,2021-10-01 06:01:40,15471,2810,0,@MusicMX4 @MrAndyNgo I thought #MAGA was a 201...,covid,4,0.214402,0.182987,0.201694,0.267942,0.082693,0.356306,0.425873,0.235429,0.341294,0.224357,6,10,15471_2810,1,21,Russia,Tambov,4,iOS,ads
2,2021-10-01 06:01:40,136194,1205,0,Jack Cunningham to stand down\n\nVeteran Labou...,politics,5,0.397813,0.409351,0.409269,0.411037,0.355927,0.073197,0.553144,0.360786,0.505182,0.433291,6,10,136194_1205,0,24,Russia,Moscow,3,iOS,organic
3,2021-10-01 06:01:52,72228,1457,0,Navratilova hits out at critics\n\nMartina Nav...,sport,0,0.077552,0.249735,0.239572,0.289734,0.149401,0.364179,0.452194,0.255781,0.383354,0.267293,6,10,72228_1457,1,21,Russia,Murom,3,iOS,ads
4,2021-10-01 06:01:52,162181,3096,0,U.S. ambassador to Iceland Jeffrey Ross Gunter...,covid,4,0.215663,0.174034,0.216308,0.257157,0.074156,0.350378,0.413446,0.216969,0.33925,0.253221,6,10,162181_3096,0,17,Russia,Lensk,4,Android,organic


In [42]:
df_ = df_.sort_values('user_id')
df_ = df_.set_index(['user_id', 'post_id'])

In [43]:
df_.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,action,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,id,gender,age,country,city,exp_group,os,source
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
200,2121,2021-12-24 14:02:51,0,US blogger fired by her airline\n\nA US airlin...,tech,7,0.260836,0.295062,0.285212,0.239522,0.208249,0.328989,0.468163,0.141739,0.411495,0.313986,14,12,200_2121,1,34,Russia,Degtyarsk,3,Android,ads
200,4723,2021-12-24 13:54:34,1,The TV ads for this movie showed the warlocks ...,movie,9,0.257716,0.261975,0.201641,0.288501,0.143663,0.36956,0.460427,0.278727,0.396328,0.083967,13,12,200_4723,1,34,Russia,Degtyarsk,3,Android,ads
200,1335,2021-12-29 15:14:32,0,UKIP could sue Veritas defectors\n\nThe UK Ind...,politics,5,0.310467,0.333349,0.329885,0.325281,0.262486,0.127745,0.505744,0.293465,0.43782,0.355303,15,12,200_1335,1,34,Russia,Degtyarsk,3,Android,ads
200,7042,2021-12-24 14:00:21,0,"The Brothers Quay are directors, judging by co...",movie,2,0.253907,0.240507,0.095663,0.288961,0.125242,0.379669,0.449162,0.272223,0.382642,0.21095,14,12,200_7042,1,34,Russia,Degtyarsk,3,Android,ads
200,4420,2021-12-24 13:43:12,0,I got stuck in traffic (I live in Sicily) on t...,movie,9,0.417558,0.458418,0.387908,0.463956,0.387064,0.504476,0.577366,0.457922,0.536436,0.293131,13,12,200_4420,1,34,Russia,Degtyarsk,3,Android,ads
200,3895,2021-12-29 14:50:37,0,#ICYMI: As part of our 13th Dialogues in Diver...,covid,4,0.222189,0.182845,0.220205,0.267956,0.077893,0.362287,0.410662,0.241207,0.347625,0.256962,14,12,200_3895,1,34,Russia,Degtyarsk,3,Android,ads
200,1689,2021-12-24 13:49:29,0,McClaren hails Boros Uefa spirit\n\nMiddlesbro...,sport,0,0.119404,0.34736,0.308467,0.325536,0.267037,0.389522,0.520924,0.310099,0.455152,0.320721,13,12,200_1689,1,34,Russia,Degtyarsk,3,Android,ads
200,6087,2021-12-29 14:50:01,0,Those reviewers who have complained that this ...,movie,4,0.20583,0.206933,0.162868,0.253217,0.038801,0.336118,0.428128,0.226006,0.361652,0.184541,14,12,200_6087,1,34,Russia,Degtyarsk,3,Android,ads
200,2629,2021-12-24 13:44:22,0,Why would anyone po$$ibly de$cribe #COVID19 as...,covid,1,0.259552,0.049353,0.249384,0.292817,0.15811,0.381181,0.381427,0.273814,0.313992,0.276575,13,12,200_2629,1,34,Russia,Degtyarsk,3,Android,ads
200,5181,2021-12-29 15:04:47,0,I saw this film when it was originally release...,movie,2,0.307381,0.304472,0.071731,0.327143,0.213382,0.420948,0.485808,0.326424,0.422883,0.267053,15,12,200_5181,1,34,Russia,Degtyarsk,3,Android,ads


In [44]:
### Уберем все ненужные колонки

df_ = df_.drop([
#    'timestamp',  ### timestamp пока оставим
    'id',
    'text',
],
    axis=1)

df_.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,action,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,hour,month,gender,age,country,city,exp_group,os,source
user_id,post_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
200,2121,2021-12-24 14:02:51,0,tech,7,0.260836,0.295062,0.285212,0.239522,0.208249,0.328989,0.468163,0.141739,0.411495,0.313986,14,12,1,34,Russia,Degtyarsk,3,Android,ads
200,4723,2021-12-24 13:54:34,1,movie,9,0.257716,0.261975,0.201641,0.288501,0.143663,0.36956,0.460427,0.278727,0.396328,0.083967,13,12,1,34,Russia,Degtyarsk,3,Android,ads
200,1335,2021-12-29 15:14:32,0,politics,5,0.310467,0.333349,0.329885,0.325281,0.262486,0.127745,0.505744,0.293465,0.43782,0.355303,15,12,1,34,Russia,Degtyarsk,3,Android,ads
200,7042,2021-12-24 14:00:21,0,movie,2,0.253907,0.240507,0.095663,0.288961,0.125242,0.379669,0.449162,0.272223,0.382642,0.21095,14,12,1,34,Russia,Degtyarsk,3,Android,ads
200,4420,2021-12-24 13:43:12,0,movie,9,0.417558,0.458418,0.387908,0.463956,0.387064,0.504476,0.577366,0.457922,0.536436,0.293131,13,12,1,34,Russia,Degtyarsk,3,Android,ads
200,3895,2021-12-29 14:50:37,0,covid,4,0.222189,0.182845,0.220205,0.267956,0.077893,0.362287,0.410662,0.241207,0.347625,0.256962,14,12,1,34,Russia,Degtyarsk,3,Android,ads
200,1689,2021-12-24 13:49:29,0,sport,0,0.119404,0.34736,0.308467,0.325536,0.267037,0.389522,0.520924,0.310099,0.455152,0.320721,13,12,1,34,Russia,Degtyarsk,3,Android,ads
200,6087,2021-12-29 14:50:01,0,movie,4,0.20583,0.206933,0.162868,0.253217,0.038801,0.336118,0.428128,0.226006,0.361652,0.184541,14,12,1,34,Russia,Degtyarsk,3,Android,ads
200,2629,2021-12-24 13:44:22,0,covid,1,0.259552,0.049353,0.249384,0.292817,0.15811,0.381181,0.381427,0.273814,0.313992,0.276575,13,12,1,34,Russia,Degtyarsk,3,Android,ads
200,5181,2021-12-29 15:04:47,0,movie,2,0.307381,0.304472,0.071731,0.327143,0.213382,0.420948,0.485808,0.326424,0.422883,0.267053,15,12,1,34,Russia,Degtyarsk,3,Android,ads


In [45]:
df_.shape

(8160250, 23)

In [47]:
### Как валидировать? Как разобьем на train и test?
### Предлагаю по времени, так как данные имеют 
### Временную структуру! Хотим корректно оценивать
### Вероятности для будущих рекомендаций

max(df_.timestamp), min(df_.timestamp)

(Timestamp('2021-12-29 23:51:06'), Timestamp('2021-10-01 06:01:40'))

In [48]:
### За отсечку возьмем 2021-12-15

df_train = df_[df_.timestamp < '2021-12-01']
df_val = df_[('2021-12-01' <= df_.timestamp) & (df_.timestamp < '2021-12-15')]
df_test = df_[df_.timestamp >= '2021-12-15']

df_train = df_train.drop('timestamp', axis=1)
df_val = df_val.drop('timestamp', axis=1)
df_test = df_test.drop('timestamp', axis=1)

X_train = df_train.drop('action', axis=1)
X_val = df_val.drop('action', axis=1)
X_test = df_test.drop('action', axis=1)

y_train = df_train['action']
y_val = df_val['action']
y_test = df_test['action']

y_train.shape, y_val.shape, y_test.shape

((5611054,), (1032117,), (1517079,))

In [50]:
## Encode categorical data

from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder

cat_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'hour', 'month',
    'os', 'source'
]

cols_for_ohe = [x for x in cat_cols if X_train[x].nunique() < 5]
cols_for_mte = [x for x in cat_cols if X_train[x].nunique() >= 5]

### Cохраним индексы этих колонок

cols_for_ohe_idx = [list(X_train.columns).index(col) for col in cols_for_ohe]
cols_for_mte_idx = [list(X_train.columns).index(col) for col in cols_for_mte]

transformer = [
    ('OneHotEncoder', OneHotEncoder(), cols_for_ohe_idx),
    ('MeanTargetEncoder', TargetEncoder(), cols_for_mte_idx)
]

col_transform = ColumnTransformer(transformers=transformer)

In [51]:
### Теперь обучим катбуст!

# from catboost import CatBoostClassifier

# catboost = CatBoostClassifier(iterations=100,
#                               learning_rate=1,
#                               depth=3, 
#                               task_type='GPU', 
#                               devices='0')

# catboost.fit(X_train, y_train, cat_cols)

In [55]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

df_train_2 = (df_[df_.timestamp < '2021-12-15']).drop('timestamp', axis=1)
df_test_2 = (df_[df_.timestamp >= '2021-12-15']).drop('timestamp', axis=1)

X_train_2= df_train_2.drop('action', axis=1)
X_test_2 = df_test_2.drop('action', axis=1)

y_train_2 = df_train_2['action']
y_test_2 = df_test_2['action']

depths = [2, 3, 4, 5],
learning_rates = [1e-3, 1e-2, 0.1, 1],
iterations = [50, 60, 70, 80, 90, 100]

        model = CatBoostClassifier(
                iterations=70,
                learning_rate=learning_rate,
                depth=depth,
                cat_features=cat_cols,
                task_type='GPU',
                devices='0'
            )
        model.fit(X_train_2, y_train_2)
        print(f'Depth: {depth}, learning_rate: {learning_rate}')
        print(f"ROC-AUC score on train: {roc_auc_score(y_train_2, catboost.predict_proba(X_train_2)[:, 1])}")
        print(f"ROC-AUC score on test: {roc_auc_score(y_test_2, catboost.predict_proba(X_test_2)[:, 1])}")
        print('____________________________')
            


CatBoostError: C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/json_helper.h:171: Can't parse parameter "learning_rate" with value: [0.001,0.01,0.1,1]

from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

parameters = {
    'depth': [2, 3, 4, 5],
    'learning_rate': [1e-3, 1e-2, 0.1, 1],
    'iterations': [50, 60, 70, 80, 90, 100]
}

catboost_2 = CatBoostClassifier(cat_features=cat_cols)

catboost_2_search = GridSearchCV(
    estimator=catboost_2,
    param_grid = parameters, 
    cv = (train_idx, val_idx), 
    n_jobs=-1
)
catboost_2_search.fit(X_train, y_train)

In [None]:
### Замерим качество работы такой модели
### Возьмем ROC-AUC
from sklearn.metrics import roc_auc_score


print(f"Качество на трейне: {roc_auc_score(y_train, catboost.predict_proba(X_train)[:, 1])}")
print(f"Качество на тесте: {roc_auc_score(y_val, catboost.predict_proba(X_val)[:, 1])}")

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importance, names, model_type):
    
    #Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    
    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
plot_feature_importance(catboost.feature_importances_,X_train.columns,'Catboost')

In [None]:
### Save the model

catboost.save_model(
    'catboost_model',
    format="cbm"                  
)

In [None]:
### Put the posts_features to DataBase.

posts_features.to_sql(    
   "pg_posts_features",                    
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml",                      
    schema="public",                   
    if_exists='replace'            
   )                              
        

In [None]:
### Ensure if post_features have been correctly uploaded.

test_ = pd.read_sql(
    """SELECT * FROM public.pg_posts_features""",
    
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
)

test_

In [None]:
cols = users_df[users_df['user_id'] == 200].drop('user_id', axis=1).columns
cols

In [None]:
vals = users_df[users_df['user_id'] == 200].drop('user_id', axis=1).values[0]
vals

In [None]:
a = dict(zip(cols, vals))
a

In [None]:
u = posts_features.assign(**a)
u

In [None]:
loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_model')

In [None]:
loaded_model.get_all_params()

In [None]:
loaded_model.predict_proba(X_test)[:, 1]

In [None]:
loaded_model.get_params()

In [None]:
id_1000 = pd.read_sql(
    """
    SELECT *
    FROM public.user_data
    WHERE user_id = 1000
    """,
    con=engine
)
id_1000

In [None]:
f = pd.read_sql(
    """
    SELECT *
    FROM public.pg_posts_features
    """,
    con=engine
)
f

In [None]:
content = f[['post_id', 'text', 'topic']]
f = f.drop(['index', 'text'], axis=1)
f

In [None]:
liked_posts = pd.read_sql(
    """
    SELECT DISTINCT post_id
    FROM public.feed_data
    WHERE (user_id = 1000) AND (action = 'like')
    """,
    con=engine)

liked_posts

In [None]:
id_1000 = id_1000.drop('user_id', axis=1)
id_1000

In [None]:
add_user_features = dict(zip(id_1000.columns, id_1000.values[0]))
add_user_features

In [None]:
f = f.assign(**add_user_features)
f = f.set_index('post_id')
f

In [None]:
content

In [None]:
time = datetime.datetime(2021, 12, 25)
time

In [None]:
f['hour'] = time.hour
f['month'] = time.month
f

In [None]:
filtered_f = f[~f.index.isin(liked_posts.post_id.values)]
filtered_f

In [None]:
x = filtered_f
x

In [None]:
catboost_2 = CatBoostClassifier(iterations=100,
                              learning_rate=1,
                              depth=3, 
                              task_type='GPU', 
                              devices='0')

catboost.fit(X_train, y_train, cat_cols)