In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Connection string for the PostgreSQL database used by every query below.
# Prefer supplying it through the STARTML_DB_URL environment variable so that
# credentials stay out of the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): the fallback embeds a username and password in plain text —
# remove it once the environment variable is configured everywhere.
db_url = os.environ.get(
    "STARTML_DB_URL",
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml",
)
engine = create_engine(db_url)

Для начала посмотрим, сколько записей в каждой из таблиц.

In [2]:
def check_table_size(table):
    """Return the row count of ``table`` as reported by ``SELECT COUNT(*)``.

    The table name is interpolated directly into the SQL text, so it must be
    a trusted identifier.
    """
    sql = f"""
    SELECT COUNT(*)
    FROM {table};
    """
    # The result is a 1x1 frame; pull out its single cell.
    return pd.read_sql(sql, engine).iat[0, 0]

def check_new_feed_data_size():
    """Count ``feed_data`` rows, excluding those with action 'view' and target 1."""
    sql = """
    SELECT COUNT(*)
    FROM feed_data
    WHERE NOT (action = 'view' AND target = 1);
    """
    # The result is a 1x1 frame; pull out its single cell.
    return pd.read_sql(sql, engine).iat[0, 0]

In [3]:
# "Better not to run this)))" — original author's warning; it appears to refer
# to the two feed_data counts left commented out below (TODO confirm).

print("Количество пользователей:            ", format(check_table_size('user_data'), '8'))
print("Количество постов:                   ", format(check_table_size('post_text_df'), '8'))
# print("Количество взаимодействий с постами: ", f"{check_table_size('feed_data'):8}")
# print("Новое количество взаимодействий:     ", f"{check_new_feed_data_size():8}")

Количество пользователей:               163205
Количество постов:                        7023


Теперь необходимо написать функции, которые выгружают из таблиц данные (при этом обращаем внимание, что из таблицы взаимодействий необходимо выгрузить только 5_000_000 записей).

In [5]:
def load_user_data():
    """Load the user feature columns from the ``user_data`` table.

    Returns:
        pandas.DataFrame with the columns user_id, age, gender, city,
        country, exp_group, os and source.
    """
    query = """
    SELECT user_id, age, gender, city, country, exp_group, os, source
    FROM user_data;
    """

    # The context manager closes the connection even when the query raises;
    # a bare conn.close() after read_sql would be skipped on error.
    with engine.connect() as connection:
        streaming = connection.execution_options(stream_results=True)
        return pd.read_sql(query, streaming)

def load_post_data():
    """Load post ids, texts and topics from the ``post_text_df`` table.

    Returns:
        pandas.DataFrame with the columns post_id, text and topic.
    """
    query = """
    SELECT post_id, text, topic
    FROM post_text_df;
    """

    # The context manager closes the connection even when the query raises;
    # a bare conn.close() after read_sql would be skipped on error.
    with engine.connect() as connection:
        streaming = connection.execution_options(stream_results=True)
        return pd.read_sql(query, streaming)

def load_feed_data(limit=5_000_000, chunksize=100_000):
    """Load user-post interactions from ``feed_data`` in chunks.

    Rows with action 'view' and target 1 are excluded. The returned ``target``
    is 1 for 'like' actions and the stored target value otherwise.

    Args:
        limit: maximum number of rows to fetch (default 5,000,000).
        chunksize: number of rows pandas reads per chunk (default 100,000).

    Returns:
        pandas.DataFrame with the columns timestamp, user_id, post_id and
        target, re-indexed from 0.
    """
    # NOTE(review): LIMIT without ORDER BY lets the database choose which rows
    # are returned, so the sample may differ between runs.
    query = f"""
    SELECT timestamp, user_id, post_id,
    CASE 
        WHEN action = 'like' THEN 1
        ELSE target
    END AS target
    FROM feed_data
    WHERE NOT (action = 'view' AND target = 1)
    LIMIT {int(limit)};
    """

    # The context manager closes the connection even when reading fails
    # part-way; all chunks are consumed before the connection is released.
    with engine.connect() as connection:
        streaming = connection.execution_options(stream_results=True)
        chunks = list(pd.read_sql(query, streaming, chunksize=chunksize))

    return pd.concat(chunks, ignore_index=True)

In [6]:
# Pull the three tables into memory. load_feed_data is limited to
# 5,000,000 rows by its query and reads them in chunks.
user_data = load_user_data()
post_data = load_post_data()
feed_data = load_feed_data()

Далее необходимо ознакомиться с данными, узнать, какие колонки являются категориальными, сколько в них значений и как лучше их закодировать в данном случае.

In [7]:
# Preview the first five user rows.
user_data.head(n=5)

Unnamed: 0,user_id,age,gender,city,country,exp_group,os,source
0,200,34,1,Degtyarsk,Russia,3,Android,ads
1,201,37,0,Abakan,Russia,0,Android,ads
2,202,17,1,Smolensk,Russia,4,Android,ads
3,203,18,0,Moscow,Russia,1,iOS,ads
4,204,36,0,Anzhero-Sudzhensk,Russia,3,Android,ads


In [8]:
# Number of distinct ages (NaN counted, as with len(unique())) and the sorted values.
print("Количество уникальных возрастов: ", user_data['age'].nunique(dropna=False))
print("Возраст: ", sorted(user_data['age'].unique()))

Количество уникальных возрастов:  76
Возраст:  [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 92, 95]


Можно подумать над тем, чтобы разбить возраст на возрастные группы, но пока лучше оставить как есть, причём как некатегориальный (числовой) признак.

In [9]:
# Number of distinct gender codes (NaN counted) and the codes themselves.
print("Количество уникальных гендеров: ", user_data['gender'].nunique(dropna=False))
print("Гендеры: ", user_data['gender'].unique())

Количество уникальных гендеров:  2
Гендеры:  [1 0]


In [10]:
# Number of distinct cities (NaN counted) and the full list in first-seen order.
print("Количество уникальных городов: ", user_data['city'].nunique(dropna=False))
print("Города: ", ", ".join(user_data['city'].unique()))

Количество уникальных городов:  3915
Города:  Degtyarsk, Abakan, Smolensk, Moscow, Anzhero-Sudzhensk, Dugulubgey, Kamenka, Groznyy, Zima, Yuzhnouralsk, Mykolaiv, Minsk, Podolsk, Kamianske, Horodenka, Lviv, Kansk, Novosibirsk, Vologda, Ulyanovsk, Chernihiv, Yekaterinburg, Vilyuysk, Saint Petersburg, Baku, Perm, Barnaul, Kursk, Volgograd, Troitsk, Ufa, Rostov, Saransk, Smarhon’, Zlatoust, Ekibastuz, Krasnoyarsk, Nizhniy Tagil, Jyväskylä, Lesosibirsk, Shakhty, Krasnodar, Kashin, Salavat, Tsimlyansk, Yelizovo, Rubtsovsk, Odintsovo, Pyatigorsk, Yelabuga, Gaziantep, Khabarovsk, Ağcabədi, Kislovodsk, Asipovichy, M.Ə. Rəsulzadə, Vladimir, Birobidzhan, Beloyarskiy, Tyumen, Murom, Omsk, Severodvinsk, Yuzhno-Sakhalinsk, Taganrog, Mahilyow, Artëm, Neftekamsk, Syktyvkar, Arzamas, Makhachkala, Vladivostok, Lipetsk, Stavropol, Tobolsk, Novokuznetsk, Velikiy Novgorod, Voronezh, Buguruslan, Novouralsk, Zelënodol’sk, Horokhiv, Berdsk, Murmansk, Dzerzhinskiy, Tomsk, Neman, Pushkino, Yessentuki, Novohrad-

Города нецелесообразно кодировать через OneHotEncoding (3915 уникальных значений); необходимо подумать, стоит ли нам вообще оставлять города. Аналогично, раз есть города, необходимо подумать, нужно ли оставлять страны.

In [11]:
# Number of distinct countries (NaN counted) and their names in first-seen order.
print("Количество уникальных стран: ", user_data['country'].nunique(dropna=False))
print("Страны: ", ", ".join(user_data['country'].unique()))

Количество уникальных стран:  11
Страны:  Russia, Ukraine, Belarus, Azerbaijan, Kazakhstan, Finland, Turkey, Latvia, Cyprus, Switzerland, Estonia


Страны можно закодировать OneHotEncoding.

In [12]:
# Number of distinct experiment groups (NaN counted) and their ids.
print("Количество уникальных групп: ", user_data['exp_group'].nunique(dropna=False))
print("Группы: ", user_data['exp_group'].unique())

Количество уникальных групп:  5
Группы:  [3 0 4 1 2]


In [13]:
# Number of distinct operating systems (NaN counted) and their names.
print("Количество уникальных ОС: ", user_data['os'].nunique(dropna=False))
print("ОС: ", ", ".join(user_data['os'].unique()))

Количество уникальных ОС:  2
ОС:  Android, iOS


In [14]:
# Number of distinct traffic sources (NaN counted) and their names.
print("Количество уникальных источников: ", user_data['source'].nunique(dropna=False))
print("Источники: ", ", ".join(user_data['source'].unique()))

Количество уникальных источников:  2
Источники:  ads, organic


In [14]:
# Preview the first five posts.
post_data.head(n=5)

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [15]:
# Number of distinct post topics (NaN counted) and their names.
print("Количество уникальных рубрик: ", post_data['topic'].nunique(dropna=False))
print("Рубрики: ", ", ".join(post_data['topic'].unique()))

Количество уникальных рубрик:  7
Рубрики:  business, covid, entertainment, sport, politics, tech, movie


In [16]:
# Stop-word list for the TF-IDF vectorizer, read from the 'word' column of the CSV.
stop_words = pd.read_csv('stop_words.csv')['word'].to_list()

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the post texts into at most 100 TF-IDF features over unigrams and
# bigrams, skipping the custom stop words and any term that occurs in more
# than 95% or fewer than 1% of the posts.
vectorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_features=100,
                             ngram_range=(1, 2),
                             max_df=0.95,
                             min_df=0.01
                             )
X_tfidf = vectorizer.fit_transform(post_data['text']).toarray()

# Reuse post_data's index: pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index would misalign rows if post_data were ever filtered or
# re-indexed before this cell.
tfidf_df = pd.DataFrame(X_tfidf,
                        columns=vectorizer.get_feature_names_out(),
                        index=post_data.index)
tmp_post_data = pd.concat([post_data, tfidf_df], axis=1)
tmp_post_data = tmp_post_data.drop(columns=['text'])

tmp_post_data.head()



Unnamed: 0,post_id,topic,000,10,2004,acting,action,actors,added,bad,...,watch,watching,week,win,won,work,world,year,years,young
0,1,business,0.0,0.0,0.5592,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252463,0.263513,0.0
1,2,business,0.48941,0.0,0.0,0.0,0.0,0.0,0.0,0.108691,...,0.0,0.0,0.0,0.0,0.0,0.0,0.412142,0.0,0.094734,0.0
2,3,business,0.405365,0.35583,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.201175,0.0,0.0,0.0
4,5,business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:

# Collect every post-level column name except the first one (post_id in the
# preview above), i.e. 'topic' plus the TF-IDF term columns.
# NOTE(review): this rebinds tmp_post_data from a DataFrame to a list, so
# re-running the cell fails (a list has no .columns) — consider a new name.
tmp_post_data = tmp_post_data.columns[1:].to_list()
# NOTE(review): 'hour' is listed here, but no visible cell creates an 'hour'
# column (the timestamp is dropped further down) — confirm it is derived
# somewhere before this list is used.
# NOTE(review): the TF-IDF columns hold float weights; confirm that treating
# them as categorical is intended.
categorical_features = ['city', 'source', 'os', 'gender', 'country', 'exp_group', 'hour'] + tmp_post_data
categorical_features

['city',
 'source',
 'os',
 'gender',
 'country',
 'exp_group',
 'hour',
 'topic',
 '000',
 '10',
 '2004',
 'acting',
 'action',
 'actors',
 'added',
 'bad',
 'based',
 'bbc',
 'big',
 'british',
 'called',
 'character',
 'characters',
 'company',
 'covid19',
 'day',
 'didnt',
 'director',
 'doesnt',
 'dont',
 'election',
 'england',
 'european',
 'expected',
 'fact',
 'film',
 'films',
 'find',
 'firm',
 'game',
 'games',
 'give',
 'good',
 'government',
 'great',
 'group',
 'hard',
 'high',
 'home',
 'https',
 'im',
 'ive',
 'labour',
 'life',
 'long',
 'lot',
 'love',
 'making',
 'man',
 'market',
 'million',
 'minutes',
 'money',
 'movie',
 'movies',
 'music',
 'news',
 'number',
 'part',
 'party',
 'people',
 'performance',
 'place',
 'play',
 'played',
 'players',
 'plot',
 'point',
 'public',
 'put',
 'real',
 'role',
 'scene',
 'scenes',
 'series',
 'set',
 'show',
 'star',
 'start',
 'story',
 'technology',
 'thought',
 'time',
 'times',
 'told',
 'top',
 'tv',
 'uk',
 'watch'

In [22]:
# Number of columns in post_data (it should still be the raw three).
post_data.shape[1]

3

Рубрики можно закодировать OneHotEncoding. А вот насчёт текста надо подумать, как его закодировать.

In [23]:
# Preview the first five interactions.
feed_data.head(n=5)

Unnamed: 0,timestamp,user_id,post_id,target
0,2021-10-01 11:18:27,165889,7013,0
1,2021-10-01 11:19:59,165889,5018,0
2,2021-10-01 11:22:10,165889,4660,0
3,2021-10-01 11:23:47,165889,560,0
4,2021-10-01 11:24:11,165889,5540,0


In [19]:
# Disabled exploratory check: does every 'view' row with target == 1 have a
# 'like' as the next action for the same (user_id, post_id) pair?
# NOTE(review): the code below reads feed_data['action'], but load_feed_data
# does not select an 'action' column, so it cannot run against the frame
# loaded above without changing that query first.
# filtered_feed_data = feed_data[(feed_data['action'] == 'like') | ((feed_data['action'] == 'view') & (feed_data['target'] == 1))]
# grouped_data = filtered_feed_data.groupby(['user_id', 'post_id']).size().reset_index(name='count')

# sorted_data = filtered_feed_data.sort_values(by=['user_id', 'post_id', 'timestamp'])

# sorted_data['next_action'] = sorted_data.groupby(['user_id', 'post_id'])['action'].shift(-1)
# sorted_data['next_target'] = sorted_data.groupby(['user_id', 'post_id'])['target'].shift(-1)

# view_rows = sorted_data[(sorted_data['action'] == 'view') & (sorted_data['target'] == 1)]

# view_followed_by_like = view_rows[view_rows['next_action'] == 'like']

# all_views_followed_by_like = len(view_rows) == len(view_followed_by_like)

# print("Все действия 'view' с target=1 следуют перед 'like':", all_views_followed_by_like)

In [24]:
# Number of distinct hours of day in the interaction timestamps (NaN counted) and the values.
print("Количество уникальных часов: ", feed_data['timestamp'].dt.hour.nunique(dropna=False))
print("Часы: ", feed_data['timestamp'].dt.hour.unique())

Количество уникальных часов:  18
Часы:  [11  7 12 14 19 18 16 17  6  8 21 22 23 13  9 10 15 20]


In [25]:
# Number of distinct months in the interaction timestamps (NaN counted) and the values.
print("Количество уникальных месяцев: ", feed_data['timestamp'].dt.month.nunique(dropna=False))
print("Месяцы: ", feed_data['timestamp'].dt.month.unique())

Количество уникальных месяцев:  3
Месяцы:  [10 11 12]


In [26]:
# Number of distinct years in the interaction timestamps (NaN counted) and the values.
print("Количество уникальных лет: ", feed_data['timestamp'].dt.year.nunique(dropna=False))
print("Года: ", feed_data['timestamp'].dt.year.unique())

Количество уникальных лет:  1
Года:  [2021]


In [27]:
# Drop the raw timestamp. errors='ignore' keeps the cell re-runnable: without
# it a second execution raises KeyError because the column is already gone.
# NOTE(review): categorical_features lists 'hour', but no hour column is
# derived from the timestamp before it is dropped here — confirm.
feed_data = feed_data.drop(columns=['timestamp'], errors='ignore')

feed_data.head()

Unnamed: 0,user_id,post_id,target
0,165889,7013,0
1,165889,5018,0
2,165889,4660,0
3,165889,560,0
4,165889,5540,0


Получается, что отдельные признаки action и target нам не нужны: их можно заменить одним бинарным признаком «не поставил лайк / поставил лайк».

Теперь необходимо поработать с признаками перед тем, как объединять таблицы, чтобы в итоге получить что-то адекватное.

In [28]:
def merge_data(user_data, post_data, feed_data):
    """Attach user and post columns to every interaction row.

    Both joins are left joins from ``feed_data``: first on ``user_id`` with
    ``user_data``, then on ``post_id`` with ``post_data``. Interactions with
    no matching user or post are kept.
    """
    return (
        feed_data
        .merge(user_data, how='left', on='user_id')
        .merge(post_data, how='left', on='post_id')
    )

In [29]:
# Join users and posts onto the interaction log and preview the result.
merged_data = merge_data(user_data=user_data, post_data=post_data, feed_data=feed_data)
merged_data.head(n=5)

Unnamed: 0,user_id,post_id,target,age,gender,city,country,exp_group,os,source,text,topic
0,165889,7013,0,19,1,Omsk,Russia,4,Android,organic,I am commenting on this miniseries from the pe...,movie
1,165889,5018,0,19,1,Omsk,Russia,4,Android,organic,"I like the film, it´s the best pirate-movie I ...",movie
2,165889,4660,0,19,1,Omsk,Russia,4,Android,organic,Is it full moon tonight? OH! It doesnt matter ...,movie
3,165889,560,0,19,1,Omsk,Russia,4,Android,organic,Director Nairs Vanity project\n\nIndian film d...,entertainment
4,165889,5540,0,19,1,Omsk,Russia,4,Android,organic,IT SHOULD FIRST BE SAID THAT I HAVE READ THE M...,movie


In [30]:
# Drop the join keys. errors='ignore' keeps the cell re-runnable: without it
# a second execution raises KeyError because the columns are already gone.
merged_data = merged_data.drop(columns=['user_id', 'post_id'], errors='ignore')

In [31]:
# Preview the first thirty merged rows.
merged_data.head(n=30)

Unnamed: 0,target,age,gender,city,country,exp_group,os,source,text,topic
0,0,19,1,Omsk,Russia,4,Android,organic,I am commenting on this miniseries from the pe...,movie
1,0,19,1,Omsk,Russia,4,Android,organic,"I like the film, it´s the best pirate-movie I ...",movie
2,0,19,1,Omsk,Russia,4,Android,organic,Is it full moon tonight? OH! It doesnt matter ...,movie
3,0,19,1,Omsk,Russia,4,Android,organic,Director Nairs Vanity project\n\nIndian film d...,entertainment
4,0,19,1,Omsk,Russia,4,Android,organic,IT SHOULD FIRST BE SAID THAT I HAVE READ THE M...,movie
5,0,19,1,Omsk,Russia,4,Android,organic,MLS navigates resuming the season in local mar...,covid
6,0,19,1,Omsk,Russia,4,Android,organic,R Balki tries to tell you a story that had bee...,movie
7,0,19,1,Omsk,Russia,4,Android,organic,this film explores if not creates a whole new ...,movie
8,0,19,1,Omsk,Russia,4,Android,organic,@shane25873 I nominated Dan because his is gre...,covid
9,0,19,1,Omsk,Russia,4,Android,organic,"Mauritius tourism, reeling from #COVID19, now ...",covid
