In [3]:
import os, sys
import dotenv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dotenv.load_dotenv('../.env')    
PATH_DATA = os.environ.get('PATH_OPENSKY') + os.environ.get('PATH_REL_DATA')
PATH_FEED_POSTS_LIKES = PATH_DATA + 'feed_posts_likes/'
list_files = os.listdir(PATH_FEED_POSTS_LIKES)
list_files

['AcademicSky.csv.gz',
 'GreenSky.csv.gz',
 'Science.csv.gz',
 '#Disability.csv.gz',
 'BookSky.csv.gz',
 'Game Dev.csv.gz',
 'Blacksky.csv.gz',
 'Political Science.csv.gz',
 '#UkrainianView.csv.gz',
 'News.csv.gz',
 "What's History.csv.gz"]

In [9]:
df = pd.read_csv(PATH_FEED_POSTS_LIKES + list_files[0], compression='gzip', header=None)
df.columns = ['liker_id', 'author_id', 'post_id', 'timestamp']
df

Unnamed: 0,liker_id,author_id,post_id,timestamp
0,523729,237383,38345536,202403241714
1,1163003,237383,38345536,202403182321
2,1797469,844345,46032441,202403190524
3,1095062,844345,46032441,202403190142
4,182888,844345,46032441,202403190909
...,...,...,...,...
4339,65931,331662,126854893,202402150420
4340,723382,331662,126854893,202402142241
4341,44787,238526,150562714,202402142115
4342,99560,238526,150562714,202402150329


In [11]:
df_list = []

for file in list_files:
    df = pd.read_csv(PATH_FEED_POSTS_LIKES + file, compression='gzip', header=None)
    df.columns = ['liker_id', 'author_id', 'post_id', 'timestamp']
    df['feed'] = file.split('.')[0]
    df_list.append(df)

df_feed_likes = pd.concat(df_list)
df_feed_likes

Unnamed: 0,liker_id,author_id,post_id,timestamp,feed
0,523729,237383,38345536,202403241714,AcademicSky
1,1163003,237383,38345536,202403182321,AcademicSky
2,1797469,844345,46032441,202403190524,AcademicSky
3,1095062,844345,46032441,202403190142,AcademicSky
4,182888,844345,46032441,202403190909,AcademicSky
...,...,...,...,...,...
2457,77142,61725,55090528,202403130820,What's History
2458,34638,61725,55090528,202403130225,What's History
2459,1644044,61725,55090528,202403130805,What's History
2460,65228,61725,55090528,202403130221,What's History


In [13]:
df_feed_likes.drop_duplicates(['liker_id', 'feed'], inplace=True)
df_feed_likes

Unnamed: 0,liker_id,author_id,post_id,timestamp,feed
0,523729,237383,38345536,202403241714,AcademicSky
1,1163003,237383,38345536,202403182321,AcademicSky
2,1797469,844345,46032441,202403190524,AcademicSky
3,1095062,844345,46032441,202403190142,AcademicSky
4,182888,844345,46032441,202403190909,AcademicSky
...,...,...,...,...,...
2451,236212,424024,32310989,202403131156,What's History
2457,77142,61725,55090528,202403130820,What's History
2458,34638,61725,55090528,202403130225,What's History
2459,1644044,61725,55090528,202403130805,What's History


In [14]:
df = pd.read_csv(os.environ.get('PATH_OPENSKY') + '/results/agg_user_data.csv', index_col=0)
df.reset_index(inplace=True, names=['user_id'])
df_sents = df[['user_id', 'mean_sent_score']]
df_sents

Unnamed: 0,user_id,mean_sent_score
0,288811,
1,527934,0.824409
2,1632691,
3,1909409,
4,2902501,
...,...,...
705678,495380,0.790667
705679,502241,0.800889
705680,1652601,0.770500
705681,1838341,


In [15]:
df_joined = pd.merge(df_feed_likes, df_sents, left_on='liker_id', right_on='user_id', how='left')
df_joined.dropna(inplace=True)
df_joined

Unnamed: 0,liker_id,author_id,post_id,timestamp,feed,user_id,mean_sent_score
1,1163003,237383,38345536,202403182321,AcademicSky,1163003.0,0.729610
4,182888,844345,46032441,202403190909,AcademicSky,182888.0,0.732326
8,3052834,844345,46032441,202403191655,AcademicSky,3052834.0,0.773000
12,31820,35328,215542741,202403220658,AcademicSky,31820.0,0.731241
14,919648,331738,86242767,202403190311,AcademicSky,919648.0,0.791748
...,...,...,...,...,...,...,...
375886,6580,500019,54479964,202403131108,What's History,6580.0,0.735468
375888,583450,6409,176742278,202403131100,What's History,583450.0,0.785945
375893,77109,77120,39918937,202403131221,What's History,77109.0,0.764845
375897,632845,89126,143561566,202403131404,What's History,632845.0,0.783881


In [16]:
df_joined.groupby('feed').agg({'mean_sent_score': ['mean', 'count', 'std']})

Unnamed: 0_level_0,mean_sent_score,mean_sent_score,mean_sent_score
Unnamed: 0_level_1,mean,count,std
feed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
#Disability,0.756686,992,0.036621
#UkrainianView,0.770484,1958,0.053346
AcademicSky,0.763665,750,0.041458
Blacksky,0.756569,34385,0.05337
BookSky,0.771387,580,0.047984
Game Dev,0.764092,808,0.045979
GreenSky,0.749468,1295,0.034847
News,0.755155,26932,0.058213
Political Science,0.752699,569,0.037634
Science,0.762509,29321,0.054677
