In [1]:
import os
import re
from pathlib import Path

import pandas as pd

# Repository root that contains the `data/` directory. Defaults to the original
# checkout location; set the ANTICIPATIO_REPO environment variable to run the
# notebook from another machine without editing this cell.
repo_path = Path(os.environ.get('ANTICIPATIO_REPO', '/home/krajda/anticipatio/'))

In [2]:
# Load the pickled tweet table and keep the text column as a plain list.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
tweets_file = repo_path / 'data/final.pkl'
tweets = pd.read_pickle(tweets_file)
docs = tweets['txt'].tolist()
docs[:10]

[' Zero.',
 ' Any that only needs one world.',
 'The many-worlds interpretation is the most egregious violation of Occam’s razor in history.',
 'The fight over what AIs say and do has just started, and will never end.',
 "A must-read: 's thoughts on a life in probabilistic AI.\n",
 'For better or worse, social media is where society decides what to pay attention to.',
 "If everyone who thinks they're the only anti-woke in their organization got together, they'd be the majority.",
 'What Google is singularly lacking is someone who knows how to play the PR game, like Sam Altman or Demis Hassabis (or Elon Musk or Steve Jobs).',
 ' Wokeism is least popular among Gen Z of all age cohorts, according to surveys.',
 ' Across the board.']

In [3]:
# Read the topic label of every document, order the labels by document id and
# attach them to `tweets` positionally (the list assignment ignores the index).
# NOTE(review): this assumes doc_id order equals the row order of `tweets` - confirm.
topics_file = repo_path / 'data/topics_labels.csv'
doc_topic = pd.read_csv(topics_file)
doc_topic = doc_topic.sort_values('doc_id')
tweets['Topic'] = doc_topic['topic'].tolist()


# EMOTIONS

In [None]:
# Load the emotion table; it is joined onto `tweets` through an `id` key in the next cell.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
emotions = pd.read_pickle(repo_path / 'data/emotions.pkl')

In [None]:
# Give both frames an `id` key copied from their index, then left-join the
# emotion columns onto the tweets.
# reset_index() moves the previous tweet index into a regular column (dropped
# again as 'index' when the per-topic means are computed).
tweets = tweets.reset_index()
tweets = tweets.assign(id=tweets.index)
emotions['id'] = emotions.index  # in place: `emotions` keeps this column afterwards

emotion_lookup = emotions.set_index('id')
tweets = tweets.join(emotion_lookup, on='id')

In [None]:
# Per-topic mean of every numeric column (the joined emotion scores).
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer skips
# text columns such as 'txt' silently and raises TypeError instead.
agg = (
    tweets.groupby('Topic')
    .mean(numeric_only=True)
    .drop(columns=['id', 'index'])  # bookkeeping keys, meaningless as averages
)

In [None]:
# Persist the per-topic means; the Topic index is written as the first CSV column.
agg.to_csv(repo_path / 'data/to_paper/emotions_per_topic.csv')

# MOST ANTICIPATED EXAMPLES 

In [None]:
# For every topic, collect the tweets with the highest 'anticipation' score and
# write them to CSV as (topic, anticipation, text) records.
examples = 20  # maximum number of tweets kept per topic

anticipated_representatives = []

# NOTE(review): topic ids are assumed to be 0..99; other ids are not exported.
for n in range(100):
    top = tweets[tweets['Topic'] == n].nlargest(examples, ['anticipation'])

    # Iterate over the rows actually returned: indexing a fixed range(examples)
    # raised IndexError for any topic with fewer than `examples` tweets.
    for score, text in zip(top['anticipation'].tolist(), top['txt'].tolist()):
        anticipated_representatives.append({'topic': n, 'anticipation': score, 'text': text})


pd.DataFrame(anticipated_representatives).to_csv(repo_path / 'data/to_paper/anticipated_representatives.csv')

In [None]:
# Inspect the collected records (one dict per selected tweet).
anticipated_representatives

In [None]:
# Number of tweets per (Topic, user) pair in column 'x', ordered by topic and
# count, both descending.
# NOTE(review): the constant helper column stays in `tweets`, so it also shows up
# in any later groupby('Topic').mean() over the whole frame.
tweets['x'] = 1
pair_counts = (
    tweets[['Topic', 'user', 'x']]
    .groupby(['Topic', 'user'], as_index=False)['x']
    .count()
)
topic_user = pair_counts.sort_values(by=['Topic', 'x'], ascending=False)

# KEY OPINION LEADERS

In [None]:
# Total number of tweets per user across all topics, most active users first.
users_sum = (
    topic_user.groupby('user')['x']
    .sum()
    .reset_index()
    .sort_values(by='x', ascending=False)
)
users_sum

In [None]:
# Share of a user's tweets that falls into the row's topic:
#   share = tweets by the user in this topic / all tweets by the user.
# One vectorised lookup replaces the row-wise apply, which filtered the whole
# `users_sum` frame once per row.
user_totals = users_sum.set_index('user')['x']
topic_user['share'] = topic_user['x'] / topic_user['user'].map(user_totals)

In [None]:
# Attach every user's overall tweet count to each of their (Topic, user) rows.
# A single vectorised map replaces the row-wise apply, which filtered the whole
# `users_sum` frame once per row.
topic_user['user_sum'] = topic_user['user'].map(users_sum.set_index('user')['x'])

In [None]:
# Keep only users with more than 10 tweets overall, then take each remaining
# user's largest topic share, most concentrated users first.
has_enough_tweets = topic_user['user_sum'] > 10
topic_user2 = topic_user[has_enough_tweets]

largest_share = topic_user2.groupby('user')['share'].max()
tu2 = largest_share.reset_index().sort_values(by='share', ascending=False)

In [None]:
# Users who post more than half of their tweets in a single topic, exported
# together with that topic.
focused_names = tu2.loc[tu2['share'] > 0.5, 'user']
# Build the mask from topic_user2 itself; a mask taken from the larger
# `topic_user` frame has a different index and only works through implicit reindexing.
focused_rows = topic_user2[topic_user2['user'].isin(focused_names)]

# Select, per user, the row that holds the maximal share. groupby('user').max()
# took the maximum of every column independently, so the exported 'Topic' was the
# largest topic id the user ever posted in rather than the topic the share belongs to.
(
    focused_rows.loc[focused_rows.groupby('user')['share'].idxmax()]
    .set_index('user')
    .sort_values(by='share', ascending=False)
    .to_csv(repo_path / 'data/to_paper/focused_users.csv')
)

In [None]:
# Names of the users whose largest topic share exceeds 0.5.
tu2[tu2['share'] > 0.5]['user'].tolist()

In [None]:
# Tweet count per topic, ordered by topic id and flattened to a plain list.
# NOTE(review): later cells index this list with the topic id (topic_nums[97]),
# which is only correct if the topic ids are exactly 0..N-1 - confirm.
tweets_per_topic = tweets['Topic'].value_counts()
topic_nums = tweets_per_topic.sort_index().tolist()
topic_nums

In [None]:
# Most active users of the selected topic(s), with the share of the topic they
# account for and the share of their own tweets the topic represents.
examples = 10  # maximum number of users listed per topic

user_totals = users_sum.set_index('user')['x']  # total tweets per user, looked up by name
key_opinion_leaders = []

for n in [97]:
    x = topic_user[topic_user['Topic'] == n].nlargest(examples, ['x'])[['x', 'user']]

    # `a` stays a cell-level name because a later cell reads a[0].
    a = x['x'].tolist()
    t = x['user'].tolist()

    # zip() stops at the rows actually returned; indexing a fixed range(examples)
    # raised IndexError for topics with fewer than `examples` users.
    for tweets_count, user in zip(a, t):
        key_opinion_leaders.append(
            {
                'topic': n,
                'user': user,
                'tweets_count': tweets_count,
                'topic_share': tweets_count / topic_nums[n],
                'user_share': tweets_count / user_totals[user],
            }
        )

pd.DataFrame(key_opinion_leaders).to_csv(repo_path / 'data/to_paper/key_opinion_leaders.csv')


In [None]:
# Topic distribution of one account, largest share first.
topic_user[topic_user['user'] == '@gp_pulipaka'].sort_values(by='share', ascending=False)

In [None]:
# Entry of the per-topic tweet-count list at position 97.
topic_nums[97]

In [None]:
# Largest per-user tweet count divided by the topic-97 total.
# `a` is left over from the key-opinion-leaders loop, so that cell must run first.
a[0]/topic_nums[97]

# SENTIMENT

In [None]:
# Load the sentiment table from CSV and display it.
sentiment = pd.read_csv(repo_path/'data/sentiments.csv')
sentiment

In [None]:
# Same keyed join as for the emotions, this time attaching the sentiment columns.
# NOTE(review): `tweets` already has an 'index' column from the earlier
# reset_index(), so this call presumably stores the old index as 'level_0'
# (the name dropped in the next cell); re-running this cell would then fail.
tweets = tweets.reset_index()
tweets = tweets.assign(id=tweets.index)
sentiment['id'] = sentiment.index  # in place: `sentiment` keeps this column afterwards

sentiment_lookup = sentiment.set_index('id')
tweets = tweets.join(sentiment_lookup, on='id')
tweets

In [None]:
# Per-topic mean of the numeric columns, minus the bookkeeping keys and the
# emotion columns, which leaves the sentiment score(s).
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer skips
# text columns such as 'sentiment_name' silently and raises TypeError instead.
sentiment_stats = (
    tweets.groupby('Topic')
    .mean(numeric_only=True)
    .drop(columns=['id', 'index', 'level_0'] + emotions.columns.tolist())
)

In [None]:
# Count the tweets of each sentiment class per topic and add the counts to
# `sentiment_stats` as 'Negative' / 'Neutral' / 'Positive' columns.
mapp = tweets.groupby('Topic', as_index=False).value_counts(subset=['sentiment_name'])

# Reshape to one row per topic and one column per class. A class missing from a
# topic becomes 0; the previous per-cell lookup with .values[0] raised IndexError
# in that case and only covered the hard-coded topics 0..99.
sentiment_counts = (
    mapp.pivot(index='Topic', columns='sentiment_name', values='count')
    .reindex(index=sentiment_stats.index, columns=['Negative', 'Neutral', 'Positive'])
    .fillna(0)
)

for label in ['Negative', 'Neutral', 'Positive']:
    # float dtype matches what the cell-by-cell .loc assignment used to produce
    sentiment_stats[label] = sentiment_counts[label].astype(float)
    


In [None]:
# The 'sentiment' column holds per-topic means after the groupby, so name it accordingly.
sentiment_stats = sentiment_stats.rename(columns={'sentiment':'mean_sentiment'})

In [None]:
# Fraction of a topic's tweets that are polarised (positive or negative).
polarised = sentiment_stats['Positive'] + sentiment_stats['Negative']
sentiment_stats['emotional_ratio'] = polarised / (polarised + sentiment_stats['Neutral'])

In [None]:
# Positive share among the polarised (positive + negative) tweets of each topic.
sentiment_stats['pos_neg_ratio'] = sentiment_stats['Positive'] / (
    sentiment_stats['Positive'] + sentiment_stats['Negative']
)

In [None]:
# Negative share among the polarised (positive + negative) tweets of each topic.
sentiment_stats['neg_pos_ratio'] = sentiment_stats['Negative'] / (
    sentiment_stats['Positive'] + sentiment_stats['Negative']
)

In [None]:
# Persist the per-topic sentiment table; the Topic index is written as the first CSV column.
sentiment_stats.to_csv(repo_path/'data/to_paper/sentiment_stats.csv')


In [None]:
# Display the finished per-topic sentiment table.
sentiment_stats

# HASHTAGS

In [7]:
def get_hashtags(text):
    """Return the hashtags found in ``text``, or ``None`` when there are none.

    A hashtag is a ``#`` followed by word characters that include at least one
    ASCII letter, so purely numeric tags such as ``#123`` are ignored. A ``#``
    directly attached to a preceding word character (``abc#def``) is skipped.

    Args:
        text: Tweet text. Non-string values (``None``, or the float NaN pandas
            uses for missing cells) are treated as containing no hashtags.

    Returns:
        The matched hashtags, including the leading ``#``, in order of
        appearance, or ``None`` if nothing matched so that callers can remove
        such rows with ``dropna``.
    """
    # re.findall raises TypeError on NaN/None, which would abort Series.apply.
    if not isinstance(text, str):
        return None

    hashtags = re.findall(r"\B#\w*[a-zA-Z]+\w*", text)
    return hashtags or None
                          
                          

In [8]:
import re

# One row per tweet: its topic and the hashtags extracted from the original
# text (None when the tweet contains none).
hashtags = tweets[['Topic']].copy()
hashtags['hashtags'] = tweets['original_text'].apply(get_hashtags)

In [9]:
# Discard the tweets for which no hashtag was found.
hashtags = hashtags.dropna(subset=['hashtags'])

In [22]:
# One row per (tweet, hashtag): explode() repeats the Topic for every list element.
c = hashtags.explode('hashtags')
c

Unnamed: 0,Topic,hashtags
551,16,#Davos
622,8,#GasStovesForever
1292,77,#DumpTrump
1565,72,#MeToo
2301,34,#MeToo
...,...,...
1458011,29,#gotech
1458012,74,#carrefour
1458012,74,#bringo
1458012,74,#gotech


In [54]:
# Hashtag frequencies within each topic (most frequent first), cut down to the
# first three rows per topic.
hashtag_counts = c.groupby('Topic', as_index=False)['hashtags'].value_counts(sort=True, ascending=False)
z = hashtag_counts.groupby('Topic').head(3)

In [55]:
# Display the top hashtags per topic.
z

Unnamed: 0,Topic,hashtags,count
0,0,#Environment,13574
1,0,#ClimateChange,4152
2,0,#Sustainability,3465
3305,1,#CES2022,47
3306,1,#custserv,40
...,...,...,...
208010,98,#CX,364
208011,98,#Retail,175
209766,99,#CoronaVirus,3499
209767,99,#Quarantine,2870


In [56]:
# Persist the top hashtags per topic.
z.to_csv(repo_path / 'data/to_paper/hashtags.csv')

In [61]:
# Persist the 500 most frequent hashtags over all topics together with their counts.
c['hashtags'].value_counts(sort=True, ascending=False).head(500).to_csv(repo_path / 'data/to_paper/hashtags_all.csv')