## Import libraries

In [2]:
from pytwitter import Api
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import math
from pytwitter.models import User;
from pytwitter.models import Tweet;


In [150]:
api = Api(bearer_token="AAAAAAAAAAAAAAAAAAAAAMhqlAEAAAAA4Pqzn354Z5nlkP5lKaW98vzlVlA%3D7GIA03xacVKdFYTFg7qmgvWTZThpa2FFd4SNPUqP7uPK7Xjue5")

public_tweets = api.search_tweets(query="lady gaga lang:pt has:hashtags -is:retweet has:media", expansions=["referenced_tweets.id.author_id","in_reply_to_user_id","attachments.media_keys","author_id","entities.mentions.username"], 
                                  user_fields=["created_at","entities","id","location","name","pinned_tweet_id","profile_image_url","protected","public_metrics","url","username","verified"],
                                  tweet_fields=["attachments","author_id","context_annotations","created_at","entities","geo","in_reply_to_user_id","lang","public_metrics","reply_settings","source"], max_results=100, query_type='recent')

# Analysis

## User's influence and reputation Analysis

In [122]:
def get_sentiment_score(text):
    "float: The sentiment score between -1.0 (negative) and 1.0 (positive)"
    blob = TextBlob(text)
    return blob.sentiment.polarity

### Influence Calculus

In [118]:
def calculate_influence(user: User):
    follower_count = user.public_metrics.followers_count

    # Get user's tweet count and average engagement rate
    tweets = api.get_timelines(user.id, max_results=50, tweet_fields=["attachments","author_id","context_annotations","created_at","entities","geo","in_reply_to_user_id","lang","public_metrics","reply_settings","source"])
    tweet_count = len(tweets.data)
    total_engagement = 0
    
    for tweet in tweets.data:
        total_engagement += tweet.public_metrics.like_count + tweet.public_metrics.retweet_count + tweet.public_metrics.quote_count + tweet.public_metrics.reply_count
        
    if tweet_count > 0:
        avg_engagement_rate = total_engagement / (tweet_count * follower_count) if total_engagement > 0 and tweet_count > 0 and follower_count > 0 else 0
    else:
        avg_engagement_rate = 0

    # Calculate influence score
    influence_score = math.log(follower_count + 1, 10) * (avg_engagement_rate + 1)
    
    return influence_score

### Reputation Calculus

In [119]:
def calculate_reputation(user: User):
    # Get user's recent mentions and replies
    mentions = api.search_tweets(query=f"@{user.username}", max_results=50)
    replies = api.search_tweets(query=f"to:{user.username}", max_results=50)

    # Calculate reputation score based on sentiment analysis of mentions and replies
    positive_sentiments = 0
    negative_sentiments = 0
    
    for mention in mentions.data:
        if mention.author_id != user.id:
            sentiment = get_sentiment_score(mention.text)
            if sentiment > 0:
                positive_sentiments += 1
            elif sentiment < 0:
                negative_sentiments += 1
                
    for reply in replies.data:
        if reply.author_id != user.id:
            sentiment = get_sentiment_score(reply.text)
            if sentiment > 0:
                positive_sentiments += 1
            elif sentiment < 0:
                negative_sentiments += 1
                
    if (positive_sentiments + negative_sentiments) > 0:
        reputation_score = positive_sentiments / (positive_sentiments + negative_sentiments)
    else:
        reputation_score = 0

    # Return influence and reputation scores
    return reputation_score + user.public_metrics.listed_count

## Network Analysis

## Text Analysis

### Text Cleaning

In [None]:
import re
import string
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw')
nltk.download('averaged_perceptron_tagger')

stemmer = SnowballStemmer('portuguese')
stop_words = set(stopwords.words('portuguese'))

def remove_urls(text):
    return re.sub(r"http\S+", "", text)

def remove_mentions(text):
    return re.sub(r"@\S+", "", text)

def remove_hashtags(text):
    return re.sub(r"#\S+", "", text)

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def tokenize(text):
    return word_tokenize(text, language='portuguese')

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('portuguese'))
    return [token for token in tokens if not token in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token, wordnet.VERB)
        if lemma == token:
            lemma = lemmatizer.lemmatize(token, wordnet.NOUN)
        if lemma == token:
            lemma = lemmatizer.lemmatize(token, wordnet.ADJ)
        if lemma == token:
            lemma = lemmatizer.lemmatize(token, wordnet.ADV)
        lemmas.append(lemma)
    return lemmas

def synonymize(tokens):
    synonyms = []
    for token in tokens:
        synsets = wordnet.synsets(token, lang='por')
        if synsets:
            synset = synsets[0]
            for lemma in synset.lemmas(lang='por'):
                synonym = lemma.name().lower()
                if synonym not in synonyms and synonym != token:
                    synonyms.append(synonym)
    return synonyms

def polysemmize(tokens):
    for word in tokens:
       pos = nltk.pos_tag(tokens)[0][1][0].lower()
       if pos not in ['n', 'v']:
           continue
        # Use simple_lesk to disambiguate the sense of the word
       synset = nltk.wsd.lesk(tokens, word, pos=pos)
    
       if synset is not None:
           # Replace the token with the lemma of the most likely sense
           word = synset.lemmas()[0].name().lower()
            
    return tokens

def preprocess_tweet(text):
    text = text.lower()
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_punctuation(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    tokens = polysemmize(tokens)
    tokens.extend(synonymize(tokens))
    return set(tokens)

### Calculate Reputation and Influence for all users

In [153]:
usersByUsername = {}
usersById = {}
for user in public_tweets.includes.users:
    result = [calculate_influence(user), calculate_reputation(user)]
    usersByUsername[user.username] = result
    usersById[user.id] = result

#### Calculates the social capital from a tweet

In [159]:
import datetime
import emoji

def calculate_social_capital(tweet: Tweet):
    # Extract relevant information from the tweet
    tokens = preprocess_tweet(tweet.text)
    urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', tweet.text)
    emojis = emoji.emoji_count(tweet.text)
    likes = tweet.public_metrics.like_count
    retweets = tweet.public_metrics.retweet_count
    replies = tweet.public_metrics.reply_count
    quotes = tweet.public_metrics.quote_count
    created_at = tweet.created_at
    hashtags = len(re.findall(r'#(\w+)', tweet.text))
    
    num_medias = 0
    if tweet.attachments is not None and tweet.attachments.media_keys is not None:
        for attachment in tweet.attachments.media_keys:
            num_medias += 1

    # Calculate the length of the tweet in characters
    length = len(tweet.text)

    # Calculate the sentiment score of the tweet
    sentiment_score = get_sentiment_score(tweet.text)

    # Calculate the diversity score of the tweet
    diversity_score = calculate_diversity_score(tokens)

    # Calculate the number of resources in the tweet
    num_resources = len(urls) + emojis + num_medias

    # Calculate the recency score of the tweet
    recency_score = calculate_recency_score(created_at)

    # Calculate the social capital score of the tweet
    engagement = (likes + replies + quotes)
    social_capital_score = (retweets if retweets > 0 else 1) * ((engagement + num_resources + diversity_score + len(tokens) + hashtags + length + math.log(sum(usersById[tweet.author_id]),10)) * recency_score)

    return {'tweet': tweet, 'score': social_capital_score } 

def get_sentiment_score(text):
    # Use a sentiment analysis library or model to calculate the sentiment score of the tweet text
    # For example, using TextBlob library
    from textblob import TextBlob

    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity

    return sentiment_score

def calculate_diversity_score(words):
    # Use a measure of lexical diversity to calculate the diversity score of the tweet text
    # For example, using the type-token ratio (TTR) metric
    num_words = len(words)
    num_unique_words = len(set(words))
    diversity_score = num_unique_words / num_words

    return diversity_score

def calculate_recency_score(created_at):
    # Calculate the recency score of the tweet based on its age
    # For example, using a linear decay function with a half-life of one day
    tweet_date = datetime.datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%S.%fZ')
    now = datetime.datetime.now()
    age_in_seconds = (now - tweet_date).total_seconds()
    half_life_in_seconds = 86400 # One day in seconds
    recency_score = 0.5 ** (age_in_seconds / half_life_in_seconds)

    return recency_score


#### V3

In [78]:
import re
import datetime

def get_tweet_social_capital(tweet: Tweet):
    # Get tweet text
    text = tweet.text
    
    # Get number of likes and retweets
    likes = tweet.public_metrics.like_count
    retweets = tweet.public_metrics.retweet_count
    replies = tweet.public_metrics.reply_count
    quotes = tweet.public_metrics.quote_count
    hashtags = len(re.findall(r'#(\w+)', text))
    words = len(preprocess_tweet(text))
    
    words = words*5 if hashtags > words else words
    
    # Get tweet creation time
    created_at = tweet.created_at
    created_at = datetime.datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%S.%fZ")
    now = datetime.datetime.utcnow()
    age = (now - created_at).total_seconds() / 3600 # tweet age in hours
    
    # Calculate recency score
    recency_score = math.exp(-0.1 * age)
    
    # Get URLs in tweet
    urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text)
    num_urls = len(urls)
    
    # Get number of emojis in tweet
    emojis = re.findall(r'[^\w\s,]', text)
    num_emojis = len(emojis)
    
    # Get number of photos and videos in tweet
    num_medias = 0
    
    if tweet.attachments is not None and tweet.attachments.media_keys is not None:
        for attachment in tweet.attachments.media_keys:
            num_medias += 1
            
    # mention_score = 0
    # if tweet.entities.mentions is not None:
    #     for mention in tweet.entities.mentions:
    #         mention_score += sum(usersByUsername[mention.username])
    
    # Calculate media score
    media_score = 0.2 * num_medias
    
    # Calculate URL score
    url_score = 0.2 * num_urls
    
    # Calculate emoji score
    emoji_score = 0.5 * num_emojis
    
    # Calculate engagement score
    engagement_score = likes + retweets + replies + quotes
    
    # Calculate social capital
    social_capital = recency_score * (media_score + url_score + emoji_score + engagement_score + hashtags + words + sum(usersById[tweet.author_id]))
    
    return {'tweet': tweet, 'score': social_capital }

# Ranking tweets list from social capital score

In [160]:
ranking = {}
for tweet in public_tweets.data:          
    ranking[tweet.id] = calculate_social_capital(tweet)
    
ranked = dict(sorted(ranking.items(), key=lambda item: item[1]['score'], reverse=True))

for tweetId in ranked: 
    print(ranking[tweetId]['score'], ranking[tweetId]['tweet'].id + "    ----  " + ranking[tweetId]['tweet'].text)

58922.211762396226 1628530071416832002    ----  A COR DA ROMÃ (Dir. Sergei Parajanov, 1969) x 911 (videoclipe de Lady Gaga, dir. Tarsem Singh, 2020). #ParalelosCinematográficos https://t.co/7jlesZ89Ce
10189.754463598178 1627465200076038144    ----  Lady Gaga é chamada de gênia da música durante homenagem à Rainha Elizabeth II no #BAFTAs

https://t.co/sLwqocWWrN
5560.101773731016 1627398502429999115    ----  Uma imagem da Lady Gaga foi utilizada durante o tributo para a Elizabeth II no BAFTA Awards 2023.

#EEBAFTAs https://t.co/61kDczLGQQ
5165.5507650047575 1628170227555921926    ----  E foi bem aqui que Lady Gaga mostrou que a arte podia ser POP! #artpop
https://t.co/znx6mm20W7
5056.620466130852 1626670656061333504    ----  Infelizmente, Bruno Gaga apertou o botão e desistiu do programa. Desejamos todo do sucesso do mundo para ele aqui fora! Brilha, @brunornogueira! Já dizia Lady Gaga: “Nasce uma estrela!” ⭐️🤍 #BBB23 https://t.co/DPpd0ElX7n
465.454425077217 1626606681663717376    ---- 

# Results

## Influence vs Interactions

## Reputation vs Interactions

## Histogram Interactions

## Evaluation Metrics