# Import Statements


In [5]:
import requests
import time
import pandas as pd
import csv
from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer
import preprocessor as p
from geopy.geocoders import Nominatim
from os import listdir
import os
import requests
import json
from tqdm import tqdm

# Data Collection


In [18]:
APP_KEY = "xxxxxxxxxxxxx"
APP_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
SAMPLES_DIR = "./data"
MAX_SAMPLES = 2 # Limit of the top n lines to read in each file sample

In [5]:
trust_website = pd.read_csv("websites.csv")
trust_websites_map: map = {}
sites = list(trust_website["site"])
site_scores = list(trust_website["trust"])
for i in range(0, len(sites)):
    trust_websites_map.setdefault(sites[i], site_scores[i])

In [6]:
def get_tweet_trust_score(external_url: str):
    if external_url:
        return trust_websites_map.get(external_url, 0)
    else:
        return 0

In [7]:
def get_bearer_token():
    url = 'https://api.twitter.com/oauth2/token'
    auth = (APP_KEY, APP_SECRET)
    data = {'grant_type': 'client_credentials'}

    response = requests.post(url, auth=auth, data=data)

    if response.status_code == 200:
        return response.json()["access_token"]

In [8]:
def get_external_url_from_tweet(tweet)->set:
    external_urls_objects = tweet['entities'].get("urls", None)
    if external_urls_objects:
        for url_object in external_urls_objects:
            try:
                url = url_object['display_url'].split('/', 1)[0]
                if "twitter.com" not in url:
                    return url
            except Exception as ex:
                continue
        return None;
    else:
        return None

In [9]:
def get_metrics_from_user(user):
    followers_count = user["public_metrics"].get("followers_count", None)
    tweet_count = user["public_metrics"].get("tweet_count", None)
    
    return (followers_count, tweet_count)

In [10]:
def get_hashtags_texts(hashtags: list):
    hashtags_list = []
    for hashtag in hashtags:
        hashtags_list.append(hashtag.get("tag", None))
        
    return hashtags_list

In [11]:
def get_location_from_user(user):
    
    location = user.get("location", None)
    
    if location:
        geolocator = Nominatim(user_agent="geoapiExercises")
        parsed_location = geolocator.geocode(location, addressdetails=True)
        if parsed_location:
            address = parsed_location.raw["address"]
            country = address.get("country", None)
            state = address.get("state", None)  
            
            return (location, country, state)
    else:
        return (None, None, None)

In [12]:
def get_user(user_id: str):
    url = 'https://api.twitter.com/2/users'
    params = {
        'ids': user_id,
        'user.fields': 'location,public_metrics'
    }
    headers = {
        'Authorization': f'Bearer {get_bearer_token()}'
    }

    response = requests.get(url, params=params, headers=headers)  

    # Check if the request was successful
    if response.status_code == 200 and "errors" not in response.json():
        data = response.json()["data"][0]
        return data
    elif response.status_code == 429:
        print("Falling asleep for 15 minutes...")
        time.sleep(960)
        return get_user(user_id)
    else:
        print(f"Request failed with status code {response.status_code}")
        print(response.text)

In [13]:
def get_tweet(tweet_id: str):
    url = 'https://api.twitter.com/2/tweets'
    params = {
        'ids': tweet_id,
        'tweet.fields': 'referenced_tweets,author_id,created_at,public_metrics,entities'
    }
    headers = {
        'Authorization': f'Bearer {get_bearer_token()}'
    }

    response = requests.get(url, params=params, headers=headers)

    # Check if the request was successful
    if response.status_code == 200 and "errors" not in response.json():
        data = response.json()["data"][0]
        if "referenced_tweets" in data:
            return get_tweet(data["referenced_tweets"][0]["id"])
        else:
            return data
    elif response.status_code == 429:
        print("Falling asleep for 15 minutes...")
        time.sleep(960)
        return get_tweet(tweet_id)
    else:
        print(f"Request failed with status code {response.status_code}")
        print(response.text)


In [22]:
def get_sample_row(data: map):
    try:
        created_at = data["created_at"]
        tweet_id = data["id"]
        user_id =data["author_id"]
        text = data["text"]
        retweets_count = data["public_metrics"]["retweet_count"]
        likes_count = data["public_metrics"]["like_count"]
        replies_count = data["public_metrics"]["reply_count"]
        hashtags = data["entities"].get("hashtags", None)
        hashtags_text = None
        if hashtags:
            hashtags_text = get_hashtags_texts(hashtags)
        url = get_external_url_from_tweet(data)
        tweet_trust_score = get_tweet_trust_score(url)
        user = get_user(user_id)
        location, country, state = get_location_from_user(user)
        followers_count, tweet_count = get_metrics_from_user(user)

        row = {
            "createdAt": [created_at],
            "tweetId": [tweet_id],
            "userId": user_id,
            "user_followers_count": [followers_count],
            "user_tweet_count": [tweet_count],
            "location": [location],
            "country": [country],
            "state": [state],
            "text": [text],
            "retweetsCount": [retweets_count],
            "likesCount": [likes_count],
            "repliesCount": [replies_count],
            "hashtags": [hashtags_text],
            "url": [url],
            "tweetTrustScore": [tweet_trust_score]
        }
        
        return row
    except Exception as ex:
        print(ex)
        return None  

In [15]:
def get_samples_files():
    return listdir(SAMPLES_DIR)

In [20]:
def collect_rows_from_tweetIds(df, tweetIds: list):
    for i, tweet_id in enumerate(lines):
        tweet_id = tweet_id.strip()
        data = get_tweet(tweet_id)
        if data:
            row = get_sample_row(data)
            if row:
                new_df = pd.DataFrame(row)
                df = pd.concat([df, new_df], ignore_index=True)
        if i == MAX_SAMPLES: break
    return df

In [23]:
samples_files = get_samples_files()
df = pd.DataFrame()

for file_name in samples_files:
    sample_file_path = os.path.join(SAMPLES_DIR, file_name)
    print("File "+file_name+" is being processed...")
    with open(sample_file_path) as sample_file:
        lines = sample_file.readlines()
        df = collect_rows_from_tweetIds(df, lines)

File 2023-02_tweet_ids.txt is being processed...
'entities'
File 2023-01_tweet_ids.txt is being processed...


In [24]:
df.to_csv("collected_samples.csv", quoting=csv.QUOTE_ALL)

# Sentiment Analysis


In [2]:
def preprocess_tweet(tweet: str):
    tweet = p.clean(tweet)
    tweet = tweet.replace('\d+', '')
    tweet = tweet.lower()
    
    return tweet

In [14]:
labels = []
model = AutoModelForSequenceClassification.from_pretrained("/media/vangelis/vag/sentiment_amalysis_final")
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

SAMPLES_DIR = "/home/vangelis/Downloads"
data = pd.read_csv(os.path.join(SAMPLES_DIR, "tweets.csv"), index_col=False)
texts = list(data["text"])

In [15]:
for i in tqdm(range(0, len(texts))):
    text = texts[i]
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    text = preprocess_tweet(text)
    result = classifier(text)[0]["label"]
    label = 1 if result=='LABEL_1' else 0
    labels.append(label)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9665/9665 [35:23<00:00,  4.55it/s]


In [16]:
data["sentimentLabel"] = labels

# Subjectivity Score


In [5]:
SAMPLES_DIR = "/home/vangelis/PycharmProjects/veritas-observatory/veritas-api/data"
data = pd.read_csv(os.path.join(SAMPLES_DIR, "verity_dataset.csv"), index_col=False)
data = data.drop("Unnamed: 0", axis=1)

In [18]:
def analyze_subjectivity(text):
    url = "http://snf-38872.ok-kno.grnetcloud.net:5051/subjectivity"
    data = {"text": text}

    try:
        response = requests.post(url, json=data)

        # Check if the request was successful (HTTP status code 200)
        if response.status_code == 200:
            result = response.json()
            return result["result"]
        else:
            print(f"HTTP request failed with status code: {response.status_code}")
            return None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [20]:
subjectivity_labels = []
for i in tqdm(range(0, len(texts))):
    text = texts[i]
    score = analyze_subjectivity(text)
    subjectivity_labels.append(score)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9665/9665 [31:43<00:00,  5.08it/s]


In [21]:
for result in subjectivity_labels:
    if result is None:
        print("error")

In [22]:
data.head()

Unnamed: 0,createdAt,tweetId,userId,user_followers_count,user_tweet_count,location,country,state,text,retweetsCount,likesCount,repliesCount,hashtags,url,tweetTrustScore,sentimentLabel
0,2021-01-03T16:52:05.000Z,1345774948086902787,844486678324658176,601753,11601,"KwaZulu Natal, South Africa",South Africa,KwaZulu-Natal,We have always said the second wave is going t...,265,1022,816,['VaccineStrategy'],,0,0
1,2021-01-02T19:30:27.000Z,1345452412723752962,84035041,207796,40672,,,,So all the bad covid things me &amp; others we...,4129,14979,257,,,0,0
2,2021-01-03T16:46:34.000Z,1345773561269649408,17143007,91401,22769,"Manhattan, NY",United States,New York,Vaccination in New York City is basically only...,4246,21606,765,,,0,0
3,2021-01-03T17:24:41.000Z,1345783152158121984,232268199,2275739,30258,New York,United States,New York,COVID showed that racism is a public health cr...,718,5728,984,,,0,0
4,2021-01-04T00:35:09.000Z,1345891482385997831,1652541,25716233,1016058,Around the world,Canada,British Columbia,U.S. may cut some Moderna vaccine doses in hal...,58,105,39,,reut.rs,0,0


In [23]:
data["subjectivityLabel"] = subjectivity_labels

In [25]:
data.to_csv("final_dataset2.csv", index=False)