# Collect Tweets - Fake sources
This notebook collects recent tweets from users who follow more than one of the fake-news source accounts listed below.

### Reference for recommended manipulations for text analysis:
https://towardsdatascience.com/twitter-sentiment-analysis-using-fasttext-9ccd04465597

In [1]:
import json
import re

try:
    import tomllib  # standard library on Python 3.11+
except ModuleNotFoundError:
    import tomli as tomllib  # backport exposing the same `load` API

import emoji
import pandas as pd
import tweepy
from tqdm import tqdm

In [None]:
# Register tqdm's pandas integration so `progress_apply` is available below.
tqdm.pandas()

# When True, `process_tweet` additionally strips basic punctuation marks.
remove_punctuations = False

# Screen names of the source accounts whose followers are collected.
sources_account_list = [
    "usa_supreme",
    "rightsidenews",
    "JudicialWatch",
    "100PercentFedUp",
    "ActivistPost",
    "AllenBWest",
]

## Initialize twitter API

In [None]:
# Read the Twitter API credentials from ../config.toml so that no secret is
# hard-coded in the notebook. The TOML parser requires binary mode ("rb").
# The original cell called `tomli.load` without importing `tomli`, which raises
# NameError on a fresh kernel; `tomllib` is provided by the import cell.
with open("../config.toml", "rb") as f:
    config = tomllib.load(f)

twitter_credentials = config["twitter_api"]

# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(twitter_credentials["consumer_key"],
                           twitter_credentials["consumer_secret"])

# set access to user's access key and access secret
auth.set_access_token(twitter_credentials["access_token"],
                      twitter_credentials["access_token_secret"])

# calling the api
api = tweepy.API(auth)


## Collect list of twitter IDs of users that follow the fake news sources

In [20]:
# Fetch the follower IDs of every source account and stack them into one long
# table with one row per (follower, source account) pair.
# NOTE(review): a single `get_follower_ids` call is made per account, so only
# the IDs returned by that one call are collected -- confirm whether paging
# through the complete follower list is required.
follower_frames = []
for user in sources_account_list:
    follower_ids = api.get_follower_ids(screen_name=user)
    follower_frames.append(pd.DataFrame({
        "user_id": follower_ids,
        "because_it_follows": [user] * len(follower_ids),
    }))

# Concatenate once instead of calling `DataFrame.append` inside the loop:
# `append` was removed in pandas 2.0 and copied the whole frame on every call.
# The per-account row indices are kept, as they were with `append`.
if follower_frames:
    user_to_collect = pd.concat(follower_frames)
else:
    # No source accounts configured: keep the expected columns so the
    # group-by below still works on an empty table.
    user_to_collect = pd.DataFrame(columns=["user_id", "because_it_follows"])

# After the count, `because_it_follows` holds the number of source accounts
# each user follows; keep only users that follow more than one of them.
user_to_collect_2 = (
    user_to_collect
    .groupby(by='user_id')
    .count()
    .reset_index()
    .sort_values(by='because_it_follows', ascending=False)
)
user_to_collect_2 = user_to_collect_2[user_to_collect_2['because_it_follows'] > 1]
        

## For each user from the previous list, collect up to 200 of their most recent tweets (retweets excluded)

In [24]:
# Download the recent timeline of every selected user (retweets excluded) and
# build one row per tweet.
#
# Rows are accumulated in a plain list and converted to a DataFrame once:
# `DataFrame.append` was removed in pandas 2.0 and re-copied the whole frame
# for every single tweet.
#
# The original `for attempt in range(10)` loop never retried -- both the
# success path and the `except` path ended in `break` -- so exactly one request
# per user was ever made. That behaviour is kept, but spelled out explicitly,
# and failures are recorded instead of being silently discarded.
tweet_records = []
failed_users = {}  # user_id -> exception raised while fetching the timeline
for user in tqdm(user_to_collect_2.user_id.unique()):
    try:
        tweets = api.user_timeline(user_id=user, count=200, tweet_mode="extended", include_rts=False)
    except tweepy.TweepyException as e:
        failed_users[user] = e
        continue
    tweet_records.extend(
        {'user_id': user, 'text': instance.full_text, 'time': instance.created_at}
        for instance in tweets
    )

# Explicit columns keep the frame well-formed even when no tweet was collected.
tweets_df = pd.DataFrame(tweet_records, columns=['user_id', 'text', 'time'])

if failed_users:
    print(f"Skipped {len(failed_users)} user(s) whose timeline could not be fetched")

100%|██████████| 40/40 [00:30<00:00,  1.31it/s]


## Process tweet's text for future ML parsing

In [26]:
def process_tweet(tweet):
    """Normalise the raw text of a tweet for later text analysis.

    Steps, in order:
      1. replace @mentions and #hashtags with a space,
      2. replace URLs (anything shaped like ``scheme://...``) with a space,
      3. if the module-level flag ``remove_punctuations`` is true, replace the
         characters ``. , ! ? : ; - =`` with a space,
      4. collapse runs of whitespace and trim both ends,
      5. lower-case the text,
      6. replace emoji characters with their textual names via ``emoji.demojize``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Remove mentions and hashtags. `\w` is used instead of `[A-Za-z0-9]` so
    # that underscores (and non-ASCII hashtags) are removed with the tag;
    # previously "@some_user" left "_user" behind in the text.
    tweet = re.sub(r"@\w+|#\w+", " ", tweet)

    # Remove URLs. Raw strings avoid the invalid-escape warnings that "\w",
    # "\/" and "\S" trigger inside ordinary string literals.
    tweet = re.sub(r"\w+://\S+", " ", tweet)

    # remove punctuations - if needed
    if remove_punctuations:
        tweet = re.sub(r"[.,!?:;\-=]", " ", tweet)

    # Every substitution above inserts spaces, so normalise whitespace once at
    # the end; none of the patterns can match across whitespace, which makes
    # this equivalent to normalising after each step.
    tweet = ' '.join(tweet.split())

    # to lower
    tweet = tweet.lower()

    # translate emojis
    tweet = emoji.demojize(tweet)

    return tweet
    
# Store the cleaned text next to the raw text; tqdm shows a progress bar while it runs.
tweets_df['processed_text'] = tweets_df['text'].progress_apply(process_tweet)

100%|██████████| 2752/2752 [00:01<00:00, 1993.54it/s]


In [31]:
# Keep only the columns that are exported. `.copy()` makes this an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
tweets_df = tweets_df[['user_id', 'time', 'text', 'processed_text']].copy()

# Excel cannot store timezone-aware datetimes, so express every timestamp in
# UTC and then drop the timezone information. Timestamps that are already
# naive are treated as UTC and therefore keep their value.
timestamps_utc = pd.to_datetime(tweets_df['time'], utc=True).dt.tz_localize(None)
tweets_df['time'] = timestamps_utc
tweets_df['date'] = timestamps_utc

# The `encoding` keyword was removed from `to_excel` in pandas 2.0 (it was
# unused for .xlsx output), so it is no longer passed.
tweets_df.to_excel("sample_tweets_2.xlsx")
user_to_collect.to_excel("users_list_2.xlsx")

## Additional Parsing (experimental stuff)

In [9]:
from langdetect import detect
from tqdm import tqdm
tqdm.pandas()

# The first column of the saved sheet is the DataFrame index written by
# `to_excel`. Restoring it as the index prevents it from coming back as an
# extra "Unnamed: 0" column that would be written out again on the next save.
tweets_df = pd.read_excel('sample_tweets_2.xlsx', index_col=0)

In [18]:
# Drop rows whose processed text is missing. `.copy()` detaches the result
# from the unfiltered frame so that the columns added in later cells do not
# trigger SettingWithCopyWarning.
tweets_df = tweets_df[tweets_df.processed_text.notna()].copy()

'four interlocking mega-corporations comprise the corporate media who have vilified alternative opinions as "conspiracy theories" and have divided the universe into the simplistic split and projected "reality" of juvenal\'s "bread and circuses."'

In [21]:
def lang_detect(txt):
    """Return the language code that `langdetect.detect` reports for ``txt``.

    Detection is best-effort: if `detect` raises for any reason, an empty
    string is returned so that a bulk ``apply`` keeps going.

    Parameters
    ----------
    txt : str
        Text to classify.

    Returns
    -------
    str
        The value returned by `detect`, or ``""`` when detection fails.
    """
    try:
        return detect(txt)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop a long-running apply.
        return ""
        


# Detect the language of every processed tweet (an empty string marks a failed detection).
detected_languages = tweets_df['processed_text'].progress_apply(lang_detect)
tweets_df['language'] = detected_languages

100%|██████████| 2409/2409 [00:11<00:00, 204.61it/s]


In [30]:
from translate import Translator

def translate_arabic(row):
    """Translate the processed text of a row from Arabic to English.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'language'`` and ``'processed_text'``
        (for example a DataFrame row passed by ``apply(..., axis=1)``).

    Returns
    -------
    str
        The translation when ``row['language'] == 'ar'``. An empty string is
        returned for every other language and whenever the lookup or the
        translation raises.
    """
    try:
        if row['language'] != 'ar':
            return ""
        translator = Translator(from_lang="arabic", to_lang="english")
        return translator.translate(row['processed_text'])
    except Exception:
        # Best-effort: a failed translation must not abort the whole apply.
        # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working.
        return ""

# Translate Arabic tweets row by row; every other row receives an empty string.
arabic_translations = tweets_df.progress_apply(translate_arabic, axis=1)
tweets_df["ar_to_en"] = arabic_translations

100%|██████████| 2409/2409 [00:18<00:00, 129.00it/s]


In [32]:
# Overwrite the export with the added `language` and `ar_to_en` columns.
# The `encoding` keyword was removed from `to_excel` in pandas 2.0 (it was
# unused for .xlsx output), so it is no longer passed.
tweets_df.to_excel("sample_tweets_2.xlsx")