# Mining Twitter with Python

In [None]:
# Libraries

import pandas as pd
import tweepy as tweepy # https://github.com/tweepy/tweepy
import json

In [None]:
# Twitter developer credentials - they are read from a text file so that the keys and tokens
# do not have to be typed (or hardcoded in the notebook) every time the code runs.
# Expected file layout, one value per line, in this order:
# consumer key, consumer secret, access token, access token secret.

with open('twitter_keys_tokens.txt', 'r') as tfile:
    # strip() also removes stray spaces/tabs around a value, which strip('\n') would leave in the credential
    credentials = [tfile.readline().strip() for _ in range(4)]

# readline() returns '' past the end of the file, so a short or partly blank file would
# otherwise be passed on silently as empty credentials.
if not all(credentials):
    raise ValueError(
        "twitter_keys_tokens.txt must contain four non-empty lines: "
        "consumer key, consumer secret, access token, access token secret"
    )

consumer_key, consumer_secret, access_token, access_token_secret = credentials

In [None]:
# Authentication - keys and tokens

# Create the OAuth handler from the consumer key/secret loaded from the credentials file
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Attach the access token/secret to the same handler
auth.set_access_token(access_token, access_token_secret)
# API client built on the authenticated handler; the search cell calls api.search() on it
api = tweepy.API(auth)

In [None]:
# Search query - every query excludes retweets.  http://docs.tweepy.org/en/latest/api.html#
# NOTE(review): Twitter's exact-phrase operator uses double quotes; confirm that the
# single-quoted phrases below match as intended.

list_twitter_query = [
    "#BolsonaroNaOnu -filter:retweets",
    "'Parabéns Presidente' -filter:retweets",
    "'Assembleia Geral da ONU' -filter:retweets",
    "'MIL DÓLARES' -filter:retweets",
    "Cristofobia -filter:retweets",
    "#BolsonaroMentiroso -filter:retweets",
    "Bozo -filter:retweets",
    "#BolsonaroOrgulhaOBrasil -filter:retweets",
]
list_result_type = ['mixed', 'recent', 'popular']

# Maximum number of tweets requested per search call (passed as `count`)
number_tweets = '100'

# Fields that are not present on every tweet; they are dropped from each record before the dataframe is built
fields_to_drop = ('extended_entities', 'possibly_sensitive', 'retweeted_status')

# One dictionary per tweet. Collecting the records in a list and building the dataframe once
# keeps every tweet (the dataframe used to be re-created for each tweet of the first query,
# so only its last tweet survived), works when a query returns nothing, and avoids one
# pd.concat per tweet.
tweet_records = []

for result_type in list_result_type:
    for twitter_query in list_twitter_query:

        # "results" holds the matching tweets and their metadata for this query / result type
        results = api.search(twitter_query, lang='pt', since='2020-09-22', until='2020-09-24',
                             result_type=result_type, count=number_tweets, tweet_mode='extended')

        for status in results:
            # Shallow copy, so that dropping keys does not modify the status object's own payload
            record = dict(status._json)
            for field in fields_to_drop:
                record.pop(field, None)
            tweet_records.append(record)

# Rows are tweets and columns are metadata fields; dtype=object keeps each value as returned
# by the API (no numeric coercion of ids when a field is missing for some tweets).
df_transposed = pd.DataFrame(tweet_records, dtype=object)

# Field-by-tweet view, kept under its original name for any later cell that still uses it
df = df_transposed.transpose()

df_transposed.to_excel("complete_output.xlsx")

# [Appendix] Overall comments

The results are returned as a Tweepy "SearchResults" object.
However, this format does not support the operations available on arrays, strings, dataframes, dictionaries, JSON, or series.
Thus, the following steps make sure that the final file provides a useful dataset with the text and metadata.

1. Each row (status) contains an embedded JSON.
Therefore, these rows will firstly be extracted to a JSON object, in order to perform further processes.

2. JSON to dictionary.

3. There is an issue concerning the Tweepy library: the search function does not return the same set of fields for all of the instances (tweets).
Therefore, the fields that are not always present (`extended_entities`, `possibly_sensitive`, `retweeted_status`) are removed, so that it is possible to combine these tweets into a dataframe.

4. The tweets are gathered into a single dataframe, to make it easier to save them as a spreadsheet (.xlsx).

5. It is necessary to transpose the dataframe, so that the instances become the rows and the metadata become the columns.
This change makes the dataframe more friendly to preprocessing/ML algorithms.

6. An .xlsx file keeps the original format of the dataframe.
By contrast, a .csv file would compromise the content of the tweets.