# Getting tweets from Twitter API

This first part contains a lot of code from the [Twitter API Notebook](https://github.com/ivanovitchm/network_analysis/blob/main/week_08/Twitter.ipynb) of the [Network Analysis Class](https://github.com/ivanovitchm/network_analysis) by [Ivanovitch Silva](https://github.com/ivanovitchm).

## Dependencies

In [None]:
!pip install Twython

Collecting Twython
  Downloading twython-3.9.1-py3-none-any.whl (33 kB)
Installing collected packages: Twython
Successfully installed Twython-3.9.1


In [None]:
from twython import Twython, TwythonError
from pprint import pprint
import itertools
import json
import time

## Twitter API Authentication


In order to authenticate with Twitter, we'll provide the app details and  ask for a one-time authorization URL to authenticate your user with this app.

Copy and paste the API key and secret from your Twitter app into a file named <font color="red">keys.txt</font>. The first line is the API_KEY and the second line of the file is API_SECRET_KEY. For example, a template for the <font color="red">keys.txt</font>: 

```python
df6cf09894907b92f3ea749ef
d19c40cbb184f72055c806f107b5158d023a43eb7d8921a0d0
```

In [None]:
# Load the Twitter app credentials from keys.txt: the first line holds the
# API key and the second line holds the API secret key.
with open("keys.txt", "r") as keys_file:
    # Strip every line and skip blank ones, so a trailing newline at the end
    # of the file (or stray whitespace) does not break the unpacking below.
    key_lines = [line.strip() for line in keys_file if line.strip()]

if len(key_lines) < 2:
    raise ValueError(
        "keys.txt must contain the API key on the first line and "
        "the API secret key on the second line"
    )

API_KEY, API_SECRET_KEY = key_lines[:2]

Executing the cell below should print a clickable URL. This link is unique and works **exactly** once. <font color="red">Visit this URL, log into Twitter, and copy the verifier PIN shown there so you can paste it in the next step</font>.

In [None]:
# Create a client that carries only the app credentials (no user tokens yet)
twitter = Twython(API_KEY, API_SECRET_KEY)

# Request the temporary tokens and print the authorization URL that must be
# opened in a browser to obtain the verifier PIN
authentication_tokens = twitter.get_authentication_tokens()
print(authentication_tokens['auth_url'])

https://api.twitter.com/oauth/authenticate?oauth_token=1VRgxQAAAAABTaK-AAABe9vC6Rg



That verifier PIN goes into the next cell. This will be different every time you run these steps. The `authentication_tokens` include temporary tokens that go with this verifier PIN; by submitting these together, we show Twitter that we are who we say we are.

In [None]:
# Replace this value with the PIN shown in your browser after visiting the
# authorization URL printed by the previous cell
VERIFIER = '8289025'

# Rebuild the client with the temporary tokens that belong to that PIN
twitter = Twython(API_KEY, API_SECRET_KEY,
                  authentication_tokens['oauth_token'],
                  authentication_tokens['oauth_token_secret'])

# Submit the PIN together with the temporary tokens to obtain the authorized tokens
authorized_tokens = twitter.get_authorized_tokens(VERIFIER)

In [None]:
# Final client: app credentials plus the authorized user tokens.
# This is the `twitter` object used by the data-collection cells below.
twitter = Twython(API_KEY, API_SECRET_KEY,
                  authorized_tokens['oauth_token'],
                  authorized_tokens['oauth_token_secret'])

## Get Argentinian tweets

In [None]:
# Collect Spanish-language tweets matching "brasil argentina" and save them
# as raw JSON.

# Refuse to start without a full search budget. An explicit raise is used
# instead of `assert` so the check still runs when Python is started with -O.
rate_limits = twitter.get_application_rate_limit_status()
remaining_requests = rate_limits['resources']['search']['/search/tweets']['remaining']
if remaining_requests < 180:
    raise RuntimeError("To continue, you must have at least 180 requisitions remaining.")

params = {
    'q': 'brasil argentina',
    'lang': 'es',
    'result_type': None,  # filled in for each pass below
    'until': '2021-09-06',
    'count': 100,
}

# One pass per result type: (result_type, maximum tweets to fetch, progress label).
# NOTE(review): presumably the cursor fetches `count` tweets per request, so
# the three passes need at most 171 requests - confirm against Twython.
fetch_plan = [
    ('recent', 14000, 'recents:'),
    ('mixed', 3000, 'recent + mixed:'),
    ('popular', 100, 'recents + mixed + popular:'),
]

search_tweets = []
for result_type, max_tweets, progress_label in fetch_plan:
    params['result_type'] = result_type
    cursor = twitter.cursor(twitter.search, **params)
    # islice stops the cursor after max_tweets results (or earlier if it runs dry)
    search_tweets.extend(itertools.islice(cursor, max_tweets))
    print(progress_label, len(search_tweets))

print('Saving...')
with open('argentinian_rawdata.json', 'w') as fp:
    json.dump(search_tweets, fp, indent=4)
print('Saved!')

recents: 14000
recent + mixed: 17000
recents + mixed + popular: 17030


In [None]:
# Wait 15 minutes (plus one second) so the remaining search requests are
# renewed before the next collection cell runs
time.sleep(901)

## Get Brazilian tweets

In [None]:
# Collect Portuguese-language tweets matching "brasil argentina" and save
# them as raw JSON.

# Refuse to start without a full search budget. An explicit raise is used
# instead of `assert` so the check still runs when Python is started with -O.
rate_limits = twitter.get_application_rate_limit_status()
remaining_requests = rate_limits['resources']['search']['/search/tweets']['remaining']
if remaining_requests < 180:
    raise RuntimeError("To continue, you must have at least 180 requisitions remaining.")

params = {
    'q': 'brasil argentina',
    'lang': 'pt',
    'result_type': None,  # filled in for each pass below
    'until': '2021-09-06',
    'count': 100,
}

# One pass per result type: (result_type, maximum tweets to fetch, progress label).
# NOTE(review): presumably the cursor fetches `count` tweets per request, so
# the three passes need at most 171 requests - confirm against Twython.
fetch_plan = [
    ('recent', 14000, 'recents:'),
    ('mixed', 3000, 'recent + mixed:'),
    ('popular', 100, 'recents + mixed + popular:'),
]

search_tweets = []
for result_type, max_tweets, progress_label in fetch_plan:
    params['result_type'] = result_type
    cursor = twitter.cursor(twitter.search, **params)
    # islice stops the cursor after max_tweets results (or earlier if it runs dry)
    search_tweets.extend(itertools.islice(cursor, max_tweets))
    print(progress_label, len(search_tweets))

print('Saving...')
with open('brazilian_rawdata.json', 'w') as fp:
    json.dump(search_tweets, fp, indent=4)
print('Saved!')

recents: 14000
recent + mixed: 17000
recents + mixed + popular: 17029


# Creating dataset

Here, we're going to load the raw data, turn it into DataFrames, and export them as Excel (`.xlsx`) files.

## Dependencies

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import json

import warnings
warnings.filterwarnings("ignore")

## Import

In [4]:
!wget https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/brazilian_rawdata.json
!wget https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/argentinian_rawdata.json

--2021-09-15 01:48:33--  https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/brazilian_rawdata.json
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/matheusmas132/Retweets-Network-Analysis/main/data/brazilian_rawdata.json [following]
--2021-09-15 01:48:33--  https://raw.githubusercontent.com/matheusmas132/Retweets-Network-Analysis/main/data/brazilian_rawdata.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94189695 (90M) [text/plain]
Saving to: ‘brazilian_rawdata.json.1’


2021-09-15 01:48:34 (191 MB/s) - ‘brazilian_rawdata.json.1’ saved [94189695/94189695]

--2021-09-15 

In [76]:
# Load the Argentinian raw tweet dump into a DataFrame
with open('argentinian_rawdata.json') as json_file:
    argentinian_raw = pd.DataFrame(json.load(json_file))

# Load the Brazilian raw tweet dump the same way
with open('brazilian_rawdata.json') as json_file:
    brazilian_raw = pd.DataFrame(json.load(json_file))

## Filter columns

In [77]:
def get_user_data(df):
    """Flatten the nested ``user`` object of every tweet into plain columns.

    Args:
        df: DataFrame with a ``user`` column holding one dict per row. Each
            dict must provide the keys ``id_str``, ``name``, ``screen_name``,
            ``location``, ``verified``, ``followers_count`` and
            ``friends_count``.

    Returns:
        A new DataFrame in which the ``user`` column is replaced by the
        columns ``user_id_str``, ``user_name``, ``user_screen_name``,
        ``user_location``, ``is_verified_user``, ``user_followers_count`` and
        ``user_friends_count``. The input DataFrame is not modified.
    """
    # One row per tweet and one column per user attribute, indexed like df
    user_info = pd.DataFrame(dict(df['user'])).T

    # Output column name -> key inside the user object
    user_fields = {
        'user_id_str': 'id_str',
        'user_name': 'name',
        'user_screen_name': 'screen_name',
        'user_location': 'location',
        'is_verified_user': 'verified',
        'user_followers_count': 'followers_count',
        'user_friends_count': 'friends_count',
    }

    # drop() returns a new frame, so the new columns are added to our own copy
    # instead of the caller's DataFrame.
    flattened = df.drop(columns=['user'])
    for column, key in user_fields.items():
        flattened[column] = user_info[key]

    return flattened

def get_retweeted_data(df):
    """Flatten the nested ``retweeted_status`` object into ``retweeted_*`` columns.

    Args:
        df: DataFrame with a ``retweeted_status`` column. Rows that are
            retweets hold a dict describing the original tweet (including a
            nested ``user`` dict); other rows hold a non-dict placeholder
            such as NaN.

    Returns:
        A new DataFrame in which ``retweeted_status`` is replaced by the
        ``retweeted_*`` columns. For rows that are not retweets the
        placeholder value is carried over into every new column. User
        attributes keep their native types (bool for ``verified``, int for
        the counts), the same way ``get_user_data`` stores them, so the
        values stay comparable once both sets of columns are merged. The
        input DataFrame is not modified.
    """
    def pick(record, key):
        # Retweets carry a dict; anything else (NaN for plain tweets) is
        # passed through unchanged so the row stays "missing".
        if isinstance(record, dict):
            return record.get(key, float('nan'))
        return record

    def column(records, key):
        # dtype=object keeps the Python values as they are (no int -> float
        # upcast because of the NaN placeholders).
        values = [pick(record, key) for record in records]
        return pd.Series(values, index=records.index, dtype=object)

    statuses = df['retweeted_status']
    users = column(statuses, 'user')

    # (output column, where to read from, key) in the output column order
    fields = [
        ('retweeted_created_at', statuses, 'created_at'),
        ('retweeted_id_str', statuses, 'id_str'),
        ('retweeted_text', statuses, 'text'),
        ('retweeted_truncated', statuses, 'truncated'),
        ('retweeted_user_id_str', users, 'id_str'),
        ('retweeted_user_name', users, 'name'),
        ('retweeted_user_screen_name', users, 'screen_name'),
        ('retweeted_user_location', users, 'location'),
        ('retweeted_is_verified_user', users, 'verified'),
        ('retweeted_user_followers_count', users, 'followers_count'),
        ('retweeted_user_friends_count', users, 'friends_count'),
        ('retweeted_is_quote_status', statuses, 'is_quote_status'),
        ('retweeted_retweet_count', statuses, 'retweet_count'),
        ('retweeted_favorite_count', statuses, 'favorite_count'),
    ]

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    flattened = df.drop(columns=['retweeted_status'])
    for target, records, key in fields:
        flattened[target] = column(records, key)

    return flattened

def filter_columns(df):
    """Reduce a raw tweets DataFrame to the flat columns used downstream.

    Keeps the tweet-level columns listed below, then expands the nested
    ``user`` and ``retweeted_status`` objects through ``get_user_data`` and
    ``get_retweeted_data``.

    Args:
        df: DataFrame of raw tweets. Must contain every column in
            ``columns`` below.

    Returns:
        A new DataFrame with the selected columns and the expanded user and
        retweet columns.
    """
    columns = ['created_at', 'id_str', 'text', 'truncated', 'user',
               'retweeted_status', 'is_quote_status', 'retweet_count',
               'favorite_count']
    # Take an explicit copy: the helpers add columns, and doing that on a
    # column selection of the raw frame triggers SettingWithCopyWarning.
    df = df[columns].copy()

    df = get_user_data(df)
    df = get_retweeted_data(df)

    return df

# Reduce both raw tweet dumps to the flat set of columns produced by filter_columns
argentinian = filter_columns(argentinian_raw)
brazilian = filter_columns(brazilian_raw)

## Remove duplicates and add untracked tweets

In [78]:
# Remember the row counts so the number of removed duplicates can be reported
ar_size = len(argentinian)
br_size = len(brazilian)

# Keep the first occurrence of every tweet id and renumber the rows
argentinian = argentinian.drop_duplicates(subset=['id_str'], ignore_index=True)
brazilian = brazilian.drop_duplicates(subset=['id_str'], ignore_index=True)

ar_dropped = ar_size - len(argentinian)
br_dropped = br_size - len(brazilian)
print(f'Argentine: {ar_dropped} rows was dropped after remove duplicates')
print(f'Brazil: {br_dropped} rows was dropped after remove duplicates')

Argentine: 103 rows was dropped after remove duplicates
Brazil: 103 rows was dropped after remove duplicates


In [79]:
def get_untracked_tweets(df):
    """Append "untracked" tweets to ``df``.

    An untracked tweet is one that appears as the retweeted tweet of some row
    (``retweeted_id_str``) but has no row of its own (``id_str``). Its data is
    rebuilt from the ``retweeted_*`` columns of the rows that retweeted it.

    Args:
        df: DataFrame with an ``id_str`` column and the ``retweeted_*``
            columns listed below.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the rows of ``df``
        followed by one row per untracked tweet. The new rows only fill the
        un-prefixed columns; their ``retweeted_*`` columns are missing values.
        The input DataFrame is not modified.
    """
    prefix = 'retweeted_'
    retweeted_columns = ['retweeted_created_at', 'retweeted_id_str', 'retweeted_text', 'retweeted_truncated',
                         'retweeted_is_quote_status', 'retweeted_retweet_count', 'retweeted_favorite_count',
                         'retweeted_user_id_str', 'retweeted_user_name', 'retweeted_user_screen_name',
                         'retweeted_user_location', 'retweeted_is_verified_user', 'retweeted_user_followers_count',
                         'retweeted_user_friends_count']

    # Rows that retweet something we do not already have as its own row
    is_retweet = df['retweeted_id_str'].notna()
    is_tracked = df['retweeted_id_str'].isin(df['id_str'])

    new_tweets = (
        df.loc[is_retweet & ~is_tracked, retweeted_columns]
        # Strip the leading prefix only, e.g. retweeted_retweet_count -> retweet_count
        .rename(columns={column: column[len(prefix):] for column in retweeted_columns})
        # The same tweet can be retweeted by many rows; keep it once
        .drop_duplicates(subset=['id_str'], ignore_index=True)
    )

    if new_tweets.empty:
        # Nothing to add; still return a new, re-indexed frame
        return df.reset_index(drop=True)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([df, new_tweets], ignore_index=True)

# Row counts before the untracked tweets are appended
ar_size, br_size = len(argentinian), len(brazilian)

argentinian = get_untracked_tweets(argentinian)
brazilian = get_untracked_tweets(brazilian)

# Report how many rows each dataset gained
ar_added = len(argentinian) - ar_size
br_added = len(brazilian) - br_size
print(f'Argentine: {ar_added} untracked tweets added')
print(f'Brazil: {br_added} untracked tweets added')

Argentine: 1490 untracked tweets added
Brazil: 761 untracked tweets added


## Export to Excel

In [47]:
# Write both datasets to Excel workbooks, leaving out the DataFrame index
argentinian.to_excel('argentinian.xlsx', index=False)
brazilian.to_excel('brazilian.xlsx', index=False)

# Creating network

Now, we're going to turn the DataFrames into networks.

## Dependencies

In [48]:
!pip install networkx==2.6.2



In [49]:
import networkx as nx
import pandas as pd
import numpy as np
from pprint import pprint

## Import

In [51]:
!wget https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/argentinian.xlsx
!wget https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/brazilian.xlsx

--2021-09-15 03:02:46--  https://github.com/matheusmas132/Retweets-Network-Analysis/raw/main/data/argentinian.xlsx
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/matheusmas132/Retweets-Network-Analysis/main/data/argentinian.xlsx [following]
--2021-09-15 03:02:46--  https://raw.githubusercontent.com/matheusmas132/Retweets-Network-Analysis/main/data/argentinian.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3162978 (3.0M) [application/octet-stream]
Saving to: ‘argentinian.xlsx.1’


2021-09-15 03:02:46 (165 MB/s) - ‘argentinian.xlsx.1’ saved [3162978/3162978]

--2021-09-15 03:02:47--  https:

In [50]:
# Columns holding tweet/user ids; they are read back as text.
# NOTE(review): presumably this keeps the long numeric ids from being parsed as numbers.
ID_COLUMNS = ('id_str', 'user_id_str', 'retweeted_id_str', 'retweeted_user_id_str')

argentinian = pd.read_excel(
    'argentinian.xlsx',
    converters={column: str for column in ID_COLUMNS},
)
brazilian = pd.read_excel(
    'brazilian.xlsx',
    converters={column: str for column in ID_COLUMNS},
)

## Build the networks

In [66]:
def get_retweets_dataframe(df):
    """Return the rows of ``df`` that are retweets (``retweeted_id_str`` is set)."""
    is_retweet = df['retweeted_id_str'].notna()
    return df.loc[is_retweet]

def add_node_data(D, df, id_column, data_columns):
    """Attach DataFrame columns to the nodes of ``D`` as node attributes.

    Args:
        D: Graph whose node names are values of ``df[id_column]``.
        df: DataFrame that holds the attribute data.
        id_column: Column of ``df`` that identifies a node.
        data_columns: Columns of ``df`` to copy onto the nodes. The list is
            not modified.

    Returns:
        The same graph ``D``, updated in place. When several rows share an
        id, the first one is used. Nodes without a matching row get no
        attributes.
    """
    nodes = list(D.nodes)

    # Build our own column list instead of appending to the caller's list;
    # dict.fromkeys drops repeated names while keeping their order.
    columns = list(dict.fromkeys([*data_columns, id_column]))

    attrs = df[df[id_column].isin(nodes)].drop_duplicates(subset=[id_column], ignore_index=True)
    # {node id: {column: value, ...}}, the shape nx.set_node_attributes expects
    attrs = attrs[columns].set_index(id_column).to_dict('index')

    nx.set_node_attributes(D, attrs)

    return D

def build_tweets_network(df):
    """Build a directed network of tweets linked by retweet relations.

    Every retweet row adds an edge ``retweeted tweet id -> retweeting tweet
    id``. A repeated pair increases the edge ``weight`` instead of adding a
    second edge. Tweet data is then attached to the nodes with
    ``add_node_data``.

    Args:
        df: Tweets DataFrame with ``id_str``, ``retweeted_id_str`` and the
            tweet columns listed in the ``add_node_data`` call below.

    Returns:
        A ``networkx.DiGraph`` with weighted edges and node attributes.
    """
    D = nx.DiGraph()

    retweets = get_retweets_dataframe(df)

    # Only two columns are needed, so walk them directly instead of
    # materialising a Series per row with iterrows().
    # Edge direction: retweeted_id -> retweeter_id
    for retweeted_id, retweeter_id in zip(retweets['retweeted_id_str'], retweets['id_str']):
        if D.has_edge(retweeted_id, retweeter_id):
            D.edges[retweeted_id, retweeter_id]['weight'] += 1
        else:
            D.add_edge(retweeted_id, retweeter_id, weight=1)

    # 'favorite_count' is listed once (it was duplicated before)
    D = add_node_data(D, df, 'id_str', ['text', 'truncated', 'is_quote_status', 'retweet_count', 'favorite_count'])

    return D

def build_users_network(df):
    """Build a directed network of users linked by retweets.

    Every retweet row contributes an edge ``retweeted user -> retweeting
    user`` (screen names). A repeated pair increases the edge ``weight``.
    User data is then attached to the nodes with ``add_node_data``.
    """
    retweets = get_retweets_dataframe(df)
    graph = nx.DiGraph()

    # Edge direction: retweeted_sn -> retweeter_sn
    screen_name_pairs = zip(retweets['retweeted_user_screen_name'], retweets['user_screen_name'])
    for retweeted_sn, retweeter_sn in screen_name_pairs:
        if not graph.has_edge(retweeted_sn, retweeter_sn):
            graph.add_edge(retweeted_sn, retweeter_sn, weight=1)
            continue
        graph.edges[retweeted_sn, retweeter_sn]['weight'] += 1

    user_columns = ['user_id_str', 'user_name', 'user_location', 'is_verified_user',
                    'user_followers_count', 'user_friends_count']
    return add_node_data(graph, df, 'user_screen_name', user_columns)

# Tweet-level networks: nodes are tweets, edges point from a tweet to its retweets
argentinian_tweets_net = build_tweets_network(argentinian)
brazilian_tweets_net = build_tweets_network(brazilian)

# User-level networks: nodes are users, edges point from the retweeted user to the retweeter
argentinian_users_net = build_users_network(argentinian)
brazilian_users_net = build_users_network(brazilian)

## Export networks

In [67]:
# Save the tweet-level networks as GraphML files
nx.write_graphml(argentinian_tweets_net, "argentinian_tweets_net.graphml")
nx.write_graphml(brazilian_tweets_net, "brazilian_tweets_net.graphml")

# Save the user-level networks as GraphML files
nx.write_graphml(argentinian_users_net, "argentinian_users_net.graphml")
nx.write_graphml(brazilian_users_net, "brazilian_users_net.graphml")