# Tweets extractor

Ferramenta para extrair tweets de um dado utilizador (usando o nome de utilizador) utilizando a API oficial do twitter.


É necessario:
1) Chaves de autenticaçao para API.
2) Utilizar biblioteca tweepy.
3) Fazer os pedidos necessario a API.


Para utilizar o Tweepy existem 4 passos a executar:  
1) Importar o package;
2) Definir as credenciais de autenticaçao;
3) Instanciar a API;
4) Crear o objeto API; 


In [149]:
import os
import re  ## To use regular expressions
import tweepy
import json
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

In [100]:
# authenticating twitter api credentials
consumer_key = os.getenv('consumer_key')
consumer_secret = os.getenv('consumer_secret')
access_token = os.getenv('access_token')
access_token_secret = os.getenv('access_token_secret')


In [101]:

# instantiating the api
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# creating API object
api = tweepy.API(auth)

In [172]:
user = api.get_user(screen_name='filipaa_r99')
'''
status_count - The number of Tweets (including retweets) issued by the user
favourites_count - number of posts liked by the user
'''



user

User(_api=<tweepy.api.API object at 0x7fce21a33850>, _json={'id': 1377158664, 'id_str': '1377158664', 'name': 'Filipa', 'screen_name': 'filipaa_r99', 'location': 'Braga, Portugal', 'profile_location': None, 'description': '22y \n🎓UM-LEI 💾 \nig:filipa_cr99', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 256, 'friends_count': 91, 'listed_count': 2, 'created_at': 'Wed Apr 24 14:25:36 +0000 2013', 'favourites_count': 18322, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 26820, 'lang': None, 'status': {'created_at': 'Wed May 18 14:27:54 +0000 2022', 'id': 1526932595530948609, 'id_str': '1526932595530948609', 'text': 'A Catarina até em personagem fala mal do Nuno #bbtvi', 'truncated': False, 'entities': {'hashtags': [{'text': 'bbtvi', 'indices': [46, 52]}], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Andro

In [160]:
def extract_last_n_tweets(username, n = 10):
    '''
    n, numero de tweets a extrair
    user, utilizador que se pretende procurar
    '''
    try:
        new_tweets = tweepy.Cursor(api.user_timeline, screen_name=username, tweet_mode='extended').items(n)
        print(new_tweets)

        return True, new_tweets
    except:
        m = "extract_last_n_tweets] erro duranre a extraçao de tweets de um utilizador " + username
    return False, m



In [164]:
def extract_user_info(username):
    try:
            user = api.get_user(screen_name=username)
            '''
            status_count - The number of Tweets (including retweets) issued by the user
            favourites_count - number of posts liked by the user
            '''
            """ Extrair numeros da descriçao.
            É muito frequente na descriçao os utilziadores colocarem a seua idade"""
            nums = re.findall(r'\d+', user.description.replace('\n', ' ')) 
            n = 0
            for num in nums:
                if int(num )> 16 and int(num )< 80:
                    n = int(num)

            followers_count =user.followers_count
            friends_count = user.friends_count
            statuses_count =  user.statuses_count
            favourites_count =  user.favourites_count
            description =  user.description.replace('\n', ' ')
            protected = user.protected
            age = n
            
            if protected == True:
                return False, 'Esta conta é protegida'


            '''
            Crir o objeto obj para retornar
            '''
            obj = {
            'username': username,
            'followers_count': followers_count,
            'statuses_count': statuses_count,
            'friends_count' : friends_count,
            'favourites_count' : favourites_count, 
            'description': description,
            'age':age,
            }

           

            return True, obj

    except:
        m = "[extract_user_info] erro duranre a extraçao da info de um utilizador " + username
 
    return False, m

In [208]:
def avg (list):
    return (sum(list)//len(list))

In [220]:


def range_date (date):
  list_minutes = []
  initial = date[0]
  for next_date in date[1:]:
    range = initial - next_date 
    minutes = range.total_seconds() // 60
    list_minutes.append(minutes)
    initial = next_date
  return avg(list_minutes)



In [217]:
if not os.path.exists('TweetsByUser'):
    os.makedirs('TweetsByUser')


def main (users):
    for username in users:
        
        '''
        1) Extrair informaçao do utilizador
        '''
        value, userdata = extract_user_info (username)
        
        if value == False:
            print(userdata)
            continue

                
        '''
        Extrair os ultimos 25% (1/4) dos ultimos tweets publicados 
        '''
       # print(userdata['statuses_count']*5 // 100)
        value, userTweets = extract_last_n_tweets (username, userdata['statuses_count']*5 // 100)
        print(value)

        if value == False:
            continue

        list_tweets = []
        list_dates = []

        for tweet in userTweets:

            text = tweet._json["full_text"]
           

            '''
            Garantir que apenas ficam os posts originais, de movo a evitar desvios
            '''
            
            if 'RT ' not in text:
                refined_tweet = {'text' : text,
                    'likes_count' : tweet.favorite_count,
                    'retweet_count' : tweet.retweet_count,
                    'created_at' : tweet.created_at.strftime("%m/%d/%Y, %H:%M:%S")}
                list_dates.append (tweet.created_at)
                list_tweets.append(refined_tweet)

        '''
        Calcular diferença entre os intervalos em minutos;
        '''
        
        #avg=pd.to_datetime(pd.Series(list_dates)).mean()
        
        avg= range_date(list_dates)
        userdata['periodicity'] = avg


        
        userdata['tweets'] = list_tweets
        
        #print(userdata)
        
        with open("TweetsByUser/" + username +  ".json", "w", encoding = 'utf8') as jsonFile:
            json.dump(userdata, jsonFile, indent = 4, ensure_ascii=False)

        
       # df.to_csv('TweetsByUser/'+ username +'.csv')

        


In [221]:
users = ['carlamgoncalve5']
N =  20
main(users)

<tweepy.cursor.ItemIterator object at 0x7fce100cc8b0>
True
56520.0
