## **Vaccine Tweets**

In [52]:
## Libraries ## 
import credentials
import tweepy
import json
import pandas as pd
import numpy as np
import csv

### **Tweets search** 

In [2]:
# Authorization # 
auth = tweepy.OAuthHandler(credentials.API_KEY, credentials.API_SECRET_KEY)
api = tweepy.API(auth)

In [30]:
# Query: 6000 tweets including any word from: vacuna,vacunas, vacunación, vacunar; 
# in Spanish, both popular and real time results. Filter retweets
id= None 
tweets = []
for tweet in tweepy.Cursor(api.search_tweets,
                       q='vacuna OR vacunas OR vacunación OR vacunar' +  " -filter:retweets",
                       lang="es",
                       result_type='mixed',    #result_type:popular,recent,mixed.
                       max_id=id).items(6000): #max_id allows every search to have different entries
    tweets.append(tweet)
    #print(json.dumps(tweet._json, indent=2))   

TooManyRequests: 429 Too Many Requests
88 - Rate limit exceeded

In [150]:
len(tweets)

4244

In [27]:
# Create a dataframe
data = [[tweet.user.id_str, tweet.user.screen_name, tweet.text, tweet.id_str, tweet.created_at, tweet.retweet_count,
         tweet.favorite_count, tweet.user.description,tweet.user.verified, tweet.user.followers_count, 
         tweet.user.friends_count, tweet.entities['hashtags'], tweet.entities['user_mentions'], 
         tweet.entities['urls']] for tweet in tweets]


# Rename the variables
df = pd.DataFrame(data=data, columns=['user_id', 'screen_name', 'text', 'tweet_id', 'date', 'retweet_count',
                                      'favorite_count', 'user_description', 'account_verified', 
                                      'followers', 'friends', 'hashtags', 'mentions', 'links'])

In [7]:
#df.tail(5)

In [28]:
df.to_csv('Tweets_281121_vacuna.csv', index= False)

In [56]:
df = pd.read_csv('Tweets_281121_vacuna.csv')

### **Data cleaning**

In [57]:
# 1) Data exploration
df.head(2) 
df.shape 
df.info() 
df.columns
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           2500 non-null   int64 
 1   screen_name       2500 non-null   object
 2   text              2500 non-null   object
 3   tweet_id          2500 non-null   int64 
 4   date              2500 non-null   object
 5   retweet_count     2500 non-null   int64 
 6   favorite_count    2500 non-null   int64 
 7   user_description  2094 non-null   object
 8   account_verified  2500 non-null   bool  
 9   followers         2500 non-null   int64 
 10  friends           2500 non-null   int64 
 11  hashtags          2500 non-null   object
 12  mentions          2500 non-null   object
 13  links             2500 non-null   object
dtypes: bool(1), int64(6), object(7)
memory usage: 256.5+ KB


Index(['user_id', 'screen_name', 'text', 'tweet_id', 'date', 'retweet_count',
       'favorite_count', 'user_description', 'account_verified', 'followers',
       'friends', 'hashtags', 'mentions', 'links'],
      dtype='object')

In [116]:
null_cols = df.isnull().sum()
null_cols

# There are no null rows, however there are '[]' to fill the empty entries. 

# Replace '[]' for 0
df.loc[(df.hashtags == '[]'), 'hashtags'] = 0
df.loc[(df.mentions == '[]'), 'mentions'] = 0
df.loc[(df.user_description == '[]'), 'user_description'] = 0
df.loc[(df.links == '[]'), 'links'] = 0

In [149]:
# Null entries
null_cols[null_cols > 0]
drop_cols = list(null_cols[null_cols > 100].index)
df = df.drop(drop_cols, axis=1)

In [81]:
### TWEETS

# Text
uniqueTweet = df.text.nunique() #1,570/2,500
print('Unique tweets:', uniqueTweet)

# Duplicate tweets 
repeatedTweet = df['text'].duplicated().sum() 
print('Repeated tweets:', repeatedTweet)

# Tweet_id
uniqueStatus = df.tweet_id.nunique()
print('Unique status:', uniqueStatus)

repeatedStatus = df['tweet_id'].duplicated().sum() 
print('Repeated status:', repeatedStatus) 

# While every tweet has a unique ID, the tweet content(text) can be repeated.
# Whithin this sample, all the tweets are original content

Unique tweets: 2500
Repeated tweets: 0
Unique status: 2500
Repeated status: 0


In [61]:
### USERS

# Unique users
uniqueUsers = df.user_id.nunique() 
duplicateUsers = df['user_id'].duplicated().sum() 
print('Number of duplicate users:', duplicateUsers, 'from:', uniqueUsers)

# Add a new variable to identify unique or duplicate users
conteos = df.user_id.value_counts()
conteos[df.user_id]
df['User_tag'] = np.where(conteos[df.user_id] == 1, 'Unique','Duplicate')

Number of duplicate users: 562 from: 1938


In [103]:
# Identify the users with more followers 
pd.cut(df.followers, 5)
tag = ['Very popular', 'Popular', 'Few followers', 'Unpopular', 'No followers'] 
pd.cut(df.followers, 5, labels = tag) 
pd.cut(df.followers, 5, labels = tag).value_counts()

# Add a new variable to identify the followers range of the users
df['Popularity'] = pd.cut(df.followers, 5, labels=tag)

In [124]:
# Calculate 'engagement' (likes and retweets) 
df['Engagement'] = df[['retweet_count', 'favorite_count']].mean(axis=1)

# Change the 'Engagement' type of variable
df.Engagement.astype('int64')

0       11058
1        2615
2        6294
3           0
4           0
        ...  
2495        0
2496        0
2497        0
2498        8
2499        0
Name: Engagement, Length: 2500, dtype: int64

In [135]:
# Identify the most popular users 
pd.cut(df.Engagement, 4)
tag = ['Very popular', 'Popular', 'Not very popular', 'Unpopular'] 
pd.cut(df.Engagement, 4, labels = tag) 
pd.cut(df.Engagement, 4, labels = tag).value_counts()

Very popular        2498
Not very popular       1
Unpopular              1
Popular                0
Name: Engagement, dtype: int64

In [136]:
# Compare engagement verified accounts and non-verified accounts 


Unnamed: 0,user_id,screen_name,text,tweet_id,date,retweet_count,user_description,account_verified,followers,friends,hashtags,mentions,links,User_tag,Popularity,Description_TF,hashtags_TF,mentions_TF,links_TF,Engagement
0,316273207,doctormacias,"COVID19, Ómricon: No vamos a salir de la pande...",1464530933185495044,2021-11-27 09:45:58+00:00,5905,Medical doctor. Former 2009 Pandemic Influenza...,True,336923,169,0,0,"[{'url': 'https://t.co/iOxLoD79Jc', 'expanded_...",Unique,Very popular,True,False,False,True,11058.5
1,126465001,FrayJosepho,"""Nadie dijo nunca que las vacunas contra la co...",1464589515117539343,2021-11-27 13:38:45+00:00,2031,"Poeta satírico, entre otras cosas.\n\nTengo ca...",True,158866,658,0,0,"[{'url': 'https://t.co/YJp3U7e1Bx', 'expanded_...",Unique,Very popular,True,False,False,True,2615.5
2,1154180084078501889,HLGatell,Ómicron es la más reciente variante de preocup...,1464782793041395713,2021-11-28 02:26:46+00:00,3324,Subsecretario de Prevención y Promoción de la ...,True,1784747,146,0,0,"[{'url': 'https://t.co/thezTERNCY', 'expanded_...",Unique,Popular,True,False,False,True,6294.5
3,1390070059,friedmanntomas,"La presidenta de la UE Ursula von der Leyen, d...",1465052855518302210,2021-11-28 20:19:54+00:00,0,"Periodista, Corresponsal internacional,abuelo ...",False,2859,168,0,0,"[{'url': 'https://t.co/jaelYFLJ9Y', 'expanded_...",Unique,Very popular,True,False,False,True,0.0
4,1260242519025123328,DJAA78443071,@112canarias @ConanGrs @salvamentogob ¡ESTO YA...,1465052855501565952,2021-11-28 20:19:54+00:00,0,Decir la verdad no cuesta nada. Telling the tr...,False,153,210,0,"[{'screen_name': '112canarias', 'name': '1-1-2...","[{'url': 'https://t.co/z78A24OEOM', 'expanded_...",Unique,Very popular,True,False,True,True,0.0
5,132175862,Cuba1959,Cuba envía nuevo cargamento a Venezuela con má...,1465052851940511752,2021-11-28 20:19:53+00:00,0,"Miguel Fernández Martínez, periodista cubano",False,862,540,0,0,"[{'url': 'https://t.co/F4ZR0wEJGL', 'expanded_...",Unique,Very popular,True,False,False,True,0.0
6,548901527,alaniz422,@ZinClaudio @ZinClaudio la mala administración...,1465052819489280005,2021-11-28 20:19:46+00:00,0,Juan salió de la madriguera,False,326,1466,0,"[{'screen_name': 'ZinClaudio', 'name': 'Dr. Cl...","[{'url': 'https://t.co/iLcakSCNQ6', 'expanded_...",Unique,Very popular,True,False,True,True,0.0
7,219261380,lacerme,Volvemos a la casilla de salida.\nDepende de n...,1465052815336869898,2021-11-28 20:19:45+00:00,0,Mis flores favoritas son las nécoras. \nEstudi...,False,5027,876,0,0,0,Unique,Very popular,True,False,False,False,0.0
8,293435191,nl_zorro,Alerta 🚨!! Llaman “Omicron” a los efectos secu...,1465052793996197889,2021-11-28 20:19:40+00:00,0,Anti vax? NO! Anti experiments with humans! 🇬🇧...,False,1424,714,0,0,0,Unique,Very popular,True,False,False,False,0.0
9,1456711622622425091,nucleosocialarg,"No somos ""antivacunas"" \nni ""conspiranoicos"". ...",1465052753118564363,2021-11-28 20:19:30+00:00,0,"Agrupación social, joven, patriota y militante...",False,30,55,0,0,"[{'url': 'https://t.co/nMEV35SudU', 'expanded_...",Unique,Very popular,True,False,False,True,0.0


In [134]:
# Delete favorite_count and retweet_count
df.drop(columns= 'favorite_count', inplace = True)
df.drop(columns= 'retweet_count', inplace = True)



KeyError: "['favorite_count'] not found in axis"

In [100]:
# Popular users + popular tweets
popular = df[(df.Popularity == 'Very popular') & (df.favorite_count >= 100) | (df.retweet_count >= 100)]
# Most popular users get more engagement. 

weird = df[(df.Popularity == 'No followers') & (df.favorite_count >= 100)]
weird

Unnamed: 0,user_id,screen_name,text,tweet_id,date,retweet_count,favorite_count,user_description,account_verified,followers,friends,hashtags,mentions,links,User_tag,Popularity,Description_TF,hashtags_TF,mentions_TF,links_TF
189,7996082,el_pais,🔴 ÚLTIMA HORA | Países Bajos detecta 61 pasaje...,1464528628335751171,2021-11-27 09:36:49+00:00,419,606,La mejor información en español. Con nuestra m...,True,8209200,791,[],[],"[{'url': 'https://t.co/ITCQNyR4hN', 'expanded_...",Duplicate,No followers,True,False,False,True
368,7996082,el_pais,🔴 DIRECTO | Alemania confirma 67.000 nuevos co...,1464526120104169477,2021-11-27 09:26:51+00:00,104,112,La mejor información en español. Con nuestra m...,True,8209200,791,[],[],"[{'url': 'https://t.co/iDd1rUqaae', 'expanded_...",Duplicate,No followers,True,False,False,True


In [144]:
# Change the date
df.date.astype('datetime64[ns]')

0      2021-11-27 09:45:58
1      2021-11-27 13:38:45
2      2021-11-28 02:26:46
3      2021-11-28 20:19:54
4      2021-11-28 20:19:54
               ...        
2495   2021-11-27 06:52:29
2496   2021-11-27 06:52:20
2497   2021-11-27 06:52:19
2498   2021-11-27 06:52:19
2499   2021-11-27 06:52:18
Name: date, Length: 2500, dtype: datetime64[ns]

In [145]:
df.drop_duplicates(subset=['user_id','text','date', 'account_verified', 'hashtags', 'mentions', 'links', 'User_tag', 'Popularity',
       'Description_TF', 'hashtags_TF', 'mentions_TF', 'links_TF', 'Engagement'],keep='first', inplace=True)