# IMPORTING USEFUL LIBRARIES 

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from getpass import getpass
import pandas as pd
import re
import numpy as np

# LAUNCHING THE WEBDRIVER 

In [2]:
# Start a Chrome session. The module-level `driver` created here is the browser
# handle used by login(), search_func() and the scrolling loop further down.
driver = webdriver.Chrome() 
# Maximise the browser window before any page is loaded.
driver.maximize_window()

# LOG-IN TO TWITTER

#### **Taking your log-in details**

In [None]:
# Ask for the Twitter credentials interactively so they are never hard-coded in
# the notebook. Both values become the default arguments of login() below.
username_ = input('Username: ')    # read with input(), so the typed text is visible
password_ = getpass('Password: ')  # read with getpass(), so the typed text is not echoed

### Login function

In [4]:
def login(username=username_, password=password_):
    """Log in to Twitter through the module-level Chrome ``driver``.

    Parameters
    ----------
    username : str
        Account username or e-mail. Defaults to the value typed at the prompt above.
    password : str
        Account password. Defaults to the value typed at the prompt above.

    Returns
    -------
    None
        The function only drives the browser; it has no return value.

    Notes
    -----
    * The defaults are bound when this cell is executed, so re-run this cell
      after re-entering the credentials.
    * A fixed ``sleep`` follows every browser action to give the page time to
      react before the next action is attempted.
    * Elements are located with ``find_element(by, value)`` because the older
      ``find_element_by_*`` helpers are no longer available in current
      Selenium releases.
    """
    by_xpath = "xpath"  # literal value of selenium's By.XPATH locator strategy

    driver.get('https://twitter.com/login')
    sleep(5)

    username_field = driver.find_element(by_xpath, './/input[@name="session[username_or_email]"]')
    username_field.send_keys(username)
    sleep(2)

    # Named *_field so it does not shadow the module-level ``password_`` string.
    password_field = driver.find_element(by_xpath, '//input[@name="session[password]"]')
    password_field.send_keys(password)
    sleep(2)

    # Pressing RETURN inside the password box submits the form.
    password_field.send_keys(Keys.RETURN)
    sleep(5)

login()

#  SEARCHING FOR KEYWORD (#EndSars)

### taking your search keyword and tweet category

In [5]:
# Ask what to search for and which results tab to scrape. Both values become
# the default arguments of search_func() below.
search_ = input('Search word: ')  # keyword or hashtag typed into the Twitter search box
tab_ = input('Top/Latest ? : ')  # results tab to open; it is looked up by its link text, so it must match exactly

Search word: #EndSars
Top/Latest ? : Top


### search function (search_func)

In [6]:
def search_func(search_word=search_, tab=tab_):
    """Run a Twitter search and open the requested results tab.

    Parameters
    ----------
    search_word : str
        Text typed into the "Search Twitter" box. Defaults to the keyword
        entered at the prompt above.
    tab : str
        Link text of the results tab to open (the prompt suggests "Top" or
        "Latest"). Defaults to the value entered at the prompt above.
        Surrounding whitespace is ignored.

    Returns
    -------
    None
        The function only navigates the module-level ``driver``.

    Notes
    -----
    * The defaults are bound when this cell is executed, so re-run this cell
      after changing the prompted values.
    * Elements are located with ``find_element(by, value)`` because the older
      ``find_element_by_*`` helpers are no longer available in current
      Selenium releases.
    """
    sleep(5)
    search_box = driver.find_element("xpath", './/input[@placeholder="Search Twitter"]')
    search_box.send_keys(search_word)
    sleep(2)
    search_box.send_keys(Keys.RETURN)
    sleep(2)

    # The tab is matched on its exact link text, so drop stray spaces that
    # may have been typed at the prompt.
    driver.find_element("link text", tab.strip()).click()
    sleep(3)

search_func()

# GETTING TWEETS DATA

### Card-scraping function (`card_scrap`)

In [7]:
def card_scrap(card):
    """Extract the fields of one tweet card from the Twitter search-results page.

    Parameters
    ----------
    card : WebElement
        One tweet card, as returned by the scrolling loop's element lookup.

    Returns
    -------
    dict or None
        ``None`` when the card has no ``@handle`` span or no ``<time>`` element
        (used here as the test for sponsored tweets / ads). Otherwise a dict
        with the keys ``USERNAME``, ``HANDLE``, ``DATE``, ``TWEET``,
        ``REPLY COUNT``, ``RETWEET COUNT``, ``LIKE COUNT``, ``EMOJIS`` and
        ``EMOJI TITLES``. Counts are returned as the text shown on the card.
        ``EMOJIS`` and ``EMOJI TITLES`` are space-joined strings.

    Notes
    -----
    Emoji are rendered on the page as ``<img>`` tags whose ``src`` ends in
    ``/svg/<hex>[-<hex>...].svg``. Each hex group is one Unicode code point, so
    multi-code-point emoji (for example flags) are rebuilt by concatenating
    all groups. Images whose ``src`` does not follow that pattern are skipped
    together with their title.
    """
    by_xpath = "xpath"  # literal value of selenium's By.XPATH locator strategy

    username = card.find_element(by_xpath, './div[2]/div[1]//span').text

    # A check for omitting sponsored tweets / ads: those lack a handle or a timestamp.
    try:
        handle = card.find_element(by_xpath, './/span[contains(text(), "@")]').text
        date = card.find_element(by_xpath, './/time').get_attribute('datetime')
    except NoSuchElementException:
        return None

    tweet = card.find_element(by_xpath, './div[2]/div[2]/div[1]').text

    reply = card.find_element(by_xpath, './/div[@data-testid="reply"]').text
    retweet = card.find_element(by_xpath, './/div[@data-testid="retweet"]').text
    like = card.find_element(by_xpath, './/div[@data-testid="like"]').text

    emoji_list = []
    emoji_titles = []
    for emoji_tag in card.find_elements(by_xpath, './/img[contains(@src, "emoji")]'):
        emoji_link = emoji_tag.get_attribute('src')
        emoji_title = emoji_tag.get_attribute('title')

        # One or more hyphen-separated hexadecimal code points before ".svg".
        match = re.search(r'/svg/([0-9a-f]+(?:-[0-9a-f]+)*)\.svg', emoji_link or '')
        if match is None:
            continue
        try:
            emoji_char = ''.join(chr(int(code_point, 16))
                                 for code_point in match.group(1).split('-'))
        except (ValueError, OverflowError):
            # Hex value outside the Unicode range: not a real emoji file name.
            continue

        emoji_list.append(emoji_char)
        if emoji_title:
            emoji_titles.append(emoji_title)

    emojis = ' '.join(emoji_list)
    emojiTitles = ' '.join(emoji_titles)

    dict_ = {'USERNAME': username, 'HANDLE': handle, 'DATE': date,
             'TWEET': tweet, "REPLY COUNT": reply,
             'RETWEET COUNT': retweet, 'LIKE COUNT': like,
             'EMOJIS': emojis, 'EMOJI TITLES': emojiTitles}
    return dict_

### page scrolling while loop

In [8]:
# Number of extra scroll attempts made when the page height has not changed,
# before concluding that the bottom of the results has been reached.
MAX_SCROLL_RETRIES = 5

scrolling = True
cards_len = []   # number of cards found on the page at each pass of the loop
lastHeight = driver.execute_script('return document.body.scrollHeight')  # page height before the first scroll
scrolls = 0      # number of passes of the main loop (one scroll each)

tweet_data = []    # scraped tweets, one dict per unique tweet
tweet_ids = set()  # (handle, date, text) keys already stored, to avoid duplicates

try:
    # Scroll loop: scrape what is on screen, scroll down, repeat until the
    # page stops growing.
    while scrolling:
        cards = driver.find_elements('xpath', './/div[@data-testid="tweet"]')
        cards_len.append(len(cards))

        for card in cards:
            tweet = card_scrap(card)
            if tweet:
                # A tuple keeps the three fields distinct (no accidental
                # collisions from string concatenation) and tolerates a
                # missing value in any of them.
                tweet_id = (tweet.get('HANDLE'), tweet.get('DATE'), tweet.get('TWEET'))
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    tweet_data.append(tweet)

        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        scrolls += 1
        sleep(5)  # give the page time to load the next batch of tweets
        newHeight = driver.execute_script('return document.body.scrollHeight')

        # Scroll check: an unchanged height may only mean the page is slow,
        # so retry the scroll a few times before giving up.
        checkScroll = 0
        while newHeight == lastHeight and checkScroll < MAX_SCROLL_RETRIES:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            checkScroll += 1
            sleep(3)
            newHeight = driver.execute_script('return document.body.scrollHeight')

        if newHeight == lastHeight:
            # Height never changed across all retries: the end was reached.
            scrolling = False
        else:
            lastHeight = newHeight
finally:
    # Always end the browser session, even if scraping raised part-way.
    # quit() shuts down the whole WebDriver session, whereas close() only
    # closes the current window.
    driver.quit()

# SAVING TWEET DATA

### using the pandas library's .to_csv() method

In [9]:
# Split the scraped records into one list per output column. Each list has one
# entry per tweet in `tweet_data`, in the same order.
# NOTE: the 'REPLY COUNT' field produced by card_scrap() is not carried forward here.
username = [record.get('USERNAME') for record in tweet_data]
handle = [record.get('HANDLE') for record in tweet_data]
datetime = [record.get('DATE') for record in tweet_data]
text = [record.get('TWEET') for record in tweet_data]
emoji = [record.get('EMOJIS') for record in tweet_data]
emoji_titles = [record.get('EMOJI TITLES') for record in tweet_data]
likes = [record.get('LIKE COUNT') for record in tweet_data]
retweets = [record.get('RETWEET COUNT') for record in tweet_data]

In [10]:
# Assemble the per-column lists into a single DataFrame, one row per tweet.
final_data = pd.DataFrame(dict(
    UserName=username,
    Handle=handle,
    DateTime=datetime,
    Text=text,
    Emoji=emoji,
    EmojiTitles=emoji_titles,
    Likes=likes,
    Retweets=retweets,
))
final_data.head()  # preview the first five tweets

Unnamed: 0,UserName,Handle,DateTime,Text,Emoji,EmojiTitles,Likes,Retweets
0,unruly,@getunruly,2021-05-25T21:14:38.000Z,Dazed did a thing. Remembering them 20 - 10 - ...,🕊,,879,356
1,Chris Okafor,@ChrisOkafor11,2021-05-26T05:44:18.000Z,I remember the deafening noise around this duo...,,,701,761
2,‘Dayo,@oladayoraji_,2021-05-25T06:36:08.000Z,These shots will never not be surreal... let’...,,,536,210
3,LOVE-DENNIS,@dennisaunch,2021-05-26T05:59:20.000Z,"Please retweet as soon as you see this , innoc...",🧢,Billed cap,1.3K,2.5K
4,FS Yusuf,@FS_Yusuf_,2021-05-26T05:34:30.000Z,Emefiele that blocked bank accounts of several...,,,2.6K,1.8K


In [11]:
final_data.tail()  # preview the last five scraped tweets

Unnamed: 0,UserName,Handle,DateTime,Text,Emoji,EmojiTitles,Likes,Retweets
163,Iyabeni Patrick #ThingsDeyHappen,@IyabeniP,2021-05-28T08:35:20.000Z,Shameful #EndSARS #EndPoliceBrutalityinNigeria,💔,Broken heart,,
164,Agba Jalingo,@agbajalingo_,2021-05-28T14:35:43.000Z,VIDEO: Donald Duke has returned to the same PD...,,,4.0,3.0
165,Young pac,@richieedet,2021-05-28T15:42:48.000Z,And he want us to vote him as our President. G...,,,,
166,QueenVLion,@QueenVLion1,2021-05-27T15:55:32.000Z,#OromoProtests #ShutItAllDown #AnglophoneCrisi...,👑 🚨 🗣,Crown Police cars revolving light,1.0,
167,black venus,@purple_pastry,2021-05-27T10:21:12.000Z,WOW!!! MUST READ AND RT TO SAVE A LIFE!! #EndP...,,,,


In [12]:
# (rows, columns) of the collected data: one row per unique tweet, one column per field
final_data.shape

(168, 8)

## saving the dataframe into a csv file.

In [13]:
# Write the scraped tweets to disk. The frame only has the default 0..n-1 row
# index, which carries no information, so it is left out of the file; otherwise
# it is written as an extra unnamed first column.
final_data.to_csv('EndSarsTopTweets.csv', index=False)

#### A few statistics on the scraping run

In [14]:
scrolls  # number of passes of the scroll loop (one scroll is issued per pass)

24

In [15]:
cards_len  # number of tweet cards found on the page at each pass of the scroll loop

[6, 9, 8, 9, 10, 7, 11, 8, 7, 8, 11, 9, 8, 9, 8, 9, 9, 10, 10, 8, 9, 10, 9, 6]

In [16]:
np.mean(cards_len)  # average number of tweet cards found per pass of the scroll loop

8.666666666666666