# Twitter Scraper

In [1]:
from ntscraper import Nitter
import requests
import pprint
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
import matplotlib.pyplot as plt
import time

## Manual Scraper

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def load_more_tweets(url, max_iterations=10):
    # Create a new instance of the Chrome driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("useAutomationExtension", False)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    driver = webdriver.Chrome(options=chrome_options)
    
    # Open the URL in the browser
    driver.get(url)
    
    # Wait for the "Load More" button to be present
    wait = WebDriverWait(driver, 10)
    load_more_button = wait.until(EC.presence_of_element_located((By.LINK_TEXT, 'Load more')))
    
    html_content = ''
    urls = [url]
    iteration = 1
    while load_more_button and iteration <= max_iterations:
        try:
            print(f"Iteration {iteration}/{max_iterations}: Loading more tweets...")
            # Click the "Load More" button
            load_more_button.click()
            # Wait for new tweets to load
            # wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.loading')))
            time.sleep(2)
            iteration += 1
            if driver.find_elements(By.LINK_TEXT, 'No more content to load'):
                break
            # Check if there's another "Load More" button
            load_more_button = driver.find_elements(By.LINK_TEXT, 'Load more')[0] if driver.find_elements(By.LINK_TEXT, 'Load more') else False
            html_content += driver.page_source
            current_url = driver.current_url
            urls.append(current_url)
        except KeyboardInterrupt:
            print("Process interrupted by the user")
            break
        except:
            print("Some error occured")
            break
    # Get the updated page source after loading all tweets
    # html_content = driver.page_source
    
    # Close the browser
    driver.quit()
    
    return html_content, urls




In [None]:
url = 'https://nitter.poast.org/search?f=tweets&q=femminicidio&cursor=DAADDAABCgABGLsOGotWYA4KAAIYubWkwRYQcgAIAAIAAAACCAADAAAAAAgABAAAAAIKAAUYuzXeFUAnEAoABhi7Nd4VP4rQAAA#'

tic = time.time()
html_content = load_more_tweets(url, 5000)
toc = time.time()
time_taken = toc - tic
print(f"Time taken to load all tweets: {time_taken/60:.2f} minutes")
# Now you can proceed with parsing the HTML content and extracting the tweets

In [3]:
soup = BeautifulSoup(html_content[0], 'html.parser')
tweets = soup.find_all(class_="timeline-item")

In [7]:
users = []
usernames = []
dates = []
texts = []
likes = []
retweets = []
comments = []
quotes = []
urls = html_content[1]

for tweet in tweets:
    user = tweet.find(class_="fullname")
    if user != [] and user != None:
        users.append(user.text)
    username = tweet.find(class_="username")
    if username != [] and username != None:
        usernames.append(username.text)
    date = tweet.find(class_="tweet-date")
    if date != [] and date != None:
        dates.append(date.text)
    text = tweet.find(class_="tweet-content media-body")
    if text != [] and text != None:
        texts.append(text.text)
    stats = tweet.find_all(class_="icon-container")
    if stats != [] and stats != None and len(stats) == 4:
        comments.append(stats[0].text)
        retweets.append(stats[1].text)
        quotes.append(stats[2].text)
        likes.append(stats[3].text)
    elif stats != [] and stats != None and len(stats) != 4:
        comments.append(0)
        retweets.append(0)
        quotes.append(0)
        likes.append(0)


df = pd.DataFrame({'user': users, 'username': usernames, 'date': dates, 'text': texts, 'comments': comments, 'retweets': retweets, 'quotes': quotes, 'likes': likes})
df['comments'].replace('', 0, inplace=True)
df['retweets'].replace('', 0, inplace=True)
df['quotes'].replace('', 0, inplace=True)
df['likes'].replace('', 0, inplace=True)
df['comments'] = df['comments'].astype(int)
df['retweets'] = df['retweets'].astype(int)
df['quotes'] = df['quotes'].astype(int)
df['likes'] = df['likes'].astype(int)

In [8]:
print(f'len = {len(df)}, built in {time_taken/60:.2f} minutes')
df.head(20)

len = 136, built in 1.26 minutes


Unnamed: 0,user,username,date,text,comments,retweets,quotes,likes
0,Sciking,@ScikingFS,Apr 19,"Se solo le donne possono parlare di aborto, so...",0,0,0,0
1,üí≤‚ÑπÔ∏è„ÄΩÔ∏è‚≠ï.üá™üá∫üá∫üá¶ #Il Silenzio √© Sottovalutato‚ù£Ô∏è¬©ü§´,@Simo42953017,Apr 19,#femminicidio #femminicidi,0,0,0,0
2,pubblicanow.it,@pubblicanow,Apr 19,Paesi Baschi: aiuto annuale per gli orfani di ...,0,0,0,0
3,Italy in Austria,@ItalyinAustria,Apr 19,L'Ambasciata d'Italia a Vienna aderisce alla c...,0,0,0,0
4,ùôàùôñùôßùô† ùôã. ùôáùôö ùôÉùôñùô´ùôßùôö,@marklehavre,Apr 19,Bhe speriamo che non sia per qualche femminici...,0,0,0,1
5,FemminicidioItalia.info,@FemminicidioIta,Apr 19,ACCADDE OGGI ‚Äì Il femminicidio di Romina Vento...,0,0,0,0
6,Gianfranco Pezzoni,@GianfrancoPezz3,Apr 19,"Per√≤, questi docenti! Conoscono altri miracoli...",0,0,0,0
7,ùó•ùóîùóóùóúùóîùó°ùóßùóò ùó¶ùó®ùóó,@RadianteSud,Apr 19,TENTATO FEMMINICIDIO A CARBONIA: DONNA SALVA D...,0,0,0,0
8,Enrica Maina,@mayaevelin71,Apr 19,invidious.poast.org/NfEp5l0UMBE?si=ZfOH‚Ä¶ Salvi...,0,0,0,0
9,ore14rai2,@ore14rai2,Apr 18,"Femminicidio di Aosta, sopralluoghi a La Salle...",0,0,0,0


In [166]:
df.to_csv('femminicidio_22_04_1.csv')

In [11]:
with open('last_url.txt', 'a') as f:
    f.write('\n')
    for url in urls:
        f.write(f'{url}\n')

## REPEAT

In [12]:
with open('last_url.txt', 'r') as f:
    for line in f:
        url = line.strip()


'https://nitter.poast.org/search?f=tweets&q=femminicidio&cursor=DAADDAABCgABGLsOGotWYA4KAAIYtZu9nFaQnAAIAAIAAAACCAADAAAAAAgABAAAAAwKAAUYuzXeFUAnEAoABhi7Nd4VPgQwAAA'

In [176]:
tic = time.time()
html_content = load_more_tweets(url, 5000)
toc = time.time()
time_taken = toc - tic
print(f"Time taken to load all tweets: {time_taken/60:.2f} minutes")

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7BB467032+63090]
	(No symbol) [0x00007FF7BB3D2C82]
	(No symbol) [0x00007FF7BB26EC65]
	(No symbol) [0x00007FF7BB2B499D]
	(No symbol) [0x00007FF7BB2B4ADC]
	(No symbol) [0x00007FF7BB2F5B37]
	(No symbol) [0x00007FF7BB2D701F]
	(No symbol) [0x00007FF7BB2F3412]
	(No symbol) [0x00007FF7BB2D6D83]
	(No symbol) [0x00007FF7BB2A83A8]
	(No symbol) [0x00007FF7BB2A9441]
	GetHandleVerifier [0x00007FF7BB8625AD+4238317]
	GetHandleVerifier [0x00007FF7BB89F70D+4488525]
	GetHandleVerifier [0x00007FF7BB8979EF+4456495]
	GetHandleVerifier [0x00007FF7BB540576+953270]
	(No symbol) [0x00007FF7BB3DE54F]
	(No symbol) [0x00007FF7BB3D9224]
	(No symbol) [0x00007FF7BB3D935B]
	(No symbol) [0x00007FF7BB3C9B94]
	BaseThreadInitThunk [0x00007FFED82F7344+20]
	RtlUserThreadStart [0x00007FFED95026B1+33]


In [None]:
soup = BeautifulSoup(html_content[0], 'html.parser')
tweets = soup.find_all(class_="timeline-item")
users = []
usernames = []
dates = []
texts = []
likes = []
retweets = []
comments = []
quotes = []

for tweet in tweets:
    user = tweet.find(class_="fullname")
    if user != [] and user != None:
        users.append(user.text)
    username = tweet.find(class_="username")
    if username != [] and username != None:
        usernames.append(username.text)
    date = tweet.find(class_="tweet-date")
    if date != [] and date != None:
        dates.append(date.text)
    text = tweet.find(class_="tweet-content media-body")
    if text != [] and text != None:
        texts.append(text.text)
    stats = tweet.find_all(class_="icon-container")
    if stats != [] and stats != None and len(stats) == 4:
        comments.append(stats[0].text)
        retweets.append(stats[1].text)
        quotes.append(stats[2].text)
        likes.append(stats[3].text)
    elif stats != [] and stats != None and len(stats) != 4:
        comments.append(0)
        retweets.append(0)
        quotes.append(0)
        likes.append(0)

df = pd.DataFrame({'user': users, 'username': usernames, 'date': dates, 'text': texts, 'comments': comments, 'retweets': retweets, 'quotes': quotes, 'likes': likes})
df['comments'].replace('', 0, inplace=True)
df['retweets'].replace('', 0, inplace=True)
df['quotes'].replace('', 0, inplace=True)
df['likes'].replace('', 0, inplace=True)
df['comments'].replace(',', '', inplace=True)
df['retweets'].replace(',', '', inplace=True)
df['quotes'].replace(',', '', inplace=True)
df['likes'].replace(',', '', inplace=True)
df['comments'] = df['comments'].astype(int)
df['retweets'] = df['retweets'].astype(int)
df['quotes'] = df['quotes'].astype(int)
# df['likes'] = df['likes'].astype(int)
print(f'len = {len(df)}, built in {time_taken/60:.2f} minutes')
df.head(20)

len = 169, built in 1.39 minutes


Unnamed: 0,user,username,date,text,comments,retweets,quotes,likes
0,chia_gio,@chia_gi0,Mar 17,"Ma i figli delle vittime di #femminicidio, orf...",0,1,0,4
1,Valeria,@valeria_frezza,Mar 17,#bastaviolenzacontroledonne\n@forumalcentro\n#...,0,8,0,6
2,clod üçâ,@fohcloud,Mar 17,se mia mamma urlasse alla televisione quando s...,0,0,0,3
3,ciancianella,@ciancianella,Mar 17,DUE femminicidi ieri.\nE poi i trollini incel ...,0,5,0,9
4,Chiara Nata Stanca,@Chiaraandy,Mar 17,Ancora e ancora e ancora\n\nADESSO BASTA!\n\nT...,2,7,0,35
5,Lajoda,@lajodina,Mar 17,"non vorrei contraddirti, ma nei casi di femmin...",0,0,0,3
6,alice,@atterismo,Mar 17,Ennesimo femminicidio. Due vite distrutte non ...,0,35,1,236
7,MonikÓ®Äda Vicenza ‚òÆÔ∏èpacifistaüè≥Ô∏è‚Äçüåà,@BiasiMonica,Mar 17,#femminicidio\n#stragecontinua,0,0,0,4
8,Gerri Liu,@GerriLiu5,Mar 17,#femminicidio üíîüíî,0,0,0,0
9,Alessandro,@russo_sandro74,Mar 17,Ogni maledettissimo giorno üò≠ #femminicidio #no...,0,0,0,0


In [None]:
# modifica nome csv!!!!
df.to_csv('femminicidio_21_04_2.csv')

In [None]:
with open('last_url_21_04_2.txt', 'w') as f:
    f.write(html_content[1])

## WHILE LOOP

In [14]:
df =  pd.read_csv('femminicidio_22_04_2(whole_night).csv')

In [16]:
i = 0
while len(df) <= 100000:
    with open('last_url.txt', 'r') as f:
        for line in f:
            url = line.strip()
    tic = time.time()
    html_content = load_more_tweets(url, 5000)
    toc = time.time()
    time_taken = toc - tic
    print(f"Time taken to load all tweets: {time_taken/60:.2f} minutes")

    soup = BeautifulSoup(html_content[0], 'html.parser')
    tweets = soup.find_all(class_="timeline-item")
    users = []
    usernames = []
    dates = []
    texts = []
    likes = []
    retweets = []
    comments = []
    quotes = []
    urls = html_content[1]

    for tweet in tweets:
        user = tweet.find(class_="fullname")
        if user != [] and user != None:
            users.append(user.text)
        username = tweet.find(class_="username")
        if username != [] and username != None:
            usernames.append(username.text)
        date = tweet.find(class_="tweet-date")
        if date != [] and date != None:
            dates.append(date.text)
        text = tweet.find(class_="tweet-content media-body")
        if text != [] and text != None:
            texts.append(text.text)
        stats = tweet.find_all(class_="icon-container")
        if stats != [] and stats != None and len(stats) == 4:
            comments.append(stats[0].text)
            retweets.append(stats[1].text)
            quotes.append(stats[2].text)
            likes.append(stats[3].text)
        elif stats != [] and stats != None and len(stats) != 4:
            comments.append(0)
            retweets.append(0)
            quotes.append(0)
            likes.append(0)

    new_df = pd.DataFrame({'user': users, 'username': usernames, 'date': dates, 'text': texts, 'comments': comments, 'retweets': retweets, 'quotes': quotes, 'likes': likes})
    new_df['comments'].replace('', 0, inplace=True)
    new_df['retweets'].replace('', 0, inplace=True)
    new_df['quotes'].replace('', 0, inplace=True)
    new_df['likes'].replace('', 0, inplace=True)
    new_df['comments'].replace(',', '', inplace=True)
    new_df['retweets'].replace(',', '', inplace=True)
    new_df['quotes'].replace(',', '', inplace=True)

    df = pd.concat([df, new_df])
    print(f'len = {len(df)}, built in {time_taken/60:.2f} minutes')

    with open('last_url.txt', 'a') as f:
        for url in urls:
            f.write(f'{url}\n')

Iteration 1/5000: Loading more tweets...
Iteration 2/5000: Loading more tweets...
Iteration 3/5000: Loading more tweets...
Iteration 4/5000: Loading more tweets...
Iteration 5/5000: Loading more tweets...
Iteration 6/5000: Loading more tweets...
Iteration 7/5000: Loading more tweets...
Iteration 8/5000: Loading more tweets...
Iteration 9/5000: Loading more tweets...
Iteration 10/5000: Loading more tweets...
Iteration 11/5000: Loading more tweets...
Iteration 12/5000: Loading more tweets...
Iteration 13/5000: Loading more tweets...
Iteration 14/5000: Loading more tweets...
Iteration 15/5000: Loading more tweets...
Iteration 16/5000: Loading more tweets...
Iteration 17/5000: Loading more tweets...
Iteration 18/5000: Loading more tweets...
Iteration 19/5000: Loading more tweets...
Iteration 20/5000: Loading more tweets...
Iteration 21/5000: Loading more tweets...
Iteration 22/5000: Loading more tweets...
Iteration 23/5000: Loading more tweets...
Iteration 24/5000: Loading more tweets...
I

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=123.0.6312.124)
Stacktrace:
	GetHandleVerifier [0x00007FF602607032+63090]
	(No symbol) [0x00007FF602572C82]
	(No symbol) [0x00007FF60240EC65]
	(No symbol) [0x00007FF6023ECA7C]
	(No symbol) [0x00007FF60247D687]
	(No symbol) [0x00007FF602492AC1]
	(No symbol) [0x00007FF602476D83]
	(No symbol) [0x00007FF6024483A8]
	(No symbol) [0x00007FF602449441]
	GetHandleVerifier [0x00007FF602A025AD+4238317]
	GetHandleVerifier [0x00007FF602A3F70D+4488525]
	GetHandleVerifier [0x00007FF602A379EF+4456495]
	GetHandleVerifier [0x00007FF6026E0576+953270]
	(No symbol) [0x00007FF60257E54F]
	(No symbol) [0x00007FF602579224]
	(No symbol) [0x00007FF60257935B]
	(No symbol) [0x00007FF602569B94]
	BaseThreadInitThunk [0x00007FFED82F7344+20]
	RtlUserThreadStart [0x00007FFED95026B1+33]


In [17]:
df

Unnamed: 0.1,Unnamed: 0,user,username,date,text,comments,retweets,quotes,likes
0,0.0,Sciking,@ScikingFS,Apr 19,"Se solo le donne possono parlare di aborto, so...",0,0,0,0
1,1.0,üí≤‚ÑπÔ∏è„ÄΩÔ∏è‚≠ï.üá™üá∫üá∫üá¶ #Il Silenzio √© Sottovalutato‚ù£Ô∏è¬©ü§´,@Simo42953017,Apr 19,#femminicidio #femminicidi,0,0,0,0
2,2.0,pubblicanow.it,@pubblicanow,Apr 19,Paesi Baschi: aiuto annuale per gli orfani di ...,0,0,0,0
3,3.0,Italy in Austria,@ItalyinAustria,Apr 19,L'Ambasciata d'Italia a Vienna aderisce alla c...,0,0,0,0
4,4.0,ùôàùôñùôßùô† ùôã. ùôáùôö ùôÉùôñùô´ùôßùôö,@marklehavre,Apr 19,Bhe speriamo che non sia per qualche femminici...,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1706,,Marco Donadel,@Marco10269049,16 Jun 2023,Italian culture is:\nMuoiono persone in mare? ...,0,0,0,0
1707,,gabriele farina,@solops,16 Jun 2023,"Femminicidio a Incisa Scapaccino, custodia in ...",0,0,0,0
1708,,Silvestro,@melopappo,15 Jun 2023,"D'altronde, ogni volta che c'√® un femminicidio...",0,0,0,0
1709,,The Baseball Furies,@DavideR46325615,15 Jun 2023,Francesco Lollobrigida collega femminicidio e ...,0,0,0,0


In [27]:
df2 = df.drop(columns=['Unnamed: 0']).reset_index(drop=True)

In [28]:
df2

Unnamed: 0,user,username,date,text,comments,retweets,quotes,likes
0,Sciking,@ScikingFS,Apr 19,"Se solo le donne possono parlare di aborto, so...",0,0,0,0
1,üí≤‚ÑπÔ∏è„ÄΩÔ∏è‚≠ï.üá™üá∫üá∫üá¶ #Il Silenzio √© Sottovalutato‚ù£Ô∏è¬©ü§´,@Simo42953017,Apr 19,#femminicidio #femminicidi,0,0,0,0
2,pubblicanow.it,@pubblicanow,Apr 19,Paesi Baschi: aiuto annuale per gli orfani di ...,0,0,0,0
3,Italy in Austria,@ItalyinAustria,Apr 19,L'Ambasciata d'Italia a Vienna aderisce alla c...,0,0,0,0
4,ùôàùôñùôßùô† ùôã. ùôáùôö ùôÉùôñùô´ùôßùôö,@marklehavre,Apr 19,Bhe speriamo che non sia per qualche femminici...,0,0,0,1
...,...,...,...,...,...,...,...,...
40916,Marco Donadel,@Marco10269049,16 Jun 2023,Italian culture is:\nMuoiono persone in mare? ...,0,0,0,0
40917,gabriele farina,@solops,16 Jun 2023,"Femminicidio a Incisa Scapaccino, custodia in ...",0,0,0,0
40918,Silvestro,@melopappo,15 Jun 2023,"D'altronde, ogni volta che c'√® un femminicidio...",0,0,0,0
40919,The Baseball Furies,@DavideR46325615,15 Jun 2023,Francesco Lollobrigida collega femminicidio e ...,0,0,0,0


In [29]:
df2.to_csv('femminicidio_22_04_4(afternoon).csv')