## Data Collection from Twitter

In [25]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import numpy as np
import re
import csv
from collections import Counter
import nltk
from tqdm import tqdm
import os

# Show the full text of each cell (tweets are long) instead of truncating it.
pd.set_option('display.max_colwidth', None)
# Render up to 500 rows before pandas collapses the displayed frame.
pd.set_option('display.max_rows', 500)

In [29]:
from datetime import datetime, timedelta

def date_range(start, end):
    """Return every calendar day from ``start`` to ``end`` as strings.

    Args:
        start: First day, formatted as ``'YYYY-MM-DD'``.
        end: Last day (inclusive), formatted as ``'YYYY-MM-DD'``.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, one per day, in ascending order.
        The list is empty when ``end`` lies before ``start``.

    Raises:
        ValueError: If either argument does not match ``'%Y-%m-%d'``.
    """
    first_day = datetime.strptime(start, '%Y-%m-%d').date()
    last_day = datetime.strptime(end, '%Y-%m-%d').date()
    # +1 so that the end day itself is part of the result.
    day_count = (last_day - first_day).days + 1
    return [
        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(day_count)
    ]

# Every day from 2022-01-01 through 2022-09-01 (inclusive) as 'YYYY-MM-DD' strings;
# consecutive entries are later used as since:/until: bounds of the per-day searches.
data_collection_period = date_range('2022-01-01', '2022-09-01')

In [30]:
def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase, single-space-separated words.

    Processing order (each step works on the output of the previous one):
      1. lowercase the text,
      2. drop @mentions, #hashtags (ASCII word characters only) and URLs,
      3. turn ``( ) ! ? . * - …`` into spaces so neighbouring words stay apart,
      4. spell the HTML entity ``&amp;`` out as the word "and",
      5. turn newlines and tabs into spaces,
      6. delete every character that is not ``a-z``, ``0-9``, in the range
         ``À-ž`` or a space,
      7. collapse runs of whitespace and strip both ends.

    Args:
        tweet: The raw tweet text. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as "no text".

    Returns:
        The cleaned text; ``""`` when there is no text or nothing survives.
    """
    # Missing values arrive as float NaN from pandas; None is handled the same way.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"[()!?]", " ", temp)
    # Raw string: "\-" in a normal string literal is an invalid escape sequence.
    temp = re.sub(r"[.*?\-…]", " ", temp)
    # Padded with spaces so "rock&amp;roll" becomes "rock and roll", not
    # "rockandroll"; surplus spaces are collapsed by the final split/join.
    temp = temp.replace("&amp;", " and ")
    temp = re.sub(r"[\n\t]", " ", temp)
    temp = re.sub(r"[^a-z0-9À-ž ]", "", temp)
    return " ".join(temp.split())
def get_most_recent_tweets(city, amount, distance='20km'):
    """Scrape the newest tweets posted near ``city`` and save them as CSV.

    Args:
        city: Place name used in the ``near:"..."`` search operator; also the
            name of the sub-directory of ``data/`` the CSV is written to.
        amount: Maximum number of tweets to take from the scraper.
        distance: Radius for the ``within:`` search operator.

    Returns:
        A DataFrame with the columns ``date`` and ``rawContent`` (the scraped
        ``content`` field, renamed). It is empty, but still has both columns,
        when the search yields nothing.

    Side effects:
        Creates ``data/{city}/`` if needed and writes
        ``data/{city}/{amount}_most_recent_tweets.csv``.
    """
    # NOTE(review): relies on the scraped items exposing `date` and `content`
    # attributes — confirm against the installed snscrape version.
    query = f'near:"{city}" within:{distance}'
    scraper = sntwitter.TwitterSearchScraper(query)
    df_city = pd.DataFrame(list(itertools.islice(scraper.get_items(), amount)))
    if df_city.empty:
        # No results: selecting the columns below would otherwise raise KeyError.
        df_city = pd.DataFrame(columns=['date', 'content'])
    df_city = df_city[['date', 'content']].rename(columns={"content": "rawContent"})
    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(f"data/{city}", exist_ok=True)
    df_city.to_csv(f"data/{city}/{amount}_most_recent_tweets.csv")
    return df_city


def get_tweets_over_period(city, amount_per_day, data_collection_period, distance='20km'):
    """Scrape up to ``amount_per_day`` tweets near ``city`` for each day of a period.

    One search is issued per pair of consecutive entries in
    ``data_collection_period``: entry ``i`` is passed as ``since:`` and entry
    ``i + 1`` as ``until:``. A period of N dates therefore produces N - 1
    searches.

    Args:
        city: Place name used in the ``near:"..."`` search operator; also the
            name of the sub-directory of ``data/`` the CSV is written to.
        amount_per_day: Maximum number of tweets taken from each search.
        data_collection_period: Sequence of date strings as accepted by the
            ``since:``/``until:`` operators (e.g. the output of ``date_range``).
        distance: Radius for the ``within:`` search operator.

    Returns:
        A DataFrame with the columns ``date`` and ``rawContent`` (the scraped
        ``content`` field, renamed). Each search's rows keep their own 0-based
        index, so index labels repeat across days. The frame is empty when the
        period has fewer than two dates or no search returned anything.

    Side effects:
        Creates ``data/{city}/`` if needed and writes
        ``data/{city}/{amount_per_day * len(data_collection_period)}_tweets_over_period.csv``.
    """
    columns = ['date', 'content']
    daily_frames = []
    for i in tqdm(range(len(data_collection_period) - 1)):
        query = (
            f'near:"{city}" within:{distance} '
            f'since:{data_collection_period[i]} until:{data_collection_period[i+1]} '
        )
        scraper = sntwitter.TwitterSearchScraper(query)
        df_temp = pd.DataFrame(list(itertools.islice(scraper.get_items(), amount_per_day)))
        if df_temp.empty:
            # A day without results must not abort a long-running scrape:
            # selecting the columns of an empty frame would raise KeyError.
            continue
        daily_frames.append(df_temp[columns])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copies the accumulated frame on every iteration.
    if daily_frames:
        df_city = pd.concat(daily_frames)
    else:
        df_city = pd.DataFrame(columns=columns)
    df_city = df_city.rename(columns={"content": "rawContent"})

    os.makedirs(f"data/{city}", exist_ok=True)
    # The number in the file name is amount_per_day times the number of DATES
    # (not searches, and not the actual row count). It is kept as is because
    # existing paths such as 'data/New Delhi/24400_tweets_over_period.csv'
    # depend on it.
    df_city.to_csv(f"data/{city}/{amount_per_day*len(data_collection_period)}_tweets_over_period.csv")
    return df_city

def clean_tweets_from_file(file_name):
    """Add cleaned text and a running id to a scraped-tweets CSV, in place.

    The file is read with its first column as the index, a ``cleaned_content``
    column is computed from ``rawContent`` with ``clean_tweet``, an ``id``
    column numbering the rows 1..N is added (or overwritten) and moved to the
    front, and the result is written back to the same path.

    Args:
        file_name: Path of a CSV that has a ``rawContent`` column and whose
            first column is the saved index.

    Returns:
        The augmented DataFrame. ``id`` is a regular column; the index is the
        one read from the file.

    Side effects:
        Overwrites ``file_name``.
    """
    df_city = pd.read_csv(file_name, index_col=[0])
    # Empty CSV cells come back as NaN; clean_tweet maps those to "".
    df_city['cleaned_content'] = df_city['rawContent'].map(clean_tweet)
    df_city['id'] = range(1, len(df_city) + 1)
    ordered_columns = ['id'] + [col for col in df_city.columns if col != 'id']
    df_city = df_city[ordered_columns]
    # The earlier `df_city.set_index('id')` call discarded its result and so
    # never changed the frame; it is removed to keep the written file and the
    # returned frame exactly as before.
    df_city.to_csv(file_name)
    return df_city


In [34]:
# Adds 'cleaned_content' and 'id' to the New Delhi CSV and rewrites it in place.
# The file is produced by the get_tweets_over_period("New Delhi", 100, ...) cell
# (100 tweets x 244 dates = 24400 in the name), so that cell must run first.
temp_df = clean_tweets_from_file('data/New Delhi/24400_tweets_over_period.csv')

In [17]:
# temp_df.head(150)
# clean_tweet("I <3 u")

In [33]:
%%time
# Alternative one-off scrape: get_most_recent_tweets("Singapore", 3000, distance='20km')
# Scrape up to 100 tweets per day around New Delhi over data_collection_period;
# this issues one search per day and writes the CSV under data/New Delhi/.
get_tweets_over_period("New Delhi", 100, data_collection_period)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242/242 [19:11<00:00,  4.76s/it]

CPU times: user 21.3 s, sys: 1.35 s, total: 22.7 s
Wall time: 19min 16s





Unnamed: 0,date,rawContent
0,2022-01-01 23:53:54+00:00,https://t.co/6G6D6MPByk
1,2022-01-01 23:52:38+00:00,Prabh chit aae puran sab kaaj \nHar bisrat sab ka mohtaaj \nWaheguru ji ka khalsa \nWaheguru ji ki fateh 🙏🙏🙏 https://t.co/QEbyupndfh
2,2022-01-01 23:52:23+00:00,"@IndiGo6E I am getting confused , kindly clarify regarding state guidelines. I am traveling from Delhi to mumbai. I am fully vaccinated . Do I still need to produce negative RTPCR ?"
3,2022-01-01 23:51:34+00:00,6. Please share as much as you can. https://t.co/KgVmIU7hho
4,2022-01-01 23:51:30+00:00,5. https://t.co/iEok1dMhfZ
...,...,...
95,2022-08-31 22:20:49+00:00,"@artibra_nft @_Valentina_8_ @artalicjaanton @LucMertens1 @moodgooru @NFTbagari @MetaGirlArt @sansosea The Power of Gaze\n0.04 $ETH\nhttps://t.co/tHeA60ZhR3\nThe consequences of the chaos has been fatal in many cases and also makes the life unpredictable. Our will power and mental strength becomes very important , the way we perceive the world https://t.co/L3f4pJLxcP"
96,2022-08-31 22:19:24+00:00,@ColinsDoodles @00point1 @OhHungryArtist Beautiful work colin ❤️❤️
97,2022-08-31 22:18:52+00:00,@OhHungryArtist @00point1 Thank you so much daria ❤️
98,2022-08-31 22:18:34+00:00,@JeffRolandMagic @00point1 @OhHungryArtist Absolutely beautiful work Jeff ❤️❤️


In [26]:
%%time
# Scrape up to 3000 of the newest tweets within 20km of New York and save them
# to data/New York/3000_most_recent_tweets.csv.
get_most_recent_tweets("New York", 3000, distance='20km')

CPU times: user 1.75 s, sys: 85 ms, total: 1.83 s
Wall time: 2min 4s


Unnamed: 0,date,rawContent,cleaned_content
0,2022-10-04 12:46:15+00:00,@ManotoNews خواهراش میگه که میخوان مث مادرش باشن\n#مهسا_امینی \n#اعتصابات_سراسری \n#OpIran,
1,2022-10-04 12:46:14+00:00,🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡 https://t.co/U8HadmmAO8,
2,2022-10-04 12:46:14+00:00,Simply Be Well -Six Plant Based Body Soaps - Boxed Set NEW https://t.co/AmoTUCY5vY #eBay via @eBay,simply be well six plant based body soaps boxed set new via
3,2022-10-04 12:46:14+00:00,"$IDEX - Ideanomics, Solectrac secured new business contracts to supply electric tractors to major fleet operators https://t.co/3sOiHMyi9S",idex ideanomics solectrac secured new business contracts to supply electric tractors to major fleet operators
4,2022-10-04 12:46:14+00:00,@DrewOnline @reggaeology What petcentage of motorbikes on the road are legally licensed?,what petcentage of motorbikes on the road are legally licensed
...,...,...,...
2995,2022-10-04 12:40:08+00:00,@jeongsteph You’ve been manifesting this haha congrats @jeongsteph !!!,youve been manifesting this haha congrats
2996,2022-10-04 12:40:08+00:00,"@Gokul_Sahni Therein lies the trap of aligning with the West—your prosperity, dependent on Western technology and technology, is subject to adherence to their vision of the world. That vision, or “rules”, codifies Western dominance. Challenge that and you’re finished.",therein lies the trap of aligning with the westyour prosperity dependent on western technology and technology is subject to adherence to their vision of the world that vision or rules codifies western dominance challenge that and youre finished
2997,2022-10-04 12:40:08+00:00,@appare_noka 外で気を付けてね,
2998,2022-10-04 12:40:07+00:00,Putin’s war comes home to Russia | @MID_RF ☠️ #KILLPutinNOW ⚰️ &amp; all his #Oligarchs 🤑 + Kremlin Worms 🪱 &amp; DEAD Men Walking Yevgeny Prigozhin #YevgenyPrigozhin &amp; Ramzan Kadyrov #RamzanKadyrov https://t.co/TKtN39OJMU,putins war comes home to russia amp all his kremlin worms amp dead men walking yevgeny prigozhin amp ramzan kadyrov


In [4]:
# Spot-check the cleaner on text where full stops join words without a space.
clean_tweet("Pallelai bedokwalk no49.misslohahhuat and misskitty u masturbates for victims.this")

'pallelai bedokwalk no49misslohahhuat and misskitty u masturbates for victimsthis'