## Data Collection from Twitter

In [1]:
import csv
import itertools
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from tqdm import tqdm

# Display settings: never truncate the text inside a cell, and render up to
# 500 rows before pandas abbreviates a frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [2]:
from datetime import datetime, timedelta

def date_range(start, end):
    """Return every calendar day from ``start`` to ``end`` as ISO date strings.

    Both bounds are ``'YYYY-MM-DD'`` strings and both are included in the
    result. An ``end`` that lies before ``start`` yields an empty list.
    """
    day_format = '%Y-%m-%d'
    first_day = datetime.strptime(start, day_format).date()
    last_day = datetime.strptime(end, day_format).date()
    span_in_days = (last_day - first_day).days

    stamps = []
    # +1 so that the end date itself is part of the range.
    for offset in range(span_in_days + 1):
        current_day = first_day + timedelta(days=offset)
        stamps.append(current_day.strftime(day_format))
    return stamps

# Daily dates from 1 August 2022 through 1 September 2022, both ends included.
data_collection_period = date_range(start='2022-08-01', end='2022-09-01')

In [3]:
def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase, space-separated plain text.

    Steps, in order: lowercase; drop ASCII ``@mentions``, ``#hashtags`` and
    ``http...`` URLs; turn ``( ) ! ?`` into spaces; expand ``&amp;`` to
    ``and``; collapse every run of whitespace to one space; delete every
    character that is not ``a-z``, ``0-9``, a space, or inside the ``À-ž``
    range; finally trim and single-space the result.

    Note that punctuation is *deleted*, not replaced by a space, so text
    such as ``"a.b"`` comes out as ``"ab"``.

    Parameters
    ----------
    tweet : object
        Raw tweet text. Anything that is not a ``str`` (for example the
        float NaN pandas uses for an empty CSV cell, or ``None``) is treated
        as a missing tweet.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or nothing
        survives the filters.
    """
    # isinstance rather than `type(tweet) == float`: the exact-type test let
    # None and numpy float NaNs through to `.lower()`, which then raised.
    if not isinstance(tweet, str):
        return ""

    text = tweet.lower()
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # @mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # #hashtags
    text = re.sub(r"http\S+", "", text)         # URLs
    text = re.sub(r"[()!?]", " ", text)
    # Raw string: '\[' and '\-' are invalid escapes in a plain literal and
    # trigger a SyntaxWarning on recent Python versions. The pattern itself
    # is unchanged.
    # NOTE(review): as written this only matches a span that starts with '['
    # and ends with '-…]' on one line — confirm that is the intended target.
    text = re.sub(r"\[.*?\-…]", " ", text)
    text = re.sub(r"&amp;", "and", text)
    # Treat every kind of whitespace as a separator. Previously only \n and
    # \t became spaces, so e.g. a lone \r or a non-breaking space was removed
    # by the filter below and glued the neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z0-9À-ž ]", "", text)
    return " ".join(text.split())
def get_most_recent_tweets(city, amount, distance = '20km'):
    """Scrape the newest tweets located around ``city`` and save them as CSV.

    Parameters
    ----------
    city : str
        Place name used in the ``near:"..."`` search operator and as the
        output sub-directory under ``data/``.
    amount : int
        Maximum number of tweets taken from the scraper.
    distance : str, default '20km'
        Radius passed to the ``within:`` search operator.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``rawContent`` (empty when the search has no
        hits). The same frame is written to
        ``data/{city}/{amount}_most_recent_tweets.csv``.
    """
    scraper = sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance}')
    tweets = pd.DataFrame(itertools.islice(scraper.get_items(), amount))
    if tweets.empty:
        # No hits gives a frame without columns; selecting ['date', 'content']
        # from it would raise KeyError, so start from an empty frame that has
        # the expected columns instead.
        tweets = pd.DataFrame(columns=['date', 'content'])
    df_city = tweets[['date', 'content']].rename(columns={"content": "rawContent"})

    out_path = f"data/{city}/{amount}_most_recent_tweets.csv"
    # Create data/<city>/ on first use so a finished scrape is not lost to a
    # missing-directory error at write time.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_city.to_csv(out_path)
    return df_city


def get_tweets_over_period(city, amount_per_day, data_collection_period, distance='20km'):
    """Scrape up to ``amount_per_day`` tweets per day window and save them as CSV.

    Each pair of consecutive entries in ``data_collection_period`` becomes
    the ``since:``/``until:`` bounds of one search, so ``n`` dates produce
    ``n - 1`` searches.

    Parameters
    ----------
    city : str
        Place name used in the ``near:"..."`` search operator and as the
        output sub-directory under ``data/``.
    amount_per_day : int
        Maximum number of tweets taken from each window's search.
    data_collection_period : sequence of str
        Ordered date strings as accepted by the ``since:``/``until:``
        operators; at least two are required.
    distance : str, default '20km'
        Radius passed to the ``within:`` search operator.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``content``: the per-window results stacked in
        window order, each window keeping its own 0-based row labels. The
        same frame is written to
        ``data/{city}/{amount_per_day * len(data_collection_period)}_tweets_over_period.csv``.

    Raises
    ------
    ValueError
        If fewer than two dates are supplied.
    """
    if len(data_collection_period) < 2:
        raise ValueError("data_collection_period needs at least two dates (one since/until pair)")

    windows = list(zip(data_collection_period[:-1], data_collection_period[1:]))
    daily_frames = []
    for since, until in tqdm(windows):
        query = f'near:"{city}" within:{distance} since:{since} until:{until} '
        df_temp = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(query).get_items(), amount_per_day))
        if df_temp.empty:
            # A window without tweets yields a frame with no columns, and
            # selecting ['date', 'content'] from it would raise KeyError.
            continue
        daily_frames.append(df_temp[['date', 'content']])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and copied the whole accumulated frame on every iteration.
    if daily_frames:
        df_city = pd.concat(daily_frames)
    else:
        df_city = pd.DataFrame(columns=['date', 'content'])

    # NOTE(review): the text column stays named 'content' here, whereas
    # get_most_recent_tweets writes it as 'rawContent'.
    out_path = f"data/{city}/{amount_per_day*len(data_collection_period)}_tweets_over_period.csv"
    # Create data/<city>/ on first use so a finished scrape is not lost to a
    # missing-directory error at write time.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_city.to_csv(out_path)
    return df_city

def clean_tweets_from_file(file_name):
    """Add ``cleaned_content`` and ``id`` columns to a tweet CSV, rewriting it in place.

    The file is read with its first column as the index. The raw text is
    expected in a ``rawContent`` column; a file that only has ``content``
    (the column name get_tweets_over_period writes) is accepted and that
    column is renamed to ``rawContent``.

    Parameters
    ----------
    file_name : str
        Path of the CSV to read and overwrite.

    Returns
    -------
    pandas.DataFrame
        The frame that was written back: ``id`` (1-based row number) as the
        first column, followed by the original columns and
        ``cleaned_content``.
    """
    df_city = pd.read_csv(file_name, index_col=[0])
    if 'rawContent' not in df_city.columns and 'content' in df_city.columns:
        df_city = df_city.rename(columns={'content': 'rawContent'})

    df_city['cleaned_content'] = [clean_tweet(text) for text in df_city['rawContent']]
    df_city['id'] = range(1, len(df_city) + 1)
    df_city = df_city[['id'] + [col for col in df_city.columns if col != 'id']]
    # A bare `df_city.set_index('id')` used to sit here. set_index returns a
    # new frame, so the discarded call never had any effect; it is dropped
    # and 'id' stays a regular column, exactly as in the files written so far.
    df_city.to_csv(file_name)
    return df_city


In [71]:
# Clean the stored Singapore tweets; the CSV is rewritten in place.
temp_df = clean_tweets_from_file(
    file_name='data/Singapore/tweets_over_period/24400_tweets_over_period.csv'
)

In [72]:
# Preview the first 150 cleaned rows.
temp_df.iloc[:150]

Unnamed: 0,id,date,rawContent,cleaned_content
0,1,2022-01-01 23:59:27+00:00,洋服3着とサンダルとアイシャドウを年末買って1日に下ろしたら、ぱりっとお正月気分に🎍\nいつぶり？という4連休。ぐーーーっすり寝て、ようやく復活✨,31 4
1,2,2022-01-01 23:59:01+00:00,100%,100
2,3,2022-01-01 23:58:48+00:00,Just posted a photo @ Our Tampines Hub https://t.co/QjJN4MJnfB,just posted a photo our tampines hub
3,4,2022-01-01 23:58:29+00:00,Just posted a photo @ Our Tampines Hub https://t.co/QUz8UuK2Sv,just posted a photo our tampines hub
4,5,2022-01-01 23:58:15+00:00,@ShibinMeta Eth okay,eth okay
5,6,2022-01-01 23:58:13+00:00,Just posted a photo @ Our Tampines Hub https://t.co/pq9J6zYBuC,just posted a photo our tampines hub
6,7,2022-01-01 23:58:02+00:00,@GGebunny Oh you gym at westgate too,oh you gym at westgate too
7,8,2022-01-01 23:57:20+00:00,@Siennafrst I survived another year without travel. In the 270 square miles of Singapore no less.,i survived another year without travel in the 270 square miles of singapore no less
8,9,2022-01-01 23:56:08+00:00,#とき宣お正月 #超ときめき宣伝部 \n??? https://t.co/UZA8k8PSi9,
9,10,2022-01-01 23:55:49+00:00,おはようございます😊\n今日は、これから息子の学校のペンキ塗りと荷物運び❗\nお正月ってどこいった。\n雨がしとしと肌寒いが行ってきます‼️エイエイオー。,


In [33]:
%%time
# get_most_recent_tweets("Singapore", 3000, distance='20km')
get_tweets_over_period("New York", 1000, data_collection_period)

KeyboardInterrupt: 

In [26]:
%%time
get_most_recent_tweets("New York", 3000, distance='20km')

CPU times: user 1.75 s, sys: 85 ms, total: 1.83 s
Wall time: 2min 4s


Unnamed: 0,date,rawContent,cleaned_content
0,2022-10-04 12:46:15+00:00,@ManotoNews خواهراش میگه که میخوان مث مادرش باشن\n#مهسا_امینی \n#اعتصابات_سراسری \n#OpIran,
1,2022-10-04 12:46:14+00:00,🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡 https://t.co/U8HadmmAO8,
2,2022-10-04 12:46:14+00:00,Simply Be Well -Six Plant Based Body Soaps - Boxed Set NEW https://t.co/AmoTUCY5vY #eBay via @eBay,simply be well six plant based body soaps boxed set new via
3,2022-10-04 12:46:14+00:00,"$IDEX - Ideanomics, Solectrac secured new business contracts to supply electric tractors to major fleet operators https://t.co/3sOiHMyi9S",idex ideanomics solectrac secured new business contracts to supply electric tractors to major fleet operators
4,2022-10-04 12:46:14+00:00,@DrewOnline @reggaeology What petcentage of motorbikes on the road are legally licensed?,what petcentage of motorbikes on the road are legally licensed
...,...,...,...
2995,2022-10-04 12:40:08+00:00,@jeongsteph You’ve been manifesting this haha congrats @jeongsteph !!!,youve been manifesting this haha congrats
2996,2022-10-04 12:40:08+00:00,"@Gokul_Sahni Therein lies the trap of aligning with the West—your prosperity, dependent on Western technology and technology, is subject to adherence to their vision of the world. That vision, or “rules”, codifies Western dominance. Challenge that and you’re finished.",therein lies the trap of aligning with the westyour prosperity dependent on western technology and technology is subject to adherence to their vision of the world that vision or rules codifies western dominance challenge that and youre finished
2997,2022-10-04 12:40:08+00:00,@appare_noka 外で気を付けてね,
2998,2022-10-04 12:40:07+00:00,Putin’s war comes home to Russia | @MID_RF ☠️ #KILLPutinNOW ⚰️ &amp; all his #Oligarchs 🤑 + Kremlin Worms 🪱 &amp; DEAD Men Walking Yevgeny Prigozhin #YevgenyPrigozhin &amp; Ramzan Kadyrov #RamzanKadyrov https://t.co/TKtN39OJMU,putins war comes home to russia amp all his kremlin worms amp dead men walking yevgeny prigozhin amp ramzan kadyrov


In [4]:
# Spot-check: punctuation between words is removed without leaving a space.
clean_tweet(tweet="Pallelai bedokwalk no49.misslohahhuat and misskitty u masturbates for victims.this")

'pallelai bedokwalk no49misslohahhuat and misskitty u masturbates for victimsthis'