## Data Colletion from Twitter

In [4]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import numpy as np
import re
import csv
from collections import Counter
import nltk
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [5]:
from datetime import datetime, timedelta

def date_range(start, end):
    start_date = datetime.strptime(start, '%Y-%m-%d').date()
    end_date = datetime.strptime(end, '%Y-%m-%d').date()
    delta = end_date - start_date 
    days = [start_date + timedelta(days=i) for i in range(delta.days + 1)]
    return list(map(lambda n: n.strftime("%Y-%m-%d"), days))

data_collection_period = date_range('2022-08-01', '2022-09-01')

In [6]:
def clean_tweet(tweet):
    if type(tweet) == float:
        return ""
    temp = tweet.lower()
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\-…]',' ', temp)
    temp = re.sub('&amp;','and', temp)
    temp = re.sub("\n"," ", temp)
    temp = re.sub("\t"," ", temp)
    temp = re.sub("[^a-z0-9À-ž ]","", temp)
    temp = temp.split()
    temp = " ".join(word for word in temp)
    return temp
def get_most_recent_tweets(city, amount, distance = '20km'):
    df_city = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance}').get_items(), amount))[['date', 'content']]
    df_city = df_city.rename(columns={"content": "rawContent"})
    # df_city['cleaned_content'] = [clean_tweet(i) for i in df_city['rawContent']]
    df_city.to_csv(f"data/{city}/{amount}_most_recent_tweets.csv")
    return df_city


def get_tweets_over_period(city, amount_per_day, data_collection_period, distance='20km'):
    df_city = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance} since:{data_collection_period[0]} until:{data_collection_period[1]} ').get_items(), amount_per_day))[['date', 'content']]
    for i in tqdm(range(1, len(data_collection_period)-1)):
        df_temp = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance} since:{data_collection_period[i]} until:{data_collection_period[i+1]} ').get_items(), amount_per_day))[['date', 'content']]
        df_city = df_city.append(df_temp)
    # df_city = df_city.rename(columns={"content": "rawContent"})
    df_city.to_csv(f"data/{city}/{amount_per_day*len(data_collection_period)}_tweets_over_period.csv")
    return df_city

def clean_tweets_from_file(file_name):
    df_city = pd.read_csv(file_name, index_col=0)
    df_city['cleaned_content'] = [clean_tweet(i) for i in df_city['rawContent']]
    df_city.to_csv(file_name)
    return df_city


In [7]:
temp_df = clean_tweets_from_file('data/Singapore/tweets_over_period/24400_tweets_over_period.csv')

In [64]:
temp_df.sample(10)
# clean_tweet("I <3 u")

Unnamed: 0,date,rawContent,cleaned_content
1878,2022-10-04 12:42:24+00:00,faith,faith
696,2022-10-04 12:44:54+00:00,"NEW #Raiders @SNBToday EP. via @AudacySports \n\n-Other than feature Josh Jacobs, what else did the Raiders do to get the W?\n\n-@LVGully and I share our thoughts on the O-line combination \n\n-Thoughts on Darren Waller, Chandler Jones and Amik Robertson\n\n🎧⬇️\nhttps://t.co/gcTghMjY6X",new ep via other than feature josh jacobs what else did the raiders do to get the w and i share our thoughts on the oline combination thoughts on darren waller chandler jones and amik robertson
1307,2022-10-04 12:43:37+00:00,@Go_Blue1985 @SparklesNSunsh2 @Jillie_Alexis @TimRunsHisMouth I keep hearing this from MAGAs but cannot find a single article in Conservative media pointing to the complete &amp; absolute end to all global conflicts between 2016-2020. Can you point me to your sources?,i keep hearing this from magas but cannot find a single article in conservative media pointing to the complete and absolute end to all global conflicts between 20162020 can you point me to your sources
367,2022-10-04 12:45:30+00:00,@tosa05909757 I said slow the pace - not stop or pause. Monetary policy works with a lag of 6 to 18 months. We're in month 7 and the stress caused by strong dollar is serious. Slow to 50 bp plus QT is still a lot of tightening.,i said slow the pace not stop or pause monetary policy works with a lag of 6 to 18 months were in month 7 and the stress caused by strong dollar is serious slow to 50 bp plus qt is still a lot of tightening
1716,2022-10-04 12:42:46+00:00,@pulte I need a lambo to get my wife back #GME #MOON #SHFRFUCKED https://t.co/vgjEq0MYVx,i need a lambo to get my wife back
226,2022-10-04 12:45:50+00:00,"@SignalsToNoises Well, I did write that in the piece, so I agree :) but as I also write, you have to consider what it takes to get a movie greenlit. They don’t just appear out of thin air.",well i did write that in the piece so i agree but as i also write you have to consider what it takes to get a movie greenlit they dont just appear out of thin air
1859,2022-10-04 12:42:27+00:00,How can we help Iran?,how can we help iran
2705,2022-10-04 12:40:42+00:00,I just paid for the weather app… I’m officially an adult.,i just paid for the weather app im officially an adult
2412,2022-10-04 12:41:17+00:00,"@TomCurle @nathaliejacoby1 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 😂 You’re hilarious! Justice and the freedom to be is such divisive, hateful behavior!",youre hilarious justice and the freedom to be is such divisive hateful behavior
1203,2022-10-04 12:43:51+00:00,@sadie65 @jujutough @YasinShanks Fr. Bro a page one Stan 🤣,fr bro a page one stan


In [33]:
%%time
# get_most_recent_tweets("Singapore", 3000, distance='20km')
get_tweets_over_period("New York", 1000, data_collection_period)

KeyboardInterrupt: 

In [26]:
%%time
get_most_recent_tweets("New York", 3000, distance='20km')

CPU times: user 1.75 s, sys: 85 ms, total: 1.83 s
Wall time: 2min 4s


Unnamed: 0,date,rawContent,cleaned_content
0,2022-10-04 12:46:15+00:00,@ManotoNews خواهراش میگه که میخوان مث مادرش باشن\n#مهسا_امینی \n#اعتصابات_سراسری \n#OpIran,
1,2022-10-04 12:46:14+00:00,🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡🤡 https://t.co/U8HadmmAO8,
2,2022-10-04 12:46:14+00:00,Simply Be Well -Six Plant Based Body Soaps - Boxed Set NEW https://t.co/AmoTUCY5vY #eBay via @eBay,simply be well six plant based body soaps boxed set new via
3,2022-10-04 12:46:14+00:00,"$IDEX - Ideanomics, Solectrac secured new business contracts to supply electric tractors to major fleet operators https://t.co/3sOiHMyi9S",idex ideanomics solectrac secured new business contracts to supply electric tractors to major fleet operators
4,2022-10-04 12:46:14+00:00,@DrewOnline @reggaeology What petcentage of motorbikes on the road are legally licensed?,what petcentage of motorbikes on the road are legally licensed
...,...,...,...
2995,2022-10-04 12:40:08+00:00,@jeongsteph You’ve been manifesting this haha congrats @jeongsteph !!!,youve been manifesting this haha congrats
2996,2022-10-04 12:40:08+00:00,"@Gokul_Sahni Therein lies the trap of aligning with the West—your prosperity, dependent on Western technology and technology, is subject to adherence to their vision of the world. That vision, or “rules”, codifies Western dominance. Challenge that and you’re finished.",therein lies the trap of aligning with the westyour prosperity dependent on western technology and technology is subject to adherence to their vision of the world that vision or rules codifies western dominance challenge that and youre finished
2997,2022-10-04 12:40:08+00:00,@appare_noka 外で気を付けてね,
2998,2022-10-04 12:40:07+00:00,Putin’s war comes home to Russia | @MID_RF ☠️ #KILLPutinNOW ⚰️ &amp; all his #Oligarchs 🤑 + Kremlin Worms 🪱 &amp; DEAD Men Walking Yevgeny Prigozhin #YevgenyPrigozhin &amp; Ramzan Kadyrov #RamzanKadyrov https://t.co/TKtN39OJMU,putins war comes home to russia amp all his kremlin worms amp dead men walking yevgeny prigozhin amp ramzan kadyrov


## Exploratory Data Analysis

In [9]:
english_words_list = []

with open('english_words_corpora/english_corpus.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    list_of_csv = list(csv_reader)
    english_words_list = list_of_csv[0]      

# # sanity test for some english words
# list_english_words = list(english_words_set )
# print("sustainability" in list_english_words)

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
import csv

def write_to_csv(file_name, list_words):
    with open(f"{file_name}.csv", 'w', encoding='UTF8', newline='\n') as f:
        writer = csv.writer(f)
        writer.writerow(["word", "frequency"])
        for tup in list_words:
            writer.writerow(list(tup))

In [13]:
def generate_word_corpus(filename):
    corpus = []
    with open(filename) as file:
        csvreader = csv.reader(file)
        next(csvreader)
        for row in csvreader:
            if len(row) > 3:
                all_words_in_sentence = row[3].strip().split(" ")
                corpus += all_words_in_sentence
    return corpus


def generate_word_frequency(city, approach, corpus):
    counter=Counter(corpus)
    most=counter.most_common()
    write_to_csv(f"data/{city}/{approach}_topMostFrequentWords", most[:100])
    write_to_csv(f"data/{city}/{approach}_topLeastFrequentWords", most[-100::])
    
    # non-English words
    non_english_words = []
    for tup in most:
        if (lemmatizer.lemmatize(tup[0]) not in english_words_set and tup[0].isnumeric() == False):
            non_english_words.append(tup)
    write_to_csv(f"data/{city}/{approach}_nonEnglishWords", non_english_words)

    
def average_sentence_length(city):
    total_length = 0
    count = 0
    with open(f"data/{city}.csv") as file:
        csvreader = csv.reader(file)
        next(csvreader)
        for row in csvreader:
            if len(row) > 3:
                all_words_in_sentence = row[3].split(" ")
                total_length += len(all_words_in_sentence)
                count += 1
    print("Average sentence length:", total_length/count)

In [14]:
import os
def create_relative_english_frequency_col(df):
    def calculate_relative_frequency(content):
        english_word_bools = [lemmatizer.lemmatize(word) in english_words_set for word in str(content).split() if word.isalpha()]
        if not english_word_bools:
            return 0
        return english_word_bools.count(True)/len(english_word_bools) 
    df["english_relative_frequency"] = df.cleaned_content.apply(calculate_relative_frequency)
    return df

def filter_by_english_thresholds(min_thresh, max_thresh, original_dataset, create_file = False):
    output_dir = os.path.splitext(original_dataset)[0]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    with open(original_dataset, "r") as infile:
        source_df = pd.read_csv(infile)
        out_df = create_relative_english_frequency_col(source_df)
        out_df = out_df[(min_thresh <= out_df["english_relative_frequency"]) & (out_df["english_relative_frequency"] < max_thresh)]
    
    if create_file: 
        with open(output_dir + f"/{min_thresh}_to_{max_thresh}_english_words.csv", "w") as outfile:
            outfile.write(out_df.to_csv())
            
    return out_df


In [15]:
for i in range(10):
    filter_by_english_thresholds(i/10, (i+1)/10, "data/Singapore/tweets_over_period/24400_tweets_over_period.csv", create_file = True)
filter_by_english_thresholds(1, 1.01, "data/Singapore/tweets_over_period/24400_tweets_over_period.csv")

NameError: name 'english_words_set' is not defined

In [66]:
filter_by_english_thresholds(0, 0.1, "data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets.csv")

Unnamed: 0.1,Unnamed: 0,date,rawContent,cleaned_content,english_relative_frequency
8,8,2022-09-26 13:21:00+00:00,Yasss,yasss,0.0
15,15,2022-09-26 13:20:53+00:00,HBD https://t.co/Jw8o7AE4Mw,hbd,0.0
20,20,2022-09-26 13:20:44+00:00,pls stfu oml 😂😂😂😂😂😂😂,pls stfu oml,0.0
23,23,2022-09-26 13:20:43+00:00,夕飯を茹で卵とプロテインでいけるかと思ったけど、3時間でお腹空いてきちゃった🥲はてどうしよう〜,3,0.0
29,29,2022-09-26 13:20:36+00:00,@nas_zaidi Hahaha,hahaha,0.0
37,37,2022-09-26 13:20:30+00:00,@Rekha51627292 @anbarasu_Boss @Kavin_m_0431 @KavinFansClub Unmai Dhaaney. Eeee😁🏃🏻 https://t.co/LRiejzWD3a,unmai dhaaney eeee,0.0
50,50,2022-09-26 13:20:21+00:00,先週久しぶりにヨーロッパに行って、周りの友人たちからの言動からESGについて真面目に考えてて、変化を感じた。正直経済成長している最中の国では優先順位は低いと思う,esg,0.0
58,58,2022-09-26 13:20:12+00:00,@ineorino @FWBESS Beban &amp; tuntutan,beban amp tuntutan,0.0
66,66,2022-09-26 13:20:01+00:00,新加坡武装部队今年同澳大利亚国防部队人员举行三叉戟演习，两国一共派出超过1600名部队人员参与，新加坡海军也首次出动两艘海军登陆舰。 https://t.co/bAaPuQrjJX,1600,0.0
68,68,2022-09-26 13:20:00+00:00,@sekaijii :DD,dd,0.0


In [88]:
# generate 3000 most recent tweets & analysis
singlish_most_recent_tweets_corpus = generate_word_corpus("data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets.csv")
generate_word_frequency("Singapore", "GenerateMostRecentTweets/3000_most_recent_tweets", singlish_most_recent_tweets_corpus)

In [83]:
# run once, generate cleaned tweets separately for 2nd approach to reduce wait time
df = pd.read_csv('data/Singapore/GenerateTweetsOverAPeriod/24400_tweets_over_period.csv', index_col=0)
df['cleaned_content'] = [clean_tweet(i) for i in df['rawContent']]
df.to_csv(f"data/Singapore/GenerateTweetsOverAPeriod/24400_cleaned_tweets_over_period.csv")

In [89]:
# generate analysis for tweets over period 2022-01-01 and 2022-09-01
singlish_over_period_tweets_corpus = generate_word_corpus(f"data/Singapore/GenerateTweetsOverAPeriod/{100*len(data_collection_period)}_cleaned_tweets_over_period.csv")
generate_word_frequency("Singapore", f"GenerateTweetsOverAPeriod/{100*len(data_collection_period)}_cleaned_tweets_over_period", singlish_over_period_tweets_corpus)