## Data Colletion from Twitter

In [1]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import numpy as np
import re
import csv
from collections import Counter
import nltk

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [72]:
stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]

def clean_tweet(tweet):
    if type(tweet) == np.float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp


def get_data(city, amount, approach=None, distance = '20km'):
    df_city = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance}').get_items(), amount))[['date', 'rawContent']]
    df_city['cleaned_content'] = [clean_tweet(i) for i in df_city['rawContent']]
    df_city.to_csv(f"data/{city}/{approach}.csv")
    return df_city

In [50]:
%%time
get_data("Singapore", 1000, "1000_most_recent_tweets")

CPU times: user 843 ms, sys: 92.4 ms, total: 936 ms
Wall time: 45 s


Unnamed: 0,date,rawContent,cleaned_content
0,2022-09-26 09:57:29+00:00,@Otterverse_NFT @abstractnouveau @WhyorJ @airsg1 @liberxtor2502,
1,2022-09-26 09:57:27+00:00,Alo ges https://t.co/WdLl0peEBZ,alo ges
2,2022-09-26 09:57:27+00:00,@Xynonetix Hbd 🥳,hbd
3,2022-09-26 09:57:25+00:00,@msianinsight Nope.. jgn pandang belakang 'Lawan Tetap Lawan',nope jgn pandang belakang lawan tetap lawan
4,2022-09-26 09:57:22+00:00,pochu,pochu
...,...,...,...
995,2022-09-26 09:39:14+00:00,@AbudiAlsagoff So always pegang ayat ni “Don't judge a book by its cover”!,so always pegang ayat ni dont judge book by its cover
996,2022-09-26 09:39:13+00:00,"@L_V_N_K @TheFracture_ @Bulldog6666_ @merelygravel @NyuFyuTr Man such a great decision, I love the binaries and Medusa. But all are amazing! https://t.co/xSogX16rNR",man such great decision i love binaries medusa but all are amazing
997,2022-09-26 09:39:11+00:00,Awww my favourite sport! Thanks @JJJJackyWoo for the gift😭🥰 love it!!! https://t.co/QJ8BqKi2f4,awww my favourite sport thanks gift love it
998,2022-09-26 09:39:11+00:00,@cthajar @hafizfiz18 Mak dia tahu tak anak dia masuk sana sebab apa?,mak dia tahu tak anak dia masuk sana sebab apa


## Exploratory Data Analysis

In [69]:
stopwords = nltk.corpus.stopwords.words('english')
english_words_set = set(nltk.corpus.words.words())

# # sanity test for some english words
# list_english_words = list(english_words_set )
# print("sustainability" in list_english_words)

In [36]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [63]:
import csv

def write_to_csv(file_name, list_words):
    with open(f"{file_name}.csv", 'w', encoding='UTF8', newline='\n') as f:
        writer = csv.writer(f)
        writer.writerow(["word", "frequency"])
        for tup in list_words:
            writer.writerow(list(tup))

In [71]:
def generate_word_corpus(city):
    corpus = []
    with open(f"data/{city}.csv") as file:
        csvreader = csv.reader(file)
        next(csvreader)
        for row in csvreader:
            if len(row) > 3:
                all_words_in_sentence = row[3].strip().split(" ")
                corpus += all_words_in_sentence
    return corpus


def generate_word_frequency(city, approach, corpus):
    counter=Counter(corpus)
    most=counter.most_common()
    print("Top most frequent words:", most[:100], '\n')
    print("Top least frequent words:", most[-100:], '\n')
    write_to_csv(f"data/{city}/{approach}_topMostFrequentWords", most[:100])
    write_to_csv(f"data/{city}/{approach}_topLeastFrequentWords", most[:-100])
    
    # non-English words
    non_english_words = []
    for tup in most:
        if (lemmatizer.lemmatize(tup[0]) not in english_words_set and tup[0].isnumeric() == False):
            print("Not in english words set:", tup)
            non_english_words.append(tup)
    write_to_csv(f"data/{city}/{approach}_nonEnglishWords", non_english_words)

    
def average_sentence_length(city):
    total_length = 0
    count = 0
    with open(f"data/{city}.csv") as file:
        csvreader = csv.reader(file)
        next(csvreader)
        for row in csvreader:
            if len(row) > 3:
                all_words_in_sentence = row[3].split(" ")
                total_length += len(all_words_in_sentence)
                count += 1
    print("Average sentence length:", total_length/count)
        

singlish_corpus = generate_word_corpus("Singapore")
generate_word_frequency("Singapore", "1000_most_recent_tweets", singlish_corpus)

Top most frequent words: [('', 153), ('i', 139), ('is', 107), ('you', 97), ('it', 81), ('this', 67), ('that', 60), ('are', 49), ('best', 43), ('at', 41), ('with', 40), ('youre', 39), ('my', 39), ('but', 39), ('sa', 38), ('pie', 37), ('cardagulan', 35), ('witheian', 35), ('no', 34), ('can', 34), ('so', 33), ('its', 33), ('just', 33), ('all', 32), ('not', 32), ('like', 32), ('be', 31), ('me', 30), ('s', 30), ('if', 29), ('ni', 27), ('will', 27), ('we', 26), ('t', 26), ('tak', 25), ('your', 25), ('im', 25), ('dia', 25), ('how', 24), ('what', 24), ('out', 23), ('nak', 23), ('do', 23), ('yang', 23), ('have', 22), ('one', 20), ('up', 20), ('as', 20), ('by', 20), ('our', 20), ('done', 20), ('when', 19), ('good', 18), ('yg', 18), ('aku', 18), ('why', 17), ('day', 17), ('was', 16), ('about', 16), ('love', 16), ('more', 16), ('thanks', 16), ('would', 16), ('he', 16), ('amp', 15), ('1', 15), ('dan', 15), ('time', 15), ('has', 15), ('kita', 15), ('ever', 14), ('apa', 14), ('there', 14), ('them', 1

Not in english words set: ('kontak', 1)
Not in english words set: ('malasnyaaaaaa', 1)
Not in english words set: ('kene', 1)
Not in english words set: ('fourty', 1)
Not in english words set: ('poni', 1)
Not in english words set: ('vadhilei', 1)
Not in english words set: ('sanagathi', 1)
Not in english words set: ('enti', 1)
Not in english words set: ('maku', 1)
Not in english words set: ('eni', 1)
Not in english words set: ('vachayi', 1)
Not in english words set: ('entha', 1)
Not in english words set: ('bjp', 1)
Not in english words set: ('pk', 1)
Not in english words set: ('andaru', 1)
Not in english words set: ('kalisisthe', 1)
Not in english words set: ('gelcharu', 1)
Not in english words set: ('unko', 1)
Not in english words set: ('vishayam', 1)
Not in english words set: ('epati', 1)
Not in english words set: ('dhaka', 1)
Not in english words set: ('sengar', 1)
Not in english words set: ('gelavaledu', 1)
Not in english words set: ('sti', 1)
Not in english words set: ('banned', 1)
N

Not in english words set: ('msm', 1)
Not in english words set: ('onzzz', 1)
Not in english words set: ('podcast', 1)
Not in english words set: ('infantilising', 1)
Not in english words set: ('nuanced', 1)
Not in english words set: ('3am', 1)
Not in english words set: ('attracted', 1)
Not in english words set: ('coooooooool', 1)
Not in english words set: ('hahhaa', 1)
Not in english words set: ('tuuu', 1)
Not in english words set: ('menahan', 1)
Not in english words set: ('lawa', 1)
Not in english words set: ('takda', 1)
Not in english words set: ('logo', 1)
Not in english words set: ('babie', 1)
Not in english words set: ('iki', 1)
Not in english words set: ('kecewa', 1)
Not in english words set: ('tiap', 1)
Not in english words set: ('tuku', 1)
Not in english words set: ('jus', 1)
Not in english words set: ('jambu', 1)
Not in english words set: ('terlalu', 1)
Not in english words set: ('bening', 1)
Not in english words set: ('trus', 1)
Not in english words set: ('pengen', 1)
Not in en