## Data Collection from Twitter

In [2]:
import csv
import itertools
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter

# Notebook-wide display settings: never truncate cell text (tweets are long),
# and allow up to 500 rows before pandas collapses a frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [3]:
# Minimal stop-word list applied by clean_tweet().
# NOTE(review): the "Exploratory Data Analysis" section later rebinds the global
# name `stopwords` to NLTK's English list; clean_tweet() resolves the name at call
# time, so its output depends on which cells have been run — confirm this is intended.
stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]

def clean_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of lowercase tokens.

    Steps, in order: lowercase; drop apostrophes; drop @mentions, #hashtags and
    URLs; replace bracketed spans and every character outside ``a-z0-9`` with a
    space; remove tokens found in the module-level ``stopwords`` list.

    Args:
        tweet: The raw tweet text. Any non-string value (e.g. the float NaN that
            pandas uses for missing cells, or ``None``) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``tweet`` is not a string.
    """
    # `np.float` was removed in NumPy 1.24; checking for `str` covers NaN
    # floats as well as any other non-text value.
    if not isinstance(tweet, str):
        return ""
    text = tweet.lower()
    text = re.sub("'", "", text)  # keep contractions together: "don't" -> "dont"
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # @mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # #hashtags
    text = re.sub(r"http\S+", "", text)  # URLs
    text = re.sub(r"[()!?]", " ", text)
    text = re.sub(r"\[.*?\]", " ", text)  # bracketed spans, brackets included
    text = re.sub(r"[^a-z0-9]", " ", text)  # everything else non-alphanumeric
    return " ".join(word for word in text.split() if word not in stopwords)


def get_data(city, amount, approach=None, distance = '20km'):
    """Scrape recent tweets near a city, clean them, and save them as CSV.

    Args:
        city: Place name used in the ``near:"<city>"`` search operator; also the
            sub-directory of ``data/`` that the CSV is written to.
        amount: Maximum number of tweets to take from the scraper.
        approach: Base name of the output file, written to
            ``data/<city>/<approach>.csv``. The default ``None`` is formatted
            literally, producing ``None.csv``.
        distance: Search radius used in the ``within:<distance>`` operator.

    Returns:
        A DataFrame with columns ``date``, ``rawContent`` and
        ``cleaned_content`` (the latter produced by ``clean_tweet``).
    """
    query = f'near:"{city}" within:{distance}'
    tweets = itertools.islice(sntwitter.TwitterSearchScraper(query).get_items(), amount)
    # Naming the columns explicitly keeps the frame well-formed when the search
    # returns nothing (selecting columns from an empty frame raises KeyError).
    df_city = pd.DataFrame(
        [(tweet.date, tweet.rawContent) for tweet in tweets],
        columns=['date', 'rawContent'],
    )
    df_city['cleaned_content'] = [clean_tweet(text) for text in df_city['rawContent']]
    out_path = f"data/{city}/{approach}.csv"
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_city.to_csv(out_path)
    return df_city

In [4]:
%%time
get_data("Singapore", 1000, "1000_most_recent_tweets")

CPU times: user 821 ms, sys: 73.8 ms, total: 895 ms
Wall time: 43.5 s


Unnamed: 0,date,rawContent,cleaned_content
0,2022-09-26 10:42:19+00:00,@telbuz @ThatAlexWoman Evidence?,evidence
1,2022-09-26 10:42:17+00:00,Fundraising links:\nhttps://t.co/SDmPt9Z7jI\n\n#도경수 #DohKyungSoo\n#BadProsecutor #진검승부,fundraising links
2,2022-09-26 10:42:16+00:00,Got a new haircut.. I need opinions..\nIssit hair reveal time? („• ֊ •„),got new haircut i need opinions issit hair reveal time
3,2022-09-26 10:42:15+00:00,No wasep. No call,no wasep no call
4,2022-09-26 10:42:15+00:00,@112xday “Owhkeigh cute voice \nNot pulling. “,owhkeigh cute voice not pulling
...,...,...,...
995,2022-09-26 10:26:42+00:00,@theboypostman Konje which trap joint did he make before roll up?,konje which trap joint did he make before roll up
996,2022-09-26 10:26:40+00:00,@rlgxg Tanya mimin besnya gih,tanya mimin besnya gih
997,2022-09-26 10:26:40+00:00,A big thank you to all for sending your beautiful photos for the #APECPhotoContest2022. 👏 We are excited and look forward to featuring the top 10 photos in the coming weeks. Keep your eyes peeled for more updates on our social media pages and our website!📱 https://t.co/FZfCZm3J5o,big thank you all sending your beautiful photos we are excited look forward featuring top 10 photos coming weeks keep your eyes peeled more updates our social media pages our website
998,2022-09-26 10:26:40+00:00,Tadi malam mabok mabok an https://t.co/g7qUCc7FvD,tadi malam mabok mabok


## Exploratory Data Analysis

In [5]:
# Download the NLTK corpora used in this cell on first run, so a fresh
# environment does not fail with LookupError; corpora already on disk are
# left untouched.
for _corpus_name in ("stopwords", "words"):
    try:
        nltk.data.find(f"corpora/{_corpus_name}")
    except LookupError:
        nltk.download(_corpus_name, quiet=True)

# NOTE(review): this rebinds the global `stopwords` that clean_tweet() reads at
# call time, so tweets cleaned after this cell runs are filtered with NLTK's
# English list instead of the short list defined next to clean_tweet — confirm.
stopwords = nltk.corpus.stopwords.words('english')
# Vocabulary used by generate_word_frequency() to decide whether a token is English.
english_words_set = set(nltk.corpus.words.words())

# # sanity test for some english words
# list_english_words = list(english_words_set )
# print("sustainability" in list_english_words)

In [6]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; generate_word_frequency() uses it to reduce each
# token to its lemma before the English-vocabulary lookup.
# NOTE(review): lemmatize() presumably needs the NLTK WordNet corpus to be
# installed (nltk.download('wordnet')) — confirm on a fresh environment.
lemmatizer = WordNetLemmatizer()

In [7]:
import csv

def write_to_csv(file_name, list_words):
    """Write word/frequency pairs to ``<file_name>.csv`` beneath a header row.

    Args:
        file_name: Output path without the ``.csv`` extension.
        list_words: Iterable of ``(word, frequency)`` pairs, written one per
            row in the order given.
    """
    header = ["word", "frequency"]
    target = f"{file_name}.csv"
    with open(target, 'w', encoding='UTF8', newline='\n') as out_file:
        out = csv.writer(out_file)
        out.writerow(header)
        out.writerows(list(pair) for pair in list_words)

In [None]:
def generate_word_corpus(city, approach=None):
    """Collect every cleaned token from a scraped-tweets CSV into one flat list.

    Args:
        city: City name. With ``approach`` omitted the file read is
            ``data/<city>.csv``.
        approach: Optional base name of the scrape; when given, the file read is
            ``data/<city>/<approach>.csv``, the location ``get_data`` writes to.

    Returns:
        List of tokens taken from the fourth CSV column (index 3) of every row
        after the header, in file order. Rows with fewer than four columns are
        skipped, and empty cells contribute no tokens.
    """
    csv_path = f"data/{city}.csv" if approach is None else f"data/{city}/{approach}.csv"
    corpus = []
    # Tweets contain non-ASCII text and embedded newlines: read as UTF-8 and let
    # the csv module handle line endings (newline="").
    with open(csv_path, encoding="utf-8", newline="") as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header row; tolerate an empty file
        for row in csvreader:
            if len(row) > 3:
                # split() without an argument yields [] for an empty cell,
                # whereas split(" ") would add a bogus '' token.
                corpus.extend(row[3].split())
    return corpus


def generate_word_frequency(city, approach, corpus, top_n=100):
    """Write word-frequency reports for a token corpus.

    Three CSV files are written under ``data/<city>/`` via ``write_to_csv``:

    * ``<approach>_topMostFrequentWords``  - the ``top_n`` most frequent tokens.
    * ``<approach>_topLeastFrequentWords`` - the ``top_n`` least frequent tokens,
      least frequent first.
    * ``<approach>_nonEnglishWords``       - every non-numeric token whose lemma
      is not in ``english_words_set``, most frequent first.

    Args:
        city: City name; selects the output directory.
        approach: Prefix for the output file names.
        corpus: Iterable of tokens to count.
        top_n: Number of entries in the most/least frequent reports.
    """
    ranked = Counter(corpus).most_common()
    write_to_csv(f"data/{city}/{approach}_topMostFrequentWords", ranked[:top_n])
    # `ranked[:-top_n]` would be everything EXCEPT the last top_n entries (and
    # empty for small corpora); this reversed slice is the n-least-common idiom
    # from the Counter documentation.
    write_to_csv(f"data/{city}/{approach}_topLeastFrequentWords", ranked[:-top_n - 1:-1])

    # non-English words
    non_english_words = [
        (word, frequency)
        for word, frequency in ranked
        if not word.isnumeric() and lemmatizer.lemmatize(word) not in english_words_set
    ]
    write_to_csv(f"data/{city}/{approach}_nonEnglishWords", non_english_words)

    
def average_sentence_length(city, approach=None):
    """Print the mean number of tokens per cleaned tweet in a scraped CSV.

    Args:
        city: City name. With ``approach`` omitted the file read is
            ``data/<city>.csv``.
        approach: Optional base name of the scrape; when given, the file read is
            ``data/<city>/<approach>.csv``, the location ``get_data`` writes to.

    Every row after the header that has at least four columns counts as one
    sentence; its length is the number of whitespace-separated tokens in the
    fourth column (index 3), so an empty cell counts as length 0. When no such
    row exists, a placeholder message is printed instead of dividing by zero.
    """
    csv_path = f"data/{city}.csv" if approach is None else f"data/{city}/{approach}.csv"
    total_length = 0
    count = 0
    # Tweets contain non-ASCII text and embedded newlines: read as UTF-8 and let
    # the csv module handle line endings (newline="").
    with open(csv_path, encoding="utf-8", newline="") as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header row; tolerate an empty file
        for row in csvreader:
            if len(row) > 3:
                # split() gives 0 tokens for an empty cell; split(" ") gave 1.
                total_length += len(row[3].split())
                count += 1
    if count == 0:
        print("Average sentence length: n/a (no rows found)")
        return
    print("Average sentence length:", total_length/count)
        

# Build the token corpus for Singapore, then write its word-frequency reports
# under data/Singapore/ with the "1000_most_recent_tweets" prefix.
# NOTE(review): generate_word_corpus("Singapore") opens data/Singapore.csv, whereas
# get_data() saves the scrape to data/Singapore/1000_most_recent_tweets.csv —
# confirm which file is the intended input.
singlish_corpus = generate_word_corpus("Singapore")
generate_word_frequency("Singapore", "1000_most_recent_tweets", singlish_corpus)