## Data Collection from Twitter

In [1]:
import csv
import itertools
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter

# Notebook display settings: never truncate cell text and show up to 500 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [90]:
from datetime import datetime, timedelta

def date_range(start, end):
    """Return every calendar day from ``start`` to ``end``, both inclusive.

    Args:
        start: First day, formatted as 'YYYY-MM-DD'.
        end: Last day, formatted as 'YYYY-MM-DD'.

    Returns:
        A list of 'YYYY-MM-DD' strings in ascending order; empty when
        ``end`` falls before ``start``.
    """
    date_format = '%Y-%m-%d'
    first_day = datetime.strptime(start, date_format).date()
    last_day = datetime.strptime(end, date_format).date()
    span_in_days = (last_day - first_day).days
    # +1 so that the end date itself is part of the result.
    return [
        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(span_in_days + 1)
    ]

# One 'YYYY-MM-DD' string per day from 2022-01-01 through 2022-09-01 inclusive;
# get_tweets_over_period uses neighbouring entries as since/until bounds.
data_collection_period = date_range('2022-01-01', '2022-09-01')

In [52]:
stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]

def clean_tweet(tweet):
    """Normalise one tweet into lower-case, space-separated alphanumeric tokens.

    Mentions, hashtags, links and bracketed spans are removed, every other
    non ``[a-z0-9]`` character becomes a separator, and tokens found in the
    module-level ``stopwords`` are dropped. ``stopwords`` is looked up at call
    time, so rebinding that name later changes what gets filtered.

    Args:
        tweet: Raw tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as empty.

    Returns:
        The cleaned text, or ``""`` when nothing is left.
    """
    # isinstance also covers np.float64 NaN; the former `np.float` alias no
    # longer exists in NumPy >= 1.24 and raised AttributeError there.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp)  # drop apostrophes so contractions stay one token ("don't" -> "dont")
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)  # mentions
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)  # hashtags
    temp = re.sub(r'http\S+', '', temp)  # links
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub(r'\[.*?\]', ' ', temp)  # bracketed spans; raw string avoids the invalid "\[" escape
    temp = re.sub("[^a-z0-9]", " ", temp)  # everything else (punctuation, emoji, non-Latin text)
    return " ".join(word for word in temp.split() if word not in stopwords)


def get_most_recent_tweets(city, amount, distance = '20km'):
    """Scrape up to ``amount`` tweets posted near ``city`` and save them as CSV.

    Args:
        city: Place name used in the ``near:"..."`` search operator; also the
            name of the sub-directory under ``data/`` that receives the file.
        amount: Maximum number of tweets to take from the scraper.
        distance: Radius for the ``within:`` search operator.

    Returns:
        A DataFrame with the columns ``date``, ``rawContent`` and
        ``cleaned_content`` (``rawContent`` passed through ``clean_tweet``).
        The same frame is written to
        ``data/<city>/<amount>_most_recent_tweets.csv``.
    """
    scraper = sntwitter.TwitterSearchScraper(f'near:"{city}" within:{distance}')
    tweets = list(itertools.islice(scraper.get_items(), amount))
    if tweets:
        df_city = pd.DataFrame(tweets)[['date', 'rawContent']]
    else:
        # A frame built from no tweets has no columns, so selecting
        # ['date', 'rawContent'] from it would raise KeyError.
        df_city = pd.DataFrame(columns=['date', 'rawContent'])
    # assign() returns a new frame, which avoids writing into a column subset.
    df_city = df_city.assign(
        cleaned_content=[clean_tweet(text) for text in df_city['rawContent']]
    )
    output_dir = f"data/{city}"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    df_city.to_csv(f"{output_dir}/{amount}_most_recent_tweets.csv")
    return df_city


def get_tweets_over_period(city, amount_per_day, data_collection_period, distance='20km'):
    """Scrape up to ``amount_per_day`` tweets near ``city`` for every day of a period.

    One search is issued per pair of consecutive entries in
    ``data_collection_period`` (``since:<day> until:<next day>``), so a list of
    N dates produces N-1 daily searches.

    Args:
        city: Place name used in the ``near:"..."`` search operator; also the
            name of the sub-directory under ``data/`` that receives the file.
        amount_per_day: Maximum number of tweets to take per daily search.
        data_collection_period: Ordered list of 'YYYY-MM-DD' strings.
        distance: Radius for the ``within:`` search operator.

    Returns:
        A DataFrame with the columns ``date`` and ``rawContent``. Each day's
        rows keep their own 0-based index, so index values repeat across days.
        The frame is also written to
        ``data/<city>/<amount_per_day * len(data_collection_period)>_tweets_over_period.csv``.
    """
    daily_frames = []
    # Pairing each date with its successor visits every interval exactly once,
    # including the second one (period[1] -> period[2]).
    for since, until in zip(data_collection_period, data_collection_period[1:]):
        query = f'near:"{city}" within:{distance} since:{since} until:{until} '
        tweets = list(itertools.islice(sntwitter.TwitterSearchScraper(query).get_items(), amount_per_day))
        if not tweets:
            # A frame built from no tweets has no 'date'/'rawContent' columns to select.
            continue
        daily_frames.append(pd.DataFrame(tweets)[['date', 'rawContent']])

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas >= 2.0, and growing a frame inside the loop is quadratic.
    if daily_frames:
        df_city = pd.concat(daily_frames)
    else:
        df_city = pd.DataFrame(columns=['date', 'rawContent'])

    output_dir = f"data/{city}"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    # The file name multiplies by the number of dates (not the number of daily
    # searches); it is kept as is because other cells rebuild the name the same way.
    df_city.to_csv(f"{output_dir}/{amount_per_day*len(data_collection_period)}_tweets_over_period.csv")
    return df_city


In [53]:
%%time
# Scrape both datasets; each call also writes its CSV under data/Singapore/.
# The second call is the cell's last expression, so its DataFrame is what the
# cell displays. The recorded run took roughly 24 minutes of wall time.
get_most_recent_tweets("Singapore", 3000, distance='20km')
get_tweets_over_period("Singapore", 100, data_collection_period)

Unsupported unified_card type on tweet 1501659063037403136


CPU times: user 26.7 s, sys: 2.28 s, total: 29 s
Wall time: 23min 50s


Unnamed: 0,date,rawContent
0,2022-01-01 23:59:27+00:00,洋服3着とサンダルとアイシャドウを年末買って1日に下ろしたら、ぱりっとお正月気分に🎍\nいつぶり？という4連休。ぐーーーっすり寝て、ようやく復活✨
1,2022-01-01 23:59:01+00:00,100%
2,2022-01-01 23:58:48+00:00,Just posted a photo @ Our Tampines Hub https://t.co/QjJN4MJnfB
3,2022-01-01 23:58:29+00:00,Just posted a photo @ Our Tampines Hub https://t.co/QUz8UuK2Sv
4,2022-01-01 23:58:15+00:00,@ShibinMeta Eth okay
...,...,...
95,2022-08-31 22:44:03+00:00,@Mohamed08334779 Masha allah
96,2022-08-31 22:42:29+00:00,NOPE https://t.co/G4YfsotYO0
97,2022-08-31 22:41:02+00:00,@MattLegal13 @bod_republic Haha agak lama sudah 😭
98,2022-08-31 22:40:59+00:00,Hold on tight to the ones you love. Life is fleeting.


## Exploratory Data Analysis

In [54]:
# NOTE(review): this rebinds the module-level name `stopwords` that clean_tweet
# reads at call time, so every clean_tweet call made after this cell filters
# with NLTK's English list instead of the short hand-written list defined next
# to clean_tweet — confirm this is intended.
# Presumably both corpora must already be available locally (nltk.download) — verify.
stopwords = nltk.corpus.stopwords.words('english')
# Set form of NLTK's `words` corpus, used for membership tests.
english_words_set = set(nltk.corpus.words.words())

# # sanity test for some english words
# list_english_words = list(english_words_set )
# print("sustainability" in list_english_words)

In [55]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer; generate_word_frequency lemmatizes each word with it
# before looking the word up in english_words_set.
lemmatizer = WordNetLemmatizer()

In [56]:
import csv

def write_to_csv(file_name, list_words):
    """Write (word, frequency) pairs to ``<file_name>.csv``.

    Args:
        file_name: Output path without the ``.csv`` extension.
        list_words: Iterable of two-item sequences, e.g. the tuples returned
            by ``Counter.most_common()``.

    The file starts with a ``word,frequency`` header row, followed by one row
    per pair in the order given.
    """
    header = ["word", "frequency"]
    rows = (list(pair) for pair in list_words)
    with open(f"{file_name}.csv", 'w', encoding='UTF8', newline='\n') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        csv_out.writerows(rows)

In [87]:
def generate_word_corpus(filename):
    """Collect every word of the cleaned-text column of a tweets CSV.

    The first row is skipped as a header. For each following row that has more
    than three fields, the field at index 3 is split on whitespace and its
    words are added to the result; shorter rows are ignored.

    Args:
        filename: Path to the CSV file.

    Returns:
        A flat list of words in file order. Rows whose field 3 is empty
        contribute nothing.
    """
    corpus = []
    # utf-8 is what DataFrame.to_csv writes by default; newline='' lets the csv
    # module handle line breaks inside quoted tweet text itself.
    with open(filename, encoding='utf-8', newline='') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header; tolerate a completely empty file
        for row in csvreader:
            if len(row) > 3:
                # split() with no separator never yields '' tokens, unlike
                # split(" ") on an empty or multi-space string.
                corpus.extend(row[3].split())
    return corpus


def generate_word_frequency(city, approach, corpus, top_n=100):
    """Count word frequencies in ``corpus`` and write three CSV reports.

    Files written (via ``write_to_csv``), all under ``data/<city>/``:
      * ``<approach>_topMostFrequentWords.csv``  - the ``top_n`` most frequent words
      * ``<approach>_topLeastFrequentWords.csv`` - the ``top_n`` least frequent words
      * ``<approach>_nonEnglishWords.csv``       - every non-numeric word whose
        lemma is not in ``english_words_set``

    Args:
        city: Sub-directory of ``data/`` to write into.
        approach: File-name prefix (may itself contain a sub-directory).
        corpus: Iterable of words.
        top_n: Number of entries in the most/least frequent reports.
    """
    ranked = Counter(corpus).most_common()
    write_to_csv(f"data/{city}/{approach}_topMostFrequentWords", ranked[:top_n])
    # Slice to the very end of the list: a "-1" stop would leave out the last
    # (least frequent) entry. max() keeps top_n == 0 from selecting everything.
    write_to_csv(f"data/{city}/{approach}_topLeastFrequentWords", ranked[max(len(ranked) - top_n, 0):])

    # non-English words
    non_english_words = [
        (word, frequency)
        for word, frequency in ranked
        if not word.isnumeric() and lemmatizer.lemmatize(word) not in english_words_set
    ]
    write_to_csv(f"data/{city}/{approach}_nonEnglishWords", non_english_words)

    
def average_sentence_length(city):
    """Print the mean number of words in field 3 of ``data/<city>.csv``.

    The first row is skipped as a header and rows with three or fewer fields
    are ignored. A row whose field 3 is empty counts as a sentence of zero
    words. When no row qualifies, the average is reported as 0.0 instead of
    dividing by zero.

    Args:
        city: Base name of the CSV file inside ``data/``.
    """
    total_length = 0
    count = 0
    # utf-8 is what DataFrame.to_csv writes by default; newline='' lets the csv
    # module handle line breaks inside quoted tweet text itself.
    with open(f"data/{city}.csv", encoding='utf-8', newline='') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header; tolerate a completely empty file
        for row in csvreader:
            if len(row) > 3:
                # split() with no separator gives [] for an empty string,
                # whereas split(" ") would count it as one word.
                total_length += len(row[3].split())
                count += 1
    average = total_length / count if count else 0.0
    print("Average sentence length:", average)

In [88]:
# generate 3000 most recent tweets & analysis
# NOTE(review): get_most_recent_tweets saves to data/<city>/<amount>_most_recent_tweets.csv,
# but this cell reads from a GenerateMostRecentTweets/ sub-folder — presumably the
# file was moved by hand between the two steps; confirm.
singlish_most_recent_tweets_corpus = generate_word_corpus("data/Singapore/GenerateMostRecentTweets/3000_most_recent_tweets.csv")
generate_word_frequency("Singapore", "GenerateMostRecentTweets/3000_most_recent_tweets", singlish_most_recent_tweets_corpus)

In [83]:
# run once, generate cleaned tweets separately for 2nd approach to reduce wait time
# index_col=0 takes the saved index column back as the index, so the re-saved
# file has the fields index, date, rawContent, cleaned_content — the cleaned
# text ends up in field 3, which is the field generate_word_corpus reads.
# NOTE(review): 24400 is hard-coded here, while the analysis cell builds the
# same number as 100*len(data_collection_period) — keep the two in sync.
df = pd.read_csv('data/Singapore/GenerateTweetsOverAPeriod/24400_tweets_over_period.csv', index_col=0)
df['cleaned_content'] = [clean_tweet(i) for i in df['rawContent']]
df.to_csv(f"data/Singapore/GenerateTweetsOverAPeriod/24400_cleaned_tweets_over_period.csv")

In [89]:
# generate analysis for tweets over period 2022-01-01 and 2022-09-01
# The file name is rebuilt from 100 * len(data_collection_period), so this cell
# needs data_collection_period to be defined and unchanged since the scrape.
singlish_over_period_tweets_corpus = generate_word_corpus(f"data/Singapore/GenerateTweetsOverAPeriod/{100*len(data_collection_period)}_cleaned_tweets_over_period.csv")
generate_word_frequency("Singapore", f"GenerateTweetsOverAPeriod/{100*len(data_collection_period)}_cleaned_tweets_over_period", singlish_over_period_tweets_corpus)