# Data Gathering

## Section 1: Let's collect some tweets!

In [2]:
import snscrape.modules.twitter as TwitterScraper
import pandas as pd

import re
import itertools

from datetime import datetime as dt

In [9]:
# Search parameters for the scrape; `query` below is assembled from them.

limit = 15_000  # upper bound on how many tweets are taken from the results
keyword = "Pertamina"
lang = "in"  # `lang:` filters on tweet language, not region; "in" is the code Twitter uses for Indonesian
since, until = "2022-09-03", "2022-09-05"  # date window placed in the since:/until: operators

# Twitter search string: "+<keyword>" followed by the date-window and language operators.
query = f"+{keyword} since:{since} until:{until} lang:{lang}"

In [10]:
# Get Tweets

# Whole-second timestamps keep the reported duration free of fractional seconds.
search_began = dt.now().replace(microsecond=0)

print(f"Searching for tweets that fit with the query: '{query}'")

# islice caps how many items are taken from the scraper's results at `limit`;
# building the DataFrame is what actually consumes them.
scraper = TwitterScraper.TwitterSearchScraper(query)
df = pd.DataFrame(itertools.islice(scraper.get_items(), limit))

search_ended = dt.now().replace(microsecond=0)
elapsed_total = (search_ended - search_began).total_seconds()

# Break the elapsed seconds into h/m/s: peel off the seconds first, then the minutes.
minutes, seconds = divmod(elapsed_total, 60)
hours, minutes = divmod(minutes, 60)

print(f"Search completed in {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds")
print(df.shape)

Searching for tweets that fit with the query: '+Pertamina since:2022-09-03 until:2022-09-05 lang:in'
Search completed in 0 hours, 7 minutes, and 10 seconds
(7502, 27)


In [11]:
# Preview the first rows of the scraped frame before any text cleaning is applied.
df.head()

def removeNewLineAndQuotes(tweet):
    """Replace line breaks, backslashes, quotes and backticks with spaces.

    Every run of consecutive line feeds, carriage returns, backslashes,
    double quotes, single quotes or backticks collapses into one space.
    Carriage returns are included so that a Windows-style line break does
    not leave a stray carriage return behind in the cleaned text.

    Args:
        tweet: Tweet text as a ``str``.

    Returns:
        The cleaned text; all other characters are left untouched.
    """
    return re.sub(r'[\r\n\\"\'`]+', ' ', tweet)

# Run the same cleaning helper over both tweet-text columns.
for text_column in ('content', 'renderedContent'):
    df[text_column] = df[text_column].apply(removeNewLineAndQuotes)
df.head()

# Backtick is the field separator; removeNewLineAndQuotes has already replaced
# backticks in the two columns cleaned above.
# NOTE(review): `query` contains spaces and colons, so the file name does too;
# colons are not valid in Windows file names -- confirm the target platform.
raw_csv_path = f"../dataset/raw.{query}.csv"
df.to_csv(raw_csv_path, sep="`", index=False, header=True)

## Section 2: Collect Lexicon Dataset

In [18]:
# The two sentiment lexicons come from InSet (https://github.com/fajri91/InSet);
# the swear-word list comes from https://github.com/onlyphantom/elangdev.

# InSet negative lexicon: tab-separated, column names inferred by read_csv.
indoset_lexicon_neg_url = "https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv"
indoset_lexicon_neg = pd.read_csv(indoset_lexicon_neg_url, sep="\t")

# InSet positive lexicon: read the same way.
indoset_lexicon_pos_url = "https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv"
indoset_lexicon_pos = pd.read_csv(indoset_lexicon_pos_url, sep="\t")

# Swear-word list: read with header=None so the first line is kept as data.
vulgarity_lexicon_url = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/swear-words.txt"
vulgarity_lexicon = pd.read_csv(vulgarity_lexicon_url, header=None, sep="\t")

In [19]:
# Report how many rows each lexicon holds. Plain strings: there is nothing to
# interpolate, so the f-prefix was redundant.
print("Total lexicon neg:", len(indoset_lexicon_neg))
print("Total lexicon pos:", len(indoset_lexicon_pos))
print("Total lexicon vulgarity:", len(vulgarity_lexicon))

# Persist local tab-separated copies. The two InSet frames keep their header row.
indoset_lexicon_neg.to_csv("../dataset/wordlist/indoset_lexicon_neg.csv", header=True, index=False, sep="\t")
indoset_lexicon_pos.to_csv("../dataset/wordlist/indoset_lexicon_pos.csv", header=True, index=False, sep="\t")
# The vulgarity list was read with header=None, so it has no real column names
# and is written without a header row. to_csv documents `header` as a bool or
# a list of str, hence the explicit False instead of None.
vulgarity_lexicon.to_csv("../dataset/wordlist/vulgarity_lexicon.csv", header=False, index=False, sep="\t")

Total lexicon neg: 6609
Total lexicon pos: 3609
Total lexicon vulgarity: 62
