# Data Gathering

## Section 1: Let's collect some tweets!

In [1]:
import snscrape.modules.twitter as TwitterScraper
import pandas as pd

import re
import itertools

from datetime import datetime as dt

In [2]:
# Search configuration: every value a reader may want to tune for the tweet collection.

keyword = "Pertamina"  # search term
since = "2022-09-06"  # start of the search window (YYYY-MM-DD)
until = "2022-09-07"  # end of the search window (YYYY-MM-DD)
lang = "in"  # value for the `lang:` operator: "in" selects Indonesian-language tweets (a language filter, not a region filter)
limit = 1000  # maximum number of tweets taken from the search results

# The query is a space-separated list of search operators.
query = " ".join([f"+{keyword}", f"since:{since}", f"until:{until}", f"lang:{lang}"])

In [3]:
# Run the search, load at most `limit` results into a DataFrame, and report how long it took.

# Microseconds are dropped so the reported duration is a whole number of seconds.
startTime = dt.now().replace(microsecond=0)

print(f"Searching for tweets that fit with the query: '{query}'")
scraper = TwitterScraper.TwitterSearchScraper(query)
tweets = scraper.get_items()
# islice stops pulling items from the scraper after `limit` results.
sliced_tweets = itertools.islice(tweets, limit)

df = pd.DataFrame(sliced_tweets)

endTime = dt.now().replace(microsecond=0)
durationTime = endTime - startTime

# Break the elapsed time down into hours / minutes / seconds for the summary line.
elapsed_seconds = durationTime.total_seconds()
hours, remainder = elapsed_seconds // 3600, elapsed_seconds % 3600
minutes, seconds = remainder // 60, remainder % 60

completion_message = f"Search completed in {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds"
print(completion_message)
print(df.shape)

Searching for tweets that fit with the query: '+Pertamina since:2022-09-06 until:2022-09-07 lang:in'
Search completed in 0 hours, 0 minutes, and 53 seconds
(1000, 29)


In [4]:
# Preview the first five collected tweets (rich display, last expression of the cell).
df.head(n=5)

Unnamed: 0,url,date,rawContent,renderedContent,id,user,replyCount,retweetCount,likeCount,quoteCount,...,inReplyToTweetId,inReplyToUser,mentionedUsers,coordinates,place,hashtags,cashtags,card,viewCount,vibe
0,https://twitter.com/DayangIndah4/status/156730...,2022-09-06 23:58:53+00:00,@elkrova Pak suruh calon\nPresiden\nYang banya...,@elkrova Pak suruh calon\nPresiden\nYang banya...,1567301343479726080,"{'username': 'DayangIndah4', 'id': 13761729276...",1,0,1,0,...,1.5673e+18,"{'username': 'elkrova', 'id': 1537930110, 'dis...","[{'username': 'elkrova', 'id': 1537930110, 'di...",,,,,,,
1,https://twitter.com/HMI_CabangJogja/status/156...,2022-09-06 23:58:34+00:00,Jalan terakhir telah ditetapkan. Jokowi menggu...,Jalan terakhir telah ditetapkan. Jokowi menggu...,1567301260445097985,"{'username': 'HMI_CabangJogja', 'id': 14961629...",1,0,0,0,...,1.567301e+18,"{'username': 'HMI_CabangJogja', 'id': 14961629...",,,,,,,,
2,https://twitter.com/NoorTendy/status/156730096...,2022-09-06 23:57:24+00:00,@msaid_didu @pertamina @IndonesiaGaruda Instin...,@msaid_didu @pertamina @IndonesiaGaruda Instin...,1567300967359729665,"{'username': 'NoorTendy', 'id': 13914930363795...",0,0,0,0,...,1.566946e+18,"{'username': 'msaid_didu', 'id': 1117990249806...","[{'username': 'msaid_didu', 'id': 111799024980...",,,,,,,
3,https://twitter.com/C4h_J4ncuk/status/15673006...,2022-09-06 23:56:17+00:00,Kelakuan Jokontol (Joko berotak kont0l)... \n\...,Kelakuan Jokontol (Joko berotak kont0l)... \n\...,1567300688677605376,"{'username': 'C4h_J4ncuk', 'id': 1355178153782...",0,0,1,0,...,,,,,,,,,,
4,https://twitter.com/denisetiawan139/status/156...,2022-09-06 23:54:38+00:00,@_ekokuntadhi Gw heran pertamina bisa rugi.pad...,@_ekokuntadhi Gw heran pertamina bisa rugi.pad...,1567300271738597376,"{'username': 'denisetiawan139', 'id': 31904177...",0,0,0,0,...,1.567005e+18,"{'username': '_ekokuntadhi', 'id': 198412976, ...","[{'username': '_ekokuntadhi', 'id': 198412976,...","{'longitude': 106.707382, 'latitude': -6.626402}","{'id': 'b5b74007b9768111', 'fullName': 'Dramag...",,,,,


In [11]:
def removeNewLineAndQuotes(tweet):
	"""Replace line breaks, backslashes and quote characters with single spaces.

	Every run of one or more of the following characters is collapsed into
	exactly one space: carriage return, line feed, backslash, double quote,
	single quote and backtick.

	Args:
		tweet: The text to clean. Values that are not ``str`` (for example
			``None`` or ``NaN`` coming from a DataFrame column with missing
			entries) are returned unchanged instead of raising ``TypeError``.

	Returns:
		The cleaned string, or the original value when it is not a string.
	"""
	# Missing values in a pandas column arrive as None / float NaN; pass them through.
	if not isinstance(tweet, str):
		return tweet
	# `\r` is included so Windows (CRLF) and bare-CR line breaks are removed as well.
	return re.sub(r'[\r\n\\"\'`]+', ' ', tweet)

# Clean the tweet text columns, then write the dataset to disk.

# The cleaned raw text is stored under `content` and `rawContent` is dropped.
# The membership check keeps this cell re-runnable: on a second run `rawContent`
# is already gone, and the `content` column created earlier is left as it is.
if 'rawContent' in df.columns:
	df['content'] = df['rawContent'].apply(removeNewLineAndQuotes)
	del df['rawContent']

# Cleaning an already-cleaned column changes nothing, so this line is safe to repeat.
df['renderedContent'] = df['renderedContent'].apply(removeNewLineAndQuotes)

# A backtick is the field separator; removeNewLineAndQuotes strips backticks
# from both text columns cleaned above.
df.to_csv("../dataset/raw/raw_tweets_test.csv", header=True, index=False, sep="`")

## Section 2: Collect Lexicon Dataset

In [None]:
# Lexicon sources:
#   - negative / positive word lists: https://github.com/fajri91/InSet
#   - swear-word list: https://github.com/onlyphantom/elangdev
# All three files are read as tab-separated text straight from their raw GitHub URLs.

indoset_lexicon_neg_url = "https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv"
indoset_lexicon_neg = pd.read_csv(indoset_lexicon_neg_url, sep="\t")

indoset_lexicon_pos_url = "https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv"
indoset_lexicon_pos = pd.read_csv(indoset_lexicon_pos_url, sep="\t")

# header=None makes pandas treat the first line of the file as data, not as column names.
vulgarity_lexicon_url = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/swear-words.txt"
vulgarity_lexicon = pd.read_csv(vulgarity_lexicon_url, sep="\t", header=None)

In [None]:
# Report the size of each lexicon, then store local copies as tab-separated files.

print("Total lexicon neg:", len(indoset_lexicon_neg))
print("Total lexicon pos:", len(indoset_lexicon_pos))
print("Total lexicon vulgarity:", len(vulgarity_lexicon))

indoset_lexicon_neg.to_csv("../dataset/wordlist/indoset_lexicon_neg.csv", header=True, index=False, sep="\t")
indoset_lexicon_pos.to_csv("../dataset/wordlist/indoset_lexicon_pos.csv", header=True, index=False, sep="\t")
# The vulgarity list was loaded with header=None, so it is written back without a header row.
# `header=False` is the documented way to request that (the parameter takes a bool or a list of str).
vulgarity_lexicon.to_csv("../dataset/wordlist/vulgarity_lexicon.csv", header=False, index=False, sep="\t")

Total lexicon neg: 6609
Total lexicon pos: 3609
Total lexicon vulgarity: 62
