# Data Gathering

## Section 1: Let's collect some tweets!

In [9]:
import snscrape.modules.twitter as TwitterScraper
import pandas as pd

import re
import itertools

from datetime import datetime as dt

In [10]:
# Search parameters for the tweet scrape.

keyword = "Pertamina"   # term the tweets must mention
since = "2022-09-03"    # value for the `since:` search operator
until = "2022-09-05"    # value for the `until:` search operator
lang = "in"  # value for the `lang:` operator -- a language filter, not a region; presumably "in" = Indonesian (TODO confirm)
limit = 15000           # upper bound on how many tweets are taken from the result stream

# Assemble the search string from its space-separated operators.
query = " ".join(
    [
        f"+{keyword}",
        f"since:{since}",
        f"until:{until}",
        f"lang:{lang}",
    ]
)

In [11]:
# Get Tweets
#
# Runs the search described by `query`, keeps at most `limit` results and
# loads them into `df`, then reports how long the scrape took.

startTime = dt.now().replace(microsecond=0)

print(f"Searching for tweets that fit with the query: '{query}'")
tweets = TwitterScraper.TwitterSearchScraper(query).get_items()
sliced_tweets = itertools.islice(tweets, limit)

# Drain the result stream by hand instead of handing it straight to
# pd.DataFrame(...): if the scraper raises (or the cell is interrupted)
# part-way through, everything fetched so far would otherwise be thrown away
# and `df` would stay undefined -- or silently keep the value from an earlier
# run. The exception is NOT swallowed; it still propagates after `df` is built.
collected_tweets = []
try:
    for tweet in sliced_tweets:
        collected_tweets.append(tweet)
finally:
    df = pd.DataFrame(collected_tweets)

endTime = dt.now().replace(microsecond=0)
durationTime = endTime - startTime

# Break the elapsed time into hours / minutes / seconds for the summary line.
hours, remainder = divmod(durationTime.total_seconds(), 3600)
minutes, seconds = divmod(remainder, 60)

print(f"Search completed in {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds")
print(df.shape)

Searching for tweets that fit with the query: '+Pertamina since:2022-09-03 until:2022-09-05 lang:in'


Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%2BPertamina+since%3A2022-09-03+until%3A2022-09-05+lang%3Ain&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&in

ScraperException: 4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%2BPertamina+since%3A2022-09-03+until%3A2022-09-05+lang%3Ain&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.

In [None]:
# Preview the first rows of the scraped frame before any cleaning is applied.
df.head()

def removeNewLineAndQuotes(tweet):
    """Replace line breaks and quoting characters in a tweet with spaces.

    Every run of one or more of the following characters is collapsed into a
    single space: carriage return, newline, backslash, double quote, single
    quote and backtick. The backtick is included because it is the field
    separator used when the tweets are written to ``raw_tweets.csv``.

    Args:
        tweet: The tweet text (``str``).

    Returns:
        The text with each run of the characters above replaced by one space.
    """
    # `\r` is listed alongside `\n` so that Windows-style "\r\n" line breaks
    # do not leave a stray carriage return behind in the cleaned text.
    tweet = re.sub(r'[\r\n\\"\'\`]+', ' ', tweet)
    return tweet

# Clean both text columns with the same helper so that line breaks,
# backslashes, quotes and backticks cannot corrupt the exported file.
for text_column in ('content', 'renderedContent'):
    df[text_column] = df[text_column].apply(removeNewLineAndQuotes)
df.head()

# The export uses a backtick as the field separator.
df.to_csv("../dataset/raw_tweets.csv", header=True, index=False, sep="`")

## Section 2: Collect Lexicon Dataset

In [None]:
# Lexicon sources:
#   - negative / positive sentiment word lists: https://github.com/fajri91/InSet
#   - swear-word list: the onlyphantom/elangdev repository
# All three files are parsed as tab-separated text (the default delimiter of
# pd.read_table).

indoset_lexicon_neg_url = "https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv"
indoset_lexicon_neg = pd.read_table(indoset_lexicon_neg_url)

indoset_lexicon_pos_url = "https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv"
indoset_lexicon_pos = pd.read_table(indoset_lexicon_pos_url)

# header=None: the first line of the swear-word file is read as data, not as
# column names.
vulgarity_lexicon_url = "https://raw.githubusercontent.com/onlyphantom/elangdev/master/elang/word2vec/utils/swear-words.txt"
vulgarity_lexicon = pd.read_table(vulgarity_lexicon_url, header=None)

In [None]:
# Report the number of rows loaded for each lexicon.
for label, lexicon in (
    ("Total lexicon neg:", indoset_lexicon_neg),
    ("Total lexicon pos:", indoset_lexicon_pos),
    ("Total lexicon vulgarity:", vulgarity_lexicon),
):
    print(label, len(lexicon))

# Write local copies. Every file is tab-separated and written without the
# index; only the two InSet lexicons get a header row.
shared_csv_options = {"index": False, "sep": "\t"}
indoset_lexicon_neg.to_csv("../dataset/wordlist/indoset_lexicon_neg.csv", header=True, **shared_csv_options)
indoset_lexicon_pos.to_csv("../dataset/wordlist/indoset_lexicon_pos.csv", header=True, **shared_csv_options)
vulgarity_lexicon.to_csv("../dataset/wordlist/vulgarity_lexicon.csv", header=None, **shared_csv_options)

Total lexicon neg: 6609
Total lexicon pos: 3609
Total lexicon vulgarity: 62
