In [1]:
import pandas as pd

# Load the raw tweet export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a configurable data directory.
csv_path = '/Users/ferdianadham/Downloads/STKI/data.csv'
data = pd.read_csv(csv_path)

In [2]:
# Keep only the two columns this analysis uses. The right-hand side is built
# from the current `data` before the name is rebound to the resulting dict.
data = {column: data[column] for column in ('time', 'tweet')}

In [3]:
# Assemble the working DataFrame from the selected 'time' and 'tweet' columns.
df = pd.DataFrame(data)

In [4]:
import re
def remove_links(text):
    """Strip URLs from a tweet.

    Removes ``http``/``https`` URLs (with or without the colon after the
    scheme), ``www.`` addresses and ``pic.twitter.com/`` media links. Each
    match runs up to the next whitespace character and is replaced by the
    empty string, so the surrounding spaces are left in place.

    Args:
        text: The tweet text to clean.

    Returns:
        ``text`` with every matched link removed.
    """
    # The dots are escaped so they only match a literal '.'. Unescaped, 'www.'
    # also matched inside ordinary words (e.g. "awwwwesome") and deleted them.
    url_pattern = re.compile(
        r'https:?//\S+|www\.\S+|http:?//\S+|pic\.twitter\.com/\S+'
    )
    return url_pattern.sub('', text)

In [5]:
# Strip links from every tweet, overwriting the 'tweet' column in place.
df['tweet'] = df['tweet'].apply(remove_links)

In [6]:
def remove_tag_and_mention(text):
    """Delete hashtags and @-mentions from ``text``.

    Any run of one or more non-whitespace characters preceded by ``#`` or
    ``@`` is replaced (together with that leading symbol) by the empty
    string; surrounding whitespace is left untouched.
    """
    return re.compile(r'#\S+|@\S+').sub('', text)

In [7]:
# Drop hashtags and @-mentions from every tweet, overwriting the 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_tag_and_mention)

In [8]:
def remove_symbol(text):
    """Replace every character that is not an ASCII letter or digit with a space."""
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', text)

In [9]:
# Replace punctuation and other non-alphanumeric characters with spaces in every tweet.
df['tweet'] = df['tweet'].apply(remove_symbol)

In [10]:
# Lower-case every tweet; values that are not strings pass through unchanged.
df['tweet'] = df['tweet'].map(
    lambda value: value if not isinstance(value, str) else value.lower()
)

In [11]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stop-word remover and stemmer from their factories.
stopword_factory = StopWordRemoverFactory()
stemmer_factory = StemmerFactory()

stopword = stopword_factory.create_stop_word_remover()
stemmer = stemmer_factory.create_stemmer()

# Each tweet is already a single string, so it is handed to the remover
# directly; a ''.join(tweet) round-trip would only rebuild the same string.
df['tweet'] = [stopword.remove(tweet) for tweet in df['tweet']]
# Reduce the remaining words of every tweet to their stems.
df['tweet'] = [stemmer.stem(tweet) for tweet in df['tweet']]

In [12]:
# Parse the 'HH:MM:SS' strings and keep only the time-of-day component
# (datetime.time objects), dropping the date part of the parsed timestamps.
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.time


In [13]:
from datetime import time

# Inclusive upper bound of each six-hour slice of the day.
first_limit, second_limit, third_limit, fourth_limit = (
    time(6, 0, 0),
    time(12, 0, 0),
    time(18, 0, 0),
    time(23, 59, 59),
)

# Split the tweets by time of day. Every slice excludes its lower bound and
# includes its upper bound, except the first, which also includes midnight.
# Each slice is copied so later edits do not touch `df`.
tweet_time = df['time']
quarter_day_1 = df.loc[(tweet_time >= time(0, 0, 0)) & (tweet_time <= first_limit)].copy()
quarter_day_2 = df.loc[(tweet_time > first_limit) & (tweet_time <= second_limit)].copy()
quarter_day_3 = df.loc[(tweet_time > second_limit) & (tweet_time <= third_limit)].copy()
quarter_day_4 = df.loc[(tweet_time > third_limit) & (tweet_time <= fourth_limit)].copy()

In [15]:
# Topic analysis for the second quarter of the day (after 06:00 up to 12:00), 22 September 2019
from sklearn.feature_extraction.text import TfidfVectorizer

# Cleaned tweets of the second quarter as a plain list of strings.
second_quarter_day = quarter_day_2['tweet'].tolist()
# NOTE(review): the matrix from this vectorizer is later fed to LDA, which
# models term counts; scikit-learn's topic-extraction example pairs LDA with
# CountVectorizer. Confirm that tf-idf weighting is intended here.
vectorizer = TfidfVectorizer()


In [16]:
# Learn the vocabulary and IDF weights from the second-quarter tweets and
# build their document-term matrix in one call.
second_quarter_day_vect = vectorizer.fit_transform(second_quarter_day)
# Vocabulary terms, used further down to label the words of each topic.
feature_names = vectorizer.get_feature_names_out()

In [31]:
from sklearn.decomposition import LatentDirichletAllocation

# Five-topic LDA for the second quarter. The seed is fixed so repeated runs
# give the same topics; perplexity is evaluated every 3rd iteration and all
# available cores are used.
lda_2 = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    learning_method='batch',
    max_iter=20,
    evaluate_every=3,
    n_jobs=-1,
)

lda_2.fit(second_quarter_day_vect)

In [32]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a "Topik #k:" header is printed
    (k counted from 1), followed by one line with the ``no_top_words`` terms
    of largest weight, heaviest first, separated by single spaces. After the
    last topic a newline string is printed once.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to list per topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        print(f"Topik #{number}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[column] for column in heaviest_first))
    print("\n")

In [33]:
# The header reads the topic count from the model itself, so it stays correct
# if the model above is rebuilt with a different n_components.
print(f"Topik yang Ditemukan ({lda_2.n_components} Topik):")
# List the 5 highest-weighted terms of every second-quarter topic.
display_topics(lda_2, feature_names, 5)

Topik yang Ditemukan (5 Topik):
Topik #1:
kakak cinta bukan bantu sekali
Topik #2:
panggil rasa rakyat segera bangkang
Topik #3:
done yxg kuy kelas soon
Topik #4:
besok percaya senin inih yogyakarta
Topik #5:
gejayan jadi 1998 2019 off




In [34]:
# Topic analysis for the third quarter of the day (after 12:00 up to 18:00), 22 September 2019
# Cleaned tweets of the third quarter as a plain list of strings.
third_quarter_day = quarter_day_3['tweet'].tolist()

# Separate vectorizer so this slice gets its own vocabulary and IDF weights.
vectorizer3 = TfidfVectorizer()

In [35]:
# Learn the vocabulary and IDF weights from the third-quarter tweets and
# build their document-term matrix in one call.
third_quarter_day_vect = vectorizer3.fit_transform(third_quarter_day)
# Vocabulary terms, used further down to label the words of each topic.
feature_names3 = vectorizer3.get_feature_names_out()

In [38]:
# Five-topic LDA for the third quarter. The seed is fixed so repeated runs
# give the same topics; perplexity is evaluated every 3rd iteration and all
# available cores are used.
lda_3 = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    learning_method='batch',
    max_iter=20,
    evaluate_every=3,
    n_jobs=-1,
)

lda_3.fit(third_quarter_day_vect)

In [39]:
# The header reads the topic count from the model itself, so it stays correct
# if the model above is rebuilt with a different n_components.
print(f"Topik yang Ditemukan ({lda_3.n_components} Topik):")
# List the 5 highest-weighted terms of every third-quarter topic.
display_topics(lda_3, feature_names3, 5)

Topik yang Ditemukan (5 Topik):
Topik #1:
juang lawan panjang umur datang
Topik #2:
mahasiswa rakyat bukan besok suara
Topik #3:
baik indonesia sedang gejayan ikut
Topik #4:
jogja apa ikut jalan ga
Topik #5:
besok gejayan kelas gerak panggil




In [40]:
# Topic analysis for the fourth quarter of the day (after 18:00 up to 23:59:59), 22 September 2019
# Cleaned tweets of the fourth quarter as a plain list of strings.
fourth_quarter_day = quarter_day_4['tweet'].tolist()

# Separate vectorizer so this slice gets its own vocabulary and IDF weights.
vectorizer4 = TfidfVectorizer()

In [41]:
# Learn the vocabulary and IDF weights from the fourth-quarter tweets and
# build their document-term matrix in one call.
fourth_quarter_day_vect = vectorizer4.fit_transform(fourth_quarter_day)
# Vocabulary terms, used further down to label the words of each topic.
feature_names4 = vectorizer4.get_feature_names_out()

In [42]:
# Five-topic LDA for the fourth quarter. The seed is fixed so repeated runs
# give the same topics; perplexity is evaluated every 3rd iteration and all
# available cores are used.
lda_4 = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    learning_method='batch',
    max_iter=20,
    evaluate_every=3,
    n_jobs=-1,
)

lda_4.fit(fourth_quarter_day_vect)

In [43]:
# The header reads the topic count from the model itself, so it stays correct
# if the model above is rebuilt with a different n_components.
print(f"Topik yang Ditemukan ({lda_4.n_components} Topik):")
# List the 5 highest-weighted terms of every fourth-quarter topic.
display_topics(lda_4, feature_names4, 5)

Topik yang Ditemukan (5 Topik):
Topik #1:
panggil gejayan dikit hadir besok
Topik #2:
gas nih apa wkwk ken
Topik #3:
besok ikut yg aksi semangat
Topik #4:
siap panjang umur juang oke
Topik #5:
besok ayo gejayan mahasiswa dosen


