<a href="https://colab.research.google.com/github/naufalfano/topicmodelling_LDA/blob/main/topicmodelling_lda_itsfess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleansing & Pre-Processing

In [None]:
# Upload a local file into Google Colab
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the uploaded Excel file into a DataFrame
import pandas as pd
df = pd.read_excel('twit_fulltext.xlsx')

# Preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Convert to lowercase**

In [None]:
# Convert every value in the 'full_text' column to lowercase
df['full_text'] = df['full_text'].str.lower()

# Preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Clear URL**

In [None]:
import re

# Regex that identifies http:// and https:// URLs
pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Compiled once under a dedicated name. The generic global `pattern` can be
# rebound by other cells, and remove_urls must keep matching URLs if it is
# called again afterwards.
_URL_RE = re.compile(pattern)


def remove_urls(text):
    """Return ``text`` with every http(s) URL removed.

    Non-string values (for example NaN coming from an empty cell) are
    returned unchanged instead of raising ``TypeError`` inside the regex call.
    """
    if not isinstance(text, str):
        return text
    return _URL_RE.sub('', text)

# Apply the function to the 'full_text' column
column_name = 'full_text'
df[column_name] = df[column_name].apply(remove_urls)


**Clear its/**

In [None]:
# Remove the "its/" marker from the text.
# NOTE(review): str.replace removes every occurrence of 'its/', not only one at the start of the text.
df['full_text'] = df['full_text'].str.replace('its/', '')

# Test: preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Keep only alphabetic characters**

In [None]:
import re
import pandas as pd

# Regex matching runs of characters that are neither ASCII letters nor whitespace.
# It is stored under its own name so that the URL regex kept in the global
# `pattern` by the URL-cleaning cell is not overwritten.
NON_ALPHABET_RE = re.compile(r'[^a-zA-Z\s]+')


def remove_non_alphabet(text):
    """Return ``text`` with every non-letter, non-whitespace character removed.

    Digits, punctuation and non-ASCII characters are deleted; whitespace is
    left untouched. ``text`` must be a string.
    """
    return NON_ALPHABET_RE.sub('', text)

# Convert non-string values to strings so the regex function can process them.
# NOTE(review): str() turns a missing value (float NaN) into the literal text 'nan'.
df['full_text'] = df['full_text'].apply(lambda x: str(x) if not isinstance(x, str) else x)

# Apply the function to the 'full_text' column
df['full_text'] = df['full_text'].apply(remove_non_alphabet)

# Test: preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Remove Punctuation**

In [None]:
import string

# Strip punctuation using a str.translate deletion table
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` deleted."""
    # Mapping a character to None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Apply the function to the 'full_text' column
df['full_text'] = df['full_text'].apply(remove_punctuation)

# Test: preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)


**Remove Leading Space & Extra Whitespace**

In [None]:
# Strip leading whitespace from every value in 'full_text'
df['full_text'] = df['full_text'].str.lstrip()

# Whitespace normaliser for a single text value
def remove_extra_spaces(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Apply the whitespace normaliser to the 'full_text' column
df['full_text'] = df['full_text'].apply(remove_extra_spaces)

# Test: preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Remove single-word entries**

In [None]:
# Drop sentences that consist of a single word (or no word at all)
def remove_single_word(text):
    """Return ``text`` keeping only the '.'-separated sentences with more than one word.

    The kept sentences are stripped and re-joined with '. '. A text whose
    sentences all have at most one word becomes an empty string.
    """
    kept = []
    for sentence in text.split('.'):
        if len(sentence.split()) <= 1:
            continue
        kept.append(sentence.strip())
    return '. '.join(kept)

# Apply the single-word filter to the 'full_text' column
df['full_text'] = df['full_text'].apply(remove_single_word)

# Test: preview the first 10 rows
first_10_rows = df.head(10)
print(first_10_rows)

**Remove additional stopwords**

In [17]:
# Additional, corpus-specific stopwords to remove
ADDITIONAL_STOPWORDS = (
    "arek", "rek", "mas", "mba", "mbak", "rt", "ga", "g", "ada", "yg", "yang",
    "dong", "sby", "surabaya", "titipan", "cowo", "cewe", "ya", "y", "km",
    "rekk", "apa", "gasi",
)

# Built once instead of on every call. Each word is escaped so a future entry
# containing regex metacharacters is still matched literally, and the \b
# anchors restrict matches to whole words.
_ADDITIONAL_STOPWORDS_RE = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, ADDITIONAL_STOPWORDS)) + r')\b'
)


def remove_additional_stopwords(text):
    """Return ``text`` without the words listed in ``ADDITIONAL_STOPWORDS``.

    Only whole words are removed ("mas" is removed, "masuk" is kept). The
    gaps left by removed words are collapsed, so the result has single spaces
    between words and no leading or trailing whitespace.
    """
    without_stopwords = _ADDITIONAL_STOPWORDS_RE.sub('', text)
    # Removing a word leaves its surrounding spaces behind; re-normalise them.
    return ' '.join(without_stopwords.split())

# Apply the additional-stopword filter to the 'full_text' column
df['full_text'] = df['full_text'].apply(remove_additional_stopwords)

**Remove Stopwords & Tokenize**

In [None]:
import nltk

# Tokenizer data for nltk.word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of 'punkt', so both are downloaded to keep the
# notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

import pandas as pd
from nltk.corpus import stopwords

# Stopword lists (the Indonesian list is read by the tokenising function)
nltk.download('stopwords')

# Loaded once: building the stopword set inside the function repeated the
# same corpus lookup for every row of the DataFrame.
INDONESIAN_STOP_WORDS = frozenset(stopwords.words('indonesian'))


def tokenize_and_remove_stopwords(text):
    """Tokenise ``text`` with NLTK and drop Indonesian stopwords.

    Returns the list of remaining tokens, in their original order.
    """
    return [
        word
        for word in nltk.word_tokenize(text)
        if word not in INDONESIAN_STOP_WORDS
    ]

# NOTE(review): this variable is not read anywhere in the visible notebook — confirm it can be removed
tokenized_text = 'full_text'
# Apply the function to the 'full_text' column and store the token lists in 'tokenized_text'
df['tokenized_text'] = df['full_text'].apply(tokenize_and_remove_stopwords)

print(df[['full_text', 'tokenized_text']])

**Remove Null Value in Column**

In [None]:
# Columns that must contain usable text
columns_to_check = ['full_text']

# The earlier cleaning cells turn missing values into strings and reduce
# one-word texts to '', so a plain dropna() finds nothing to drop. Treat blank
# or whitespace-only strings as missing too.
is_missing = df[columns_to_check].apply(
    lambda col: col.isna() | col.astype(str).str.strip().eq('')
)

# Drop a row when any checked column is missing (same rule as dropna(subset=...))
df = df.loc[~is_missing.any(axis=1)].copy()

**Export**

In [None]:
# Export the cleansing result (.csv)
output_file = 'twit_cleaned.csv'

# Save the DataFrame as CSV, without the index column
df.to_csv(output_file, index=False)

print(f'DataFrame has been saved to {output_file}')

# Latent Dirichlet Allocation Model

**Import File**

In [None]:
# Upload a local file into Google Colab
from google.colab import files
uploaded = files.upload()

Saving twit_cleaned.csv to twit_cleaned (1).csv


In [None]:
# Read the cleaned CSV file
import ast

import pandas as pd
df = pd.read_csv('twit_cleaned.csv')

text = df['full_text']

# Make sure there are no missing values
df['full_text'] = df['full_text'].fillna('')


def _parse_tokens(cell):
    """Turn a CSV cell holding the text form of a token list back into a list of str.

    Missing cells give an empty list; a cell that is not a list literal is
    split on whitespace instead.
    """
    if not isinstance(cell, str):
        return []
    try:
        tokens = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell.split()
    if isinstance(tokens, (list, tuple)):
        return [str(token) for token in tokens]
    return cell.split()


# Build the list of tokenised documents.
# Prefer the 'tokenized_text' column: it holds the stopword-filtered tokens
# produced during pre-processing, which splitting 'full_text' again would discard.
if 'tokenized_text' in df.columns:
    text_list = [_parse_tokens(cell) for cell in df['tokenized_text']]
else:
    text_list = [document.split() for document in df['full_text']]


first_10_rows = df.head(10)
print(first_10_rows)

**Creating Bigram & Trigram Model**

In [None]:
import gensim

# Build the bigram & trigram models
from gensim.models import Phrases

# min_count=10: words and bigrams seen fewer than 10 times are ignored
bigram = Phrases(text_list, min_count=10)
# The trigram model is trained on documents whose bigrams are already merged
trigram = Phrases(bigram[text_list])

# Append detected phrases (tokens joined by "_") to each document.
# NOTE: this extends text_list in place, so re-running the cell appends the
# phrases again; rebuild text_list first when re-running.
for document in text_list:
    # Compute both transforms before the document is modified.
    bigram_document = bigram[document]
    # The trigram model must see the bigram-merged tokens it was trained on,
    # not the raw tokens.
    trigram_document = trigram[bigram_document]

    bigram_phrases = [token for token in bigram_document if '_' in token]
    already_added = set(bigram_phrases)
    # Unmerged bigrams also appear in the trigram output; skip them so each
    # bigram is added only once.
    trigram_phrases = [
        token for token in trigram_document
        if '_' in token and token not in already_added
    ]

    document.extend(bigram_phrases)
    document.extend(trigram_phrases)

**Create Dictionary**

In [None]:
from gensim import corpora, models

# Build the dictionary from the tokenised documents and prune rare / very common tokens
dictionary = corpora.Dictionary(text_list)
dictionary.filter_extremes(no_below=5, no_above=0.2)

# no_below (int, optional) – keep tokens that appear in at least `no_below` documents.
# no_above (float, optional) – keep tokens that appear in no more than `no_above` of the documents (a fraction of the total corpus size, not an absolute number).
print(dictionary)

**Create Corpus**

In [None]:
#https://radimrehurek.com/gensim/tut1.html
# Build corpus
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_list]

# The function doc2bow converts document (a list of words) into the bag-of-words format
'''The function doc2bow() simply counts the number of occurrences of each distinct word,
converts the word to its integer word id and returns the result as a sparse vector.
The sparse vector [(0, 1), (1, 1)] therefore reads: in the document “Human computer interaction”,
the words computer (id 0) and human (id 1) appear once;
the other ten dictionary words appear (implicitly) zero times.'''

print(len(doc_term_matrix))
# Show one sample document. Index 100 is used when it exists; a smaller corpus
# falls back to its last document instead of raising IndexError.
if doc_term_matrix:
    sample_index = min(100, len(doc_term_matrix) - 1)
    print(doc_term_matrix[sample_index])
tfidf = models.TfidfModel(doc_term_matrix) #build TF-IDF model
corpus_tfidf = tfidf[doc_term_matrix]

**Coherence Analysis**

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array

# Function that computes the coherence value for a range of topic counts
def compute_coherence_values(dictionary, corpus, texts, limit, start, step,
                             iterations=100, random_state=42):
    """Train one LDA model per topic count and score each with c_v coherence.

    Parameters:
        dictionary: gensim dictionary passed as ``id2word`` to ``LdaModel``.
        corpus: corpus the LDA models are trained on.
        texts: tokenised documents used by ``CoherenceModel``.
        limit, start, step: topic counts are ``range(start, limit, step)``
            (``limit`` is exclusive).
        iterations: value forwarded to ``LdaModel(iterations=...)``.
        random_state: seed forwarded to ``LdaModel`` so repeated runs train
            the same models; pass ``None`` for unseeded training.

    Returns:
        ``(model_list, coherence_values)``, two lists aligned with the topic
        counts, holding the trained models and their coherence scores.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            iterations=iterations,
            random_state=random_state,
        )
        model_list.append(model)

        coherence_model = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values

# Topic counts to evaluate: range(start, limit, step), i.e. 1..20
start=1
limit=21
step=1
model_list, coherence_values = compute_coherence_values(dictionary, corpus=corpus_tfidf,
                                                        texts=text_list, start=start, limit=limit, step=step)
# Plot the coherence score against the number of topics
import matplotlib.pyplot as plt
x = range(start, limit, step)
# The label is attached to the line itself. Passing the bare string
# ("coherence_values") to legend() made matplotlib treat each character as a
# separate label, so the legend showed only "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

# Print the coherence value for every topic count
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 6))

**Build Topic Model**

In [None]:
# Build the final model with the number of topics chosen from the coherence scores
NUM_TOPICS = 20

# iterations=100 matches the setting used when the coherence scores were
# computed, and the fixed random_state makes the topics (and the files and
# plots derived from them) the same on every run.
model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS,
                 iterations=100, random_state=42)
for idx, topic in model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

**Save top-word to CSV**

In [None]:
# Export the top words (the highest-probability words of each topic) to CSV
import pandas as pd
top_words_per_topic = []
for t in range(model.num_topics):
    # show_topic yields (word, probability) pairs; prefix each with its topic id
    top_words_per_topic.extend([(t, ) + x for x in model.show_topic(t, topn = 10)])

# Keep the DataFrame in its own variable: to_csv() returns None when given a
# path, so chaining it onto the constructor stored None and printed "None".
# Using a separate name also leaves the dataset held in `df` untouched.
top_words_df = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word','P'])
top_words_df.to_csv("top_words.csv")
print(top_words_df)

**Visualization**

In [None]:
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualisation data from the trained model, the TF-IDF corpus and the dictionary
data = gensimvis.prepare(model, corpus_tfidf, dictionary)
print(data)

# Render the prepared data as the cell output
pyLDAvis.display(data)
