In [11]:
import pandas as pd
import warnings
import re
import matplotlib.pyplot as plt

In [12]:
df_emosi=pd.read_csv(r'src\Twitter_Emotion_Dataset.csv')

In [13]:
df_pemilu=pd.read_csv(r'src\pemilu-2024.csv', on_bad_lines='skip')

### Missval Check

In [14]:
df_emosi.isna().sum()

label    0
tweet    0
dtype: int64

### Masking

In [15]:
# Fungsi untuk menghitung kata yang dimask dalam sebuah teks
def count_masked_words(text):
    if isinstance(text, str):
        pattern = r'\[USERNAME\]|\[USER\]'  # Pola regex untuk mencari [USERNAME] atau [USER]
        matches = re.findall(pattern, text)
        return len(matches)
    else:
        return 0  # Jika nilai tidak bertipe string, kembalikan 0

# Menerapkan fungsi ke kolom teks dalam DataFrame
df_emosi['masked_word_count'] = df_emosi['tweet'].apply(count_masked_words)

# Jumlah kata yang dimask dalam keseluruhan dataset
total_masked_words = df_emosi['masked_word_count'].sum()

print("Total kata yang dimask dalam dataset:", total_masked_words)

Total kata yang dimask dalam dataset: 1793


In [16]:
# Fungsi untuk menemukan dan menghitung pola yang diawali dengan tanda kurung siku dalam sebuah teks
def find_and_count_patterns(text):
    if isinstance(text, str):
        pattern = r'\[([^]]+)\]'  # Pola regex untuk mencari semua pola yang diawali dengan tanda kurung siku
        matches = re.findall(pattern, text)

        # Menghitung jumlah kemunculan setiap pola
        pattern_counts = {}
        for match in matches:
            if match in pattern_counts:
                pattern_counts[match] += 1
            else:
                pattern_counts[match] = 1

        return pattern_counts
    else:
        return {}  # Jika nilai tidak bertipe string, kembalikan dictionary kosong

# Menerapkan fungsi ke kolom teks dalam DataFrame
df_emosi['pattern_counts'] = df_emosi['tweet'].apply(find_and_count_patterns)

# Menggabungkan hasil dari semua tweet menjadi satu dictionary
all_patterns_counts = {}
for pattern_count in df_emosi['pattern_counts']:
    for pattern, count in pattern_count.items():
        if pattern in all_patterns_counts:
            all_patterns_counts[pattern] += count
        else:
            all_patterns_counts[pattern] = count

In [17]:
# Membuat barplot
all_patterns_counts = sorted(all_patterns_counts.items(), key=lambda x:x[1], reverse=True)
all_patterns_counts = dict(all_patterns_counts)
for k, i in all_patterns_counts.items():
  print(k, i)

USERNAME 1793
URL 621
askmf 10
SENSITIVE-NO 5
askMF 4
Idm 2
AskMF 2
C48 2
idm 1
Seo In Ha, Love Rain 1
Askmf 1
Late Post 1
Allamah Thabathabai 1
Kartu 1 pria thd wanita 1
Habis buka Facebook 1
Thinking.. 1
BELAJARLAH DEMI ORANGTUAMU!!! 1
BB 1
Obrolan dengan dospem 1 & 2 di grup WA menjelang besok sidang 1
Satu menit kemudian 1
1 1


In [18]:
df_emosi.tail()

Unnamed: 0,label,tweet,masked_word_count,pattern_counts
4396,love,"Tahukah kamu, bahwa saat itu papa memejamkan m...",0,{}
4397,fear,Sulitnya menetapkan Calon Wapresnya Jokowi di ...,0,{}
4398,anger,"5. masa depannya nggak jelas. lha iya, gimana ...",0,{}
4399,happy,[USERNAME] dulu beneran ada mahasiswa Teknik U...,1,{'USERNAME': 1}
4400,sadness,"Ya Allah, hanya Engkau yang mengetahui rasa sa...",0,{}


### Slang dan Abreviasi

In [19]:
kamus_slang=pd.read_csv(r'src\colloquial-indonesian-lexicon.csv')
kamus_slang=kamus_slang.rename(columns = {'slang' : 'kamus_slang' , 'formal' : 'kamus_perbaikan'})

# Rekonstruksi data sebagai 'dict'
slang_mapping = dict(zip(kamus_slang['kamus_slang'], kamus_slang['kamus_perbaikan']))
kamus_singkatan = pd.read_csv(r'src\kamus_singkatan.csv', header=None, names=['sebelum_perbaikan', 'setelah_perbaikan'],delimiter=';')
singkatan_mapping=dict(zip(kamus_singkatan['sebelum_perbaikan'],kamus_singkatan['setelah_perbaikan']))

### Stopword, emoji, dan Stemmer Factory

In [22]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import  StopWordRemoverFactory
import emoji
from spacy.lang.id import Indonesian
import string

In [23]:
stopword_factory = StopWordRemoverFactory()
stopwords = stopword_factory.get_stop_words()
# List of words with negation meaning
emoji_data = emoji.EMOJI_DATA

# Remove negation words from stopwords
# stopwords = set(stopwords).difference(excluded_stopwords)
nlp = Indonesian()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [24]:
def replace_emoji_with_ascii(text, emoji_data, language='id'):
    for emoji, translations in emoji_data.items():
        if language in translations:
            text = text.replace(emoji, translations[language])
    return text

In [25]:
text_with_emoji = "Saat kamu merenungkan tentang kehilangan yang pernah kamu alami, luka-luka itu terasa kembali dalam ingatan. 💔🌼 #RememberingLoss"
a = replace_emoji_with_ascii(text_with_emoji, emoji_data, language='id')
a = a.replace(":",' ').replace('_','mask').replace('-','rus').strip()
a = re.sub(' +', ' ', a)
print(a)

Saat kamu merenungkan tentang kehilangan yang pernah kamu alami, lukarusluka itu terasa kembali dalam ingatan. patahmaskhati mekar #RememberingLoss


In [26]:
def process_tweet(tweet) :
  tweet=tweet.lower()
  # link
  tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)

  # spesifik
  tweet = re.sub(r'\[username\]|\[url\]|\[askmf\]|\[sensitive-no\]|\[satu menit kemudian\]|\[c48\]|\[idm\]', '', tweet)

  # emoji
  tweet=replace_emoji_with_ascii(tweet,emoji_data)
  tweet=tweet.replace(":",' ').replace('_','mask').replace('-','rus').strip()
  tweet=re.sub(' +', ' ', tweet)

  # tokenisasi
  tokens = tweet.split()

  tweet_tokens = []
  for ele in tokens:
    ele_kamus = kamus_singkatan.get(ele, ele)
    ele_slang = slang_mapping.get(ele_kamus, ele_kamus)
    tweet_tokens.append(ele_slang)

  tweet = ' '.join(tweet_tokens)
  tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
  tweet = re.sub(r'#([^\s]+)', '', tweet)
  tweet=re.sub(r'\d+', '', tweet)
  tweet = tweet.strip('\'"')
  tweet = tweet.lstrip('\'"')

  tweet = "".join([char for char in tweet if char not in string.punctuation])

  doc = nlp(tweet)

  tokens = [token.text for token in doc]
      # Hapus stopwords dari tokens
  filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
  tweet = ' '.join(filtered_tokens)

  tweet=stemmer.stem(tweet)
  tweet=tweet.replace('mask',' ').replace('rus','-')

  return tweet

In [27]:
print(process_tweet('hai sayangnya adalah 😂'))

hai sayang wajah gembira berurai air mata


In [28]:
print(process_tweet(str(df_emosi['tweet'][0])))

soal jalan jatibarupolisi gertak gubernur emangny polisi ikut pmbhasan jangan politik atur wilayahhak gubernur soal tn abang soal turun temurunpelikperlu sabar


In [29]:
df_emosi.head(5)

Unnamed: 0,label,tweet,masked_word_count,pattern_counts
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",2,"{'USERNAME': 2, 'URL': 1}"
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",0,{}
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,0,{}
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",0,{'URL': 1}
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...",1,{'USERNAME': 1}


In [31]:
df_emosi['tweet'] = df_emosi['tweet'].apply(lambda x: process_tweet(str(x)))

In [32]:
df_emosi.to_csv(r'src\cleaned.csv')

In [33]:
df_emosi['tweet']

0       soal jalan jatibarupolisi gertak gubernur eman...
1       sama cewek lho kayak ha- lebih rasai lah sibuk...
2       pengin gudeg mbarek bu hj foto google sengaja ...
3       jalan jatibarubagian wilayah tn abangpengatura...
4       sharing alam aja kemarin jam batalin tiket sta...
                              ...                        
4396    tahu kamu papa mejam mata tahan gejolak batin ...
4397    sulit tetap calon wapresnya jokowi pilpres sal...
4398    masa depan enggak jelas lah iya bagaimana mau ...
4399    dulu benar mahasiswa teknik ui tembak pacar pa...
4400       allah engkau tahu rasa sakit hati sembuh allah
Name: tweet, Length: 4401, dtype: object

## use each representation separately

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import string

In [35]:
# Split the data
X = df_emosi['tweet']
y = df_emosi['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
X_train

1769    kaget berita tetangga satu rt solo tahun tingg...
1220          tidak-enak sangat bada pulang kerjan banyak
44      iya ibu nya lahir anak cewek enggak tahu tahun...
289     jiyeeee jiyeee jeng dom habis menang lawan kei...
2486    cinta penuh banyak buat semua harga tak satu l...
                              ...                        
3444    sahabat perlu filosopi milik nilai hidup beri ...
466     banyak bilang pilih sopir mobil bener bawa mas...
3092    bilang tetap pegang janji nikah brhak hakim su...
3772    allahapa kok disalahin pres kayak presiden kur...
860     gue punya teman dibela-belain pinjem duwit kan...
Name: tweet, Length: 3520, dtype: object

In [37]:
# Bag of Words
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

In [40]:
print(X_train_bow)

  (0, 4810)	1
  (0, 1147)	1
  (0, 10739)	1
  (0, 9318)	1
  (0, 9102)	1
  (0, 10019)	1
  (0, 10380)	1
  (0, 10859)	1
  (0, 2700)	1
  (0, 11153)	1
  (0, 346)	1
  (0, 349)	1
  (0, 7638)	1
  (0, 1087)	2
  (0, 4210)	1
  (0, 5970)	1
  (0, 11245)	1
  (0, 4273)	1
  (0, 11411)	1
  (0, 4271)	1
  (0, 9066)	1
  (0, 9605)	1
  (0, 253)	1
  (0, 3033)	1
  (1, 10821)	1
  :	:
  (3518, 2483)	1
  (3518, 3202)	1
  (3518, 10910)	1
  (3518, 299)	1
  (3518, 8549)	1
  (3518, 3138)	1
  (3518, 4641)	1
  (3519, 1504)	1
  (3519, 3607)	1
  (3519, 10605)	1
  (3519, 10805)	1
  (3519, 8681)	1
  (3519, 10477)	1
  (3519, 9321)	1
  (3519, 4914)	1
  (3519, 8358)	2
  (3519, 11337)	1
  (3519, 5188)	1
  (3519, 7640)	1
  (3519, 470)	1
  (3519, 7951)	1
  (3519, 3390)	1
  (3519, 2290)	1
  (3519, 1050)	1
  (3519, 2713)	1


In [41]:
# TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [43]:
print(X_train_tfidf)

  (0, 3033)	0.22954952181025595
  (0, 253)	0.1820641846385591
  (0, 9605)	0.2500003433823169
  (0, 9066)	0.23803737965372196
  (0, 4271)	0.2500003433823169
  (0, 11411)	0.1721368072098088
  (0, 4273)	0.22954952181025595
  (0, 11245)	0.1672891608802246
  (0, 5970)	0.10814039858178676
  (0, 4210)	0.17432331096844592
  (0, 1087)	0.35588121304178577
  (0, 7638)	0.2130384503539307
  (0, 349)	0.1721368072098088
  (0, 346)	0.2500003433823169
  (0, 11153)	0.2500003433823169
  (0, 2700)	0.14427112963688324
  (0, 10859)	0.13868767712190488
  (0, 10380)	0.1268379737967491
  (0, 10019)	0.1997029461808805
  (0, 9102)	0.20251500621062005
  (0, 9318)	0.12241984116408033
  (0, 10739)	0.18864787866613406
  (0, 1147)	0.18357755732589456
  (0, 4810)	0.17925212460881954
  (1, 895)	0.22239438583203394
  :	:
  (3518, 1532)	0.15131114175447472
  (3518, 8554)	0.2080624342218105
  (3518, 5214)	0.1677000294324275
  (3518, 5428)	0.15604174579959643
  (3518, 495)	0.12332068707344476
  (3518, 1499)	0.2031208163913

In [44]:
# N-grams (Unigram and Bigram)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = vectorizer_ngram.fit_transform(X_train)
X_test_ngram = vectorizer_ngram.transform(X_test)

In [46]:
print(X_train_ngram)

  (0, 27918)	1
  (0, 8289)	1
  (0, 64020)	1
  (0, 54790)	1
  (0, 52951)	1
  (0, 59578)	1
  (0, 61327)	1
  (0, 64775)	1
  (0, 16602)	1
  (0, 66347)	1
  (0, 2330)	1
  (0, 2338)	1
  (0, 45157)	1
  (0, 7900)	2
  (0, 24271)	1
  (0, 35879)	1
  (0, 66768)	1
  (0, 24631)	1
  (0, 67551)	1
  (0, 24627)	1
  (0, 52845)	1
  (0, 57176)	1
  (0, 1787)	1
  (0, 18380)	1
  (0, 27922)	1
  :	:
  (3519, 20950)	1
  (3519, 31517)	1
  (3519, 45166)	1
  (3519, 45167)	1
  (3519, 3030)	1
  (3519, 47287)	1
  (3519, 19734)	1
  (3519, 14867)	1
  (3519, 7467)	1
  (3519, 16691)	1
  (3519, 63181)	1
  (3519, 14868)	1
  (3519, 7468)	1
  (3519, 49755)	1
  (3519, 16692)	1
  (3519, 29570)	1
  (3519, 10370)	1
  (3519, 64344)	1
  (3519, 49757)	1
  (3519, 67272)	1
  (3519, 19735)	1
  (3519, 3034)	1
  (3519, 31519)	1
  (3519, 47290)	1
  (3519, 62219)	1


In [47]:
# Naive Bayes
nb = MultinomialNB()

# Bag of Words
nb.fit(X_train_bow, y_train)
y_pred_bow_nb = nb.predict(X_test_bow)
print("Naive Bayes with Bag of Words")
print(classification_report(y_test, y_pred_bow_nb))

# TF-IDF
nb.fit(X_train_tfidf, y_train)
y_pred_tfidf_nb = nb.predict(X_test_tfidf)
print("Naive Bayes with TF-IDF")
print(classification_report(y_test, y_pred_tfidf_nb))

# N-grams
nb.fit(X_train_ngram, y_train)
y_pred_ngram_nb = nb.predict(X_test_ngram)
print("Naive Bayes with N-grams")
print(classification_report(y_test, y_pred_ngram_nb))

Naive Bayes with Bag of Words
              precision    recall  f1-score   support

       anger       0.71      0.79      0.74       229
        fear       0.72      0.64      0.68       119
       happy       0.73      0.62      0.67       214
        love       0.74      0.73      0.73       119
     sadness       0.53      0.58      0.56       200

    accuracy                           0.67       881
   macro avg       0.68      0.67      0.68       881
weighted avg       0.68      0.67      0.67       881

Naive Bayes with TF-IDF
              precision    recall  f1-score   support

       anger       0.61      0.84      0.71       229
        fear       0.97      0.29      0.44       119
       happy       0.71      0.62      0.66       214
        love       0.87      0.40      0.55       119
     sadness       0.44      0.65      0.53       200

    accuracy                           0.61       881
   macro avg       0.72      0.56      0.58       881
weighted avg       0.68

In [48]:
# Random Forest
rf = RandomForestClassifier()

# Bag of Words
rf.fit(X_train_bow, y_train)
y_pred_bow_rf = rf.predict(X_test_bow)
print("Random Forest with Bag of Words")
print(classification_report(y_test, y_pred_bow_rf))

# TF-IDF
rf.fit(X_train_tfidf, y_train)
y_pred_tfidf_rf = rf.predict(X_test_tfidf)
print("Random Forest with TF-IDF")
print(classification_report(y_test, y_pred_tfidf_rf))

# N-grams
rf.fit(X_train_ngram, y_train)
y_pred_ngram_rf = rf.predict(X_test_ngram)
print("Random Forest with N-grams")
print(classification_report(y_test, y_pred_ngram_rf))

Random Forest with Bag of Words
              precision    recall  f1-score   support

       anger       0.54      0.76      0.64       229
        fear       0.86      0.63      0.73       119
       happy       0.71      0.57      0.63       214
        love       0.66      0.82      0.73       119
     sadness       0.53      0.41      0.46       200

    accuracy                           0.62       881
   macro avg       0.66      0.64      0.64       881
weighted avg       0.64      0.62      0.62       881

Random Forest with TF-IDF
              precision    recall  f1-score   support

       anger       0.56      0.76      0.64       229
        fear       0.83      0.61      0.71       119
       happy       0.72      0.57      0.63       214
        love       0.67      0.79      0.72       119
     sadness       0.49      0.41      0.45       200

    accuracy                           0.62       881
   macro avg       0.65      0.63      0.63       881
weighted avg       

In [49]:
# Support Vector Machine
svm = SVC()

# Bag of Words
svm.fit(X_train_bow, y_train)
y_pred_bow_svm = svm.predict(X_test_bow)
print("SVM with Bag of Words")
print(classification_report(y_test, y_pred_bow_svm))

# TF-IDF
svm.fit(X_train_tfidf, y_train)
y_pred_tfidf_svm = svm.predict(X_test_tfidf)
print("SVM with TF-IDF")
print(classification_report(y_test, y_pred_tfidf_svm))

# N-grams
svm.fit(X_train_ngram, y_train)
y_pred_ngram_svm = svm.predict(X_test_ngram)
print("SVM with N-grams")
print(classification_report(y_test, y_pred_ngram_svm))

SVM with Bag of Words
              precision    recall  f1-score   support

       anger       0.58      0.77      0.67       229
        fear       0.87      0.51      0.65       119
       happy       0.62      0.63      0.62       214
        love       0.75      0.72      0.74       119
     sadness       0.50      0.43      0.46       200

    accuracy                           0.62       881
   macro avg       0.66      0.61      0.63       881
weighted avg       0.63      0.62      0.62       881

SVM with TF-IDF
              precision    recall  f1-score   support

       anger       0.62      0.81      0.70       229
        fear       0.88      0.55      0.67       119
       happy       0.66      0.65      0.66       214
        love       0.79      0.70      0.74       119
     sadness       0.53      0.51      0.52       200

    accuracy                           0.65       881
   macro avg       0.70      0.64      0.66       881
weighted avg       0.67      0.65      

## use three representation in one model

In [51]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

# Define the feature extraction steps
vectorizer_bow = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))

# Combine the features using FeatureUnion
combined_features = FeatureUnion([
    ("bow", vectorizer_bow),
    ("tfidf", vectorizer_tfidf),
    ("ngram", vectorizer_ngram)
])

# Create a pipeline that first transforms the data and then applies the model
pipeline = Pipeline([
    ("features", combined_features),
    ("classifier", MultinomialNB())  # You can replace MultinomialNB with any other classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print("MultinomialNB with Combined Features")
print(classification_report(y_test, y_pred))

MultinomialNB with Combined Features
              precision    recall  f1-score   support

       anger       0.67      0.82      0.73       229
        fear       0.80      0.62      0.70       119
       happy       0.71      0.62      0.66       214
        love       0.77      0.69      0.73       119
     sadness       0.53      0.58      0.56       200

    accuracy                           0.67       881
   macro avg       0.70      0.66      0.68       881
weighted avg       0.68      0.67      0.67       881



In [52]:
pipeline_rf = Pipeline([
    ("features", combined_features),
    ("classifier", RandomForestClassifier())
])

# Train the model
pipeline_rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = pipeline_rf.predict(X_test)
print("Random Forest with Combined Features")
print(classification_report(y_test, y_pred_rf))

Random Forest with Combined Features
              precision    recall  f1-score   support

       anger       0.52      0.79      0.62       229
        fear       0.86      0.63      0.73       119
       happy       0.69      0.55      0.61       214
        love       0.67      0.81      0.73       119
     sadness       0.55      0.36      0.43       200

    accuracy                           0.61       881
   macro avg       0.66      0.63      0.63       881
weighted avg       0.63      0.61      0.61       881



In [53]:
pipeline_svm = Pipeline([
    ("features", combined_features),
    ("classifier", SVC())
])

# Train the model
pipeline_svm.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = pipeline_svm.predict(X_test)
print("SVM with Combined Features")
print(classification_report(y_test, y_pred_svm))

SVM with Combined Features
              precision    recall  f1-score   support

       anger       0.58      0.75      0.66       229
        fear       0.86      0.50      0.63       119
       happy       0.60      0.63      0.62       214
        love       0.75      0.75      0.75       119
     sadness       0.49      0.42      0.45       200

    accuracy                           0.61       881
   macro avg       0.65      0.61      0.62       881
weighted avg       0.62      0.61      0.61       881

