## use each representation separately

In [3]:
import string

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

In [4]:
# Load src/emotion_cleaned.csv into a DataFrame.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike;
# a backslash is only treated as a directory separator on Windows.
df_emosi = pd.read_csv('src/emotion_cleaned.csv')

In [5]:
# Inputs come from the 'tweet' column, targets from the 'label' column.
X, y = df_emosi['tweet'], df_emosi['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Display the training portion of the 'tweet' column produced by the split.
X_train

1769    kaget berita tetangga satu rt solo tahun tingg...
1220          tidak-enak sangat bada pulang kerjan banyak
44      iya ibu nya lahir anak cewek enggak tahu tahun...
289     jiyeeee jiyeee jeng dom habis menang lawan kei...
2486    cinta penuh banyak buat semua harga tak satu l...
                              ...                        
3444    sahabat perlu filosopi milik nilai hidup beri ...
466     banyak bilang pilih sopir mobil bener bawa mas...
3092    bilang tetap pegang janji nikah brhak hakim su...
3772    allahapa kok disalahin pres kayak presiden kur...
860     gue punya teman dibela-belain pinjem duwit kan...
Name: tweet, Length: 3520, dtype: object

In [7]:
# Bag of Words: token counts from CountVectorizer with its default settings.
vectorizer_bow = CountVectorizer()

# Tuple elements are evaluated left to right, so the vectorizer is fitted on the
# training texts first and only then used to encode the test texts.
X_train_bow, X_test_bow = (
    vectorizer_bow.fit_transform(X_train),
    vectorizer_bow.transform(X_test),
)

In [8]:
# Print the Bag-of-Words training matrix; each output line is a
# (row, column) index pair followed by the count stored at that position.
print(X_train_bow)

  (0, 4810)	1
  (0, 1147)	1
  (0, 10739)	1
  (0, 9318)	1
  (0, 9102)	1
  (0, 10019)	1
  (0, 10380)	1
  (0, 10859)	1
  (0, 2700)	1
  (0, 11153)	1
  (0, 346)	1
  (0, 349)	1
  (0, 7638)	1
  (0, 1087)	2
  (0, 4210)	1
  (0, 5970)	1
  (0, 11245)	1
  (0, 4273)	1
  (0, 11411)	1
  (0, 4271)	1
  (0, 9066)	1
  (0, 9605)	1
  (0, 253)	1
  (0, 3033)	1
  (1, 10821)	1
  :	:
  (3518, 2483)	1
  (3518, 3202)	1
  (3518, 10910)	1
  (3518, 299)	1
  (3518, 8549)	1
  (3518, 3138)	1
  (3518, 4641)	1
  (3519, 1504)	1
  (3519, 3607)	1
  (3519, 10605)	1
  (3519, 10805)	1
  (3519, 8681)	1
  (3519, 10477)	1
  (3519, 9321)	1
  (3519, 4914)	1
  (3519, 8358)	2
  (3519, 11337)	1
  (3519, 5188)	1
  (3519, 7640)	1
  (3519, 470)	1
  (3519, 7951)	1
  (3519, 3390)	1
  (3519, 2290)	1
  (3519, 1050)	1
  (3519, 2713)	1


In [9]:
# TF-IDF: weighted term features from TfidfVectorizer with its default settings.
vectorizer_tfidf = TfidfVectorizer()

# Left-to-right tuple evaluation: fit on the training texts, then encode the
# test texts with the already-fitted vectorizer.
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),
    vectorizer_tfidf.transform(X_test),
)

In [10]:
# Print the TF-IDF training matrix; each output line is a
# (row, column) index pair followed by the weight stored at that position.
print(X_train_tfidf)

  (0, 3033)	0.22954952181025595
  (0, 253)	0.1820641846385591
  (0, 9605)	0.2500003433823169
  (0, 9066)	0.23803737965372196
  (0, 4271)	0.2500003433823169
  (0, 11411)	0.1721368072098088
  (0, 4273)	0.22954952181025595
  (0, 11245)	0.1672891608802246
  (0, 5970)	0.10814039858178676
  (0, 4210)	0.17432331096844592
  (0, 1087)	0.35588121304178577
  (0, 7638)	0.2130384503539307
  (0, 349)	0.1721368072098088
  (0, 346)	0.2500003433823169
  (0, 11153)	0.2500003433823169
  (0, 2700)	0.14427112963688324
  (0, 10859)	0.13868767712190488
  (0, 10380)	0.1268379737967491
  (0, 10019)	0.1997029461808805
  (0, 9102)	0.20251500621062005
  (0, 9318)	0.12241984116408033
  (0, 10739)	0.18864787866613406
  (0, 1147)	0.18357755732589456
  (0, 4810)	0.17925212460881954
  (1, 895)	0.22239438583203394
  :	:
  (3518, 1532)	0.15131114175447472
  (3518, 8554)	0.2080624342218105
  (3518, 5214)	0.1677000294324275
  (3518, 5428)	0.15604174579959643
  (3518, 495)	0.12332068707344476
  (3518, 1499)	0.2031208163913

In [11]:
# N-grams: counts of unigrams and bigrams (ngram_range=(1, 2)).
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))

# Left-to-right tuple evaluation: fit on the training texts, then encode the
# test texts with the already-fitted vectorizer.
X_train_ngram, X_test_ngram = (
    vectorizer_ngram.fit_transform(X_train),
    vectorizer_ngram.transform(X_test),
)

In [12]:
# Print the unigram+bigram training matrix; each output line is a
# (row, column) index pair followed by the count stored at that position.
print(X_train_ngram)

  (0, 27918)	1
  (0, 8289)	1
  (0, 64020)	1
  (0, 54790)	1
  (0, 52951)	1
  (0, 59578)	1
  (0, 61327)	1
  (0, 64775)	1
  (0, 16602)	1
  (0, 66347)	1
  (0, 2330)	1
  (0, 2338)	1
  (0, 45157)	1
  (0, 7900)	2
  (0, 24271)	1
  (0, 35879)	1
  (0, 66768)	1
  (0, 24631)	1
  (0, 67551)	1
  (0, 24627)	1
  (0, 52845)	1
  (0, 57176)	1
  (0, 1787)	1
  (0, 18380)	1
  (0, 27922)	1
  :	:
  (3519, 20950)	1
  (3519, 31517)	1
  (3519, 45166)	1
  (3519, 45167)	1
  (3519, 3030)	1
  (3519, 47287)	1
  (3519, 19734)	1
  (3519, 14867)	1
  (3519, 7467)	1
  (3519, 16691)	1
  (3519, 63181)	1
  (3519, 14868)	1
  (3519, 7468)	1
  (3519, 49755)	1
  (3519, 16692)	1
  (3519, 29570)	1
  (3519, 10370)	1
  (3519, 64344)	1
  (3519, 49757)	1
  (3519, 67272)	1
  (3519, 19735)	1
  (3519, 3034)	1
  (3519, 31519)	1
  (3519, 47290)	1
  (3519, 62219)	1


In [13]:
# Naive Bayes: a single MultinomialNB instance is fitted on each representation in turn.
nb = MultinomialNB()

# (report heading, training matrix, test matrix) for every representation,
# in the order Bag of Words -> TF-IDF -> N-grams.
nb_runs = (
    ("Naive Bayes with Bag of Words", X_train_bow, X_test_bow),
    ("Naive Bayes with TF-IDF", X_train_tfidf, X_test_tfidf),
    ("Naive Bayes with N-grams", X_train_ngram, X_test_ngram),
)

nb_predictions = []
for heading, train_matrix, test_matrix in nb_runs:
    nb.fit(train_matrix, y_train)
    nb_predictions.append(nb.predict(test_matrix))
    print(heading)
    print(classification_report(y_test, nb_predictions[-1]))

# Expose the predictions under one name per representation.
y_pred_bow_nb, y_pred_tfidf_nb, y_pred_ngram_nb = nb_predictions

Naive Bayes with Bag of Words
              precision    recall  f1-score   support

       anger       0.71      0.79      0.74       229
        fear       0.72      0.64      0.68       119
       happy       0.73      0.62      0.67       214
        love       0.74      0.73      0.73       119
     sadness       0.53      0.58      0.56       200

    accuracy                           0.67       881
   macro avg       0.68      0.67      0.68       881
weighted avg       0.68      0.67      0.67       881

Naive Bayes with TF-IDF
              precision    recall  f1-score   support

       anger       0.61      0.84      0.71       229
        fear       0.97      0.29      0.44       119
       happy       0.71      0.62      0.66       214
        love       0.87      0.40      0.55       119
     sadness       0.44      0.65      0.53       200

    accuracy                           0.61       881
   macro avg       0.72      0.56      0.58       881
weighted avg       0.68

In [14]:
# Random Forest
# The forest is seeded so that re-running the notebook reproduces the same
# reports; without random_state every run trains a different ensemble.
# The seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=42)

# Bag of Words
rf.fit(X_train_bow, y_train)
y_pred_bow_rf = rf.predict(X_test_bow)
print("Random Forest with Bag of Words")
print(classification_report(y_test, y_pred_bow_rf))

# TF-IDF (the same estimator object is fitted again on the new features)
rf.fit(X_train_tfidf, y_train)
y_pred_tfidf_rf = rf.predict(X_test_tfidf)
print("Random Forest with TF-IDF")
print(classification_report(y_test, y_pred_tfidf_rf))

# N-grams
rf.fit(X_train_ngram, y_train)
y_pred_ngram_rf = rf.predict(X_test_ngram)
print("Random Forest with N-grams")
print(classification_report(y_test, y_pred_ngram_rf))

Random Forest with Bag of Words
              precision    recall  f1-score   support

       anger       0.54      0.74      0.62       229
        fear       0.84      0.62      0.71       119
       happy       0.69      0.61      0.65       214
        love       0.66      0.79      0.72       119
     sadness       0.54      0.40      0.46       200

    accuracy                           0.62       881
   macro avg       0.65      0.63      0.63       881
weighted avg       0.63      0.62      0.62       881

Random Forest with TF-IDF
              precision    recall  f1-score   support

       anger       0.54      0.76      0.63       229
        fear       0.84      0.60      0.70       119
       happy       0.69      0.56      0.62       214
        love       0.68      0.81      0.74       119
     sadness       0.52      0.41      0.45       200

    accuracy                           0.62       881
   macro avg       0.65      0.63      0.63       881
weighted avg       

In [15]:
# Support Vector Machine: SVC with its default settings, fitted on each representation in turn.
svm = SVC()

# fit() returns the estimator itself, so training and prediction are chained.

# Bag of Words
y_pred_bow_svm = svm.fit(X_train_bow, y_train).predict(X_test_bow)
print("SVM with Bag of Words")
print(classification_report(y_test, y_pred_bow_svm))

# TF-IDF
y_pred_tfidf_svm = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("SVM with TF-IDF")
print(classification_report(y_test, y_pred_tfidf_svm))

# N-grams
y_pred_ngram_svm = svm.fit(X_train_ngram, y_train).predict(X_test_ngram)
print("SVM with N-grams")
print(classification_report(y_test, y_pred_ngram_svm))

SVM with Bag of Words
              precision    recall  f1-score   support

       anger       0.58      0.77      0.67       229
        fear       0.87      0.51      0.65       119
       happy       0.62      0.63      0.62       214
        love       0.75      0.72      0.74       119
     sadness       0.50      0.43      0.46       200

    accuracy                           0.62       881
   macro avg       0.66      0.61      0.63       881
weighted avg       0.63      0.62      0.62       881

SVM with TF-IDF
              precision    recall  f1-score   support

       anger       0.62      0.81      0.70       229
        fear       0.88      0.55      0.67       119
       happy       0.66      0.65      0.66       214
        love       0.79      0.70      0.74       119
     sadness       0.53      0.51      0.52       200

    accuracy                           0.65       881
   macro avg       0.70      0.64      0.66       881
weighted avg       0.67      0.65      

## use all three representations in one model

In [None]:
# Define the feature extraction steps.
# These are new, unfitted instances; they are fitted when the pipeline below is fitted.
vectorizer_bow = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))

# Combine the features using FeatureUnion: every extractor sees the same input
# texts and their outputs are concatenated column-wise.
# NOTE(review): the "ngram" extractor uses ngram_range=(1, 2), so it already
# contains the unigram counts that "bow" produces; those columns appear twice
# in the union. Confirm this duplication is intended.
combined_features = FeatureUnion([
    ("bow", vectorizer_bow),
    ("tfidf", vectorizer_tfidf),
    ("ngram", vectorizer_ngram),
])

# Create a pipeline that first transforms the raw texts and then applies the model.
pipeline = Pipeline([
    ("features", combined_features),
    ("classifier", MultinomialNB()),  # You can replace MultinomialNB with any other classifier
])

# Train the model on the raw training texts (vectorization happens inside the pipeline).
pipeline.fit(X_train, y_train)

# Predict and evaluate on the held-out texts.
y_pred = pipeline.predict(X_test)
print("MultinomialNB with Combined Features")
print(classification_report(y_test, y_pred))

MultinomialNB with Combined Features
              precision    recall  f1-score   support

       anger       0.67      0.82      0.73       229
        fear       0.80      0.62      0.70       119
       happy       0.71      0.62      0.66       214
        love       0.77      0.69      0.73       119
     sadness       0.53      0.58      0.56       200

    accuracy                           0.67       881
   macro avg       0.70      0.66      0.68       881
weighted avg       0.68      0.67      0.67       881



In [None]:
# Random Forest on the combined features.
# clone() gives this pipeline its own unfitted copy of the feature union, so
# fitting it does not refit the extractor object held by any other pipeline.
# random_state seeds the forest so the report is reproducible across runs.
pipeline_rf = Pipeline([
    ("features", clone(combined_features)),
    ("classifier", RandomForestClassifier(random_state=42)),
])

# Train the model
pipeline_rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = pipeline_rf.predict(X_test)
print("Random Forest with Combined Features")
print(classification_report(y_test, y_pred_rf))

Random Forest with Combined Features
              precision    recall  f1-score   support

       anger       0.52      0.79      0.62       229
        fear       0.86      0.63      0.73       119
       happy       0.69      0.55      0.61       214
        love       0.67      0.81      0.73       119
     sadness       0.55      0.36      0.43       200

    accuracy                           0.61       881
   macro avg       0.66      0.63      0.63       881
weighted avg       0.63      0.61      0.61       881



In [None]:
# SVM on the combined features.
# clone() gives this pipeline its own unfitted copy of the feature union, so
# fitting it does not refit the extractor object held by any other pipeline.
pipeline_svm = Pipeline([
    ("features", clone(combined_features)),
    ("classifier", SVC()),
])

# Train the model
pipeline_svm.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = pipeline_svm.predict(X_test)
print("SVM with Combined Features")
print(classification_report(y_test, y_pred_svm))

SVM with Combined Features
              precision    recall  f1-score   support

       anger       0.58      0.75      0.66       229
        fear       0.86      0.50      0.63       119
       happy       0.60      0.63      0.62       214
        love       0.75      0.75      0.75       119
     sadness       0.49      0.42      0.45       200

    accuracy                           0.61       881
   macro avg       0.65      0.61      0.62       881
weighted avg       0.62      0.61      0.61       881

