This project is classification of tweet emotion. There was 5 labels.
1. Anger 
2. Happy
3. Sadness
4. Fear
5. Love

I used all rows of this dataset (4400)

Algorithm that I used is Automatic MLP using library mlpclassifier. And I get 64% accuracy

Step by step of this project :
1. Import data
2. Stemming and Stopwords using library Sastrawi
3. Count Vectorized
4. Define Target Label
5. Split Data for Train dan Test
6. Classification Process
7. Classification Report
8. Try to predict a sentence

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("Twitter_Emotion_Dataset.csv")
data.head()

Unnamed: 0,label,tweet
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata..."


In [3]:
len(data)

4401

In [4]:
#Import Library Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


#Create array for stemming and stopwords process
stem_arr = []
stop_arr = []

for i in range(len(data)):
    
    #stemming
    factory1 = StemmerFactory()
    stemmer = factory1.create_stemmer()
    katadasar = stemmer.stem(str(data['tweet'][i]))
    stem_arr.append(katadasar)
    
    #stopwords
    factory2 = StopWordRemoverFactory()
    stopword = factory2.create_stop_word_remover()
    stop = stopword.remove(katadasar)
    stop_arr.append(stop)

In [5]:
#show result of stemming
print ("\t\t\t\t\t==== HASIL STEMMING ====\n")
print (stem_arr)

					==== HASIL STEMMING ====



In [6]:
#show result of stopwords process
print ("\t\t\t\t\t==== HASIL STOPWORDS ====\n")
print (stop_arr)

					==== HASIL STOPWORDS ====



In [7]:
#look the data and count of each label
sentiment_counts = data.label.value_counts()
print("Jumlah data :", len(data),"\n")
print(sentiment_counts)

Jumlah data : 4401 

anger      1101
happy      1017
sadness     997
fear        649
love        637
Name: label, dtype: int64


In [8]:
#vectorized 

import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,2)) # di parsing jadi 1 kalimat
vectorized_data = count_vectorizer.fit_transform(stop_arr) #transformasi jadi vektor
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

def sentiment2target(sentiment):
    return {
        'anger': 0, 'happy': 1, 'sadness': 2, 'fear': 3, 'love': 4
    }[sentiment]

target = data.label.apply(sentiment2target)
print(target)

0       0
1       0
2       1
3       0
4       1
5       0
6       1
7       2
8       0
9       2
10      0
11      0
12      1
13      2
14      0
15      0
16      4
17      0
18      1
19      0
20      0
21      2
22      1
23      2
24      4
25      0
26      1
27      0
28      0
29      3
       ..
4371    3
4372    1
4373    3
4374    4
4375    0
4376    2
4377    1
4378    0
4379    1
4380    3
4381    2
4382    2
4383    4
4384    1
4385    4
4386    4
4387    3
4388    4
4389    0
4390    4
4391    3
4392    0
4393    2
4394    2
4395    2
4396    4
4397    3
4398    0
4399    1
4400    2
Name: label, Length: 4401, dtype: int64


In [10]:
#split data (75% for train, 25% for test)

from sklearn.model_selection import train_test_split

data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, target, test_size=0.25, random_state=0)

data_train_index = data_train[:,0] 
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [11]:
print("JUMLAH DATA TRAIN :",len(targets_train))
print("JUMLAH DATA TEST :",len(targets_test))

JUMLAH DATA TRAIN : 3300
JUMLAH DATA TEST : 1101


# CLASSIFICATION USING MLP

In [12]:
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

models = {
    "mlp": MLPClassifier(),
}

In [13]:
#create model

model = models["mlp"]
model.fit(data_train, targets_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [14]:
#evaluation predictions

predictions = model.predict(data_test)
print ("Evaluating..")
print(classification_report(targets_test, predictions))

Evaluating..
              precision    recall  f1-score   support

           0       0.67      0.72      0.70       292
           1       0.62      0.64      0.63       243
           2       0.51      0.50      0.51       261
           3       0.74      0.62      0.68       148
           4       0.76      0.76      0.76       157

    accuracy                           0.64      1101
   macro avg       0.66      0.65      0.65      1101
weighted avg       0.64      0.64      0.64      1101



# PREDICTION OF SENTENCES CLASSIFICATION

In [15]:
#this sentence is example only

sentences = count_vectorizer.transform(["wah parah, kecewa banget sama kepemimpinan Pak Anies"])
result = model.predict(sentences)

if result==0:
    print("Anger")
elif result==1:
    print("Happy")
elif result==2:
    print("Sadness")
elif result==3:
    print("Fear")
elif result==4:
    print("Love")
else:
    print("Tidak Terdeteksi")

Anger
