In [87]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split;
from sklearn.metrics import classification_report

In [2]:
import string
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
df = pd.read_csv('duygular.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,emotion
0,0,"Sana duyduğum arzu, kalbimin en derin köşeleri...",Arzu
1,1,"Gözlerin, içimde tarifsiz bir özlem ve tutku u...",Arzu
2,2,"Sana dokunma arzusu, tenimi yakıp kavuruyor.",Arzu
3,3,"Varlığın, içimde bastırılmaz bir istek yaratıyor.",Arzu
4,4,"Seninle olma düşüncesi, beni hayallere sürüklü...",Arzu


In [8]:
df['emotion'].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
Üzüntü,3161
Suçluluk,2632
Sarkazm,2577
İğrenme,2531
Nötr,2482
Korku,2455
Mutluluk,2417
Kafa Karışıklığı,2293
Arzu,2142
Şaşkınlık,2082


All classes are distributed fair. We do not need to oversample that data

In [9]:
df.shape

(29727, 3)

We are working on a dataset which is around 30k lines which means data is enough for training and modelling

In [12]:
df.isna().sum().sum()

np.int64(0)

We do not have any null values that we need to evaluate

In [14]:
df = df[['emotion', 'text']]

We are not working on columns Unnamed 0. Thus I removed it

# Defining Stop Words

In [32]:
stop_words = stopwords.words('turkish')

# Text Cleaning

In [51]:
def cleaned_text(text):
  text = text.lower() # making text lowercase
  text = re.sub(r'\d+', "", text) # removing numbers
  text = re.sub(r'[^\w\s]', "", text) # remocing other characters except words and spaces
  text = BeautifulSoup(text, "html.parser").get_text() # removing html tags if necessary
  text = " ".join([word for word in text.split() if word not in stop_words]) # removal stopwords
  text = " ".join([word for word in text.split() if len(word) > 2]) # removal too short words

  return text

In [41]:
df['text'][0]

'Sana duyduğum arzu, kalbimin en derin köşelerinde saklı bir ateş gibi.'

In [42]:
cleaned_data = [cleaned_text(row) for row in df['text']]

In [44]:
cleaned_data[0]

'sana duyduğum arzu kalbimin derin köşelerinde saklı bir ateş'

In [75]:
cv = CountVectorizer()
X = cv.fit_transform(cleaned_data)
y = df['emotion']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [68]:
rf = RandomForestClassifier()
model = rf.fit(X_train, y_train)
score = model.score(X_test, y_test)

score

0.884460141271443

In [88]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

            Arzu       0.92      0.90      0.91       399
             Aşk       0.92      0.93      0.93       372
Kafa Karışıklığı       0.93      0.89      0.91       480
           Korku       0.96      0.75      0.85       511
        Mutluluk       0.93      0.88      0.91       525
            Nötr       0.88      0.82      0.85       489
         Sarkazm       0.96      0.90      0.93       536
        Suçluluk       0.92      0.93      0.93       512
           Utanç       0.96      0.93      0.94       309
            Öfke       0.91      0.85      0.88       288
          Üzüntü       0.82      0.88      0.85       622
         İğrenme       0.86      0.91      0.88       483
       Şaşkınlık       0.67      0.94      0.78       420

        accuracy                           0.88      5946
       macro avg       0.90      0.89      0.89      5946
    weighted avg       0.89      0.88      0.89      5946



In [102]:
test_poem = input("Duygunuuzu giriniz:")
vec_test = cv.transform([test_poem])
result = model.predict(vec_test)
print(f"{test_poem}: {result}")

Duygunuuzu giriniz:Keşke yapmasaydım
Keşke yapmasaydım: ['Suçluluk']
