In [14]:
import pandas as pd

import re
import tqdm
import string
import unidecode
import spacy
import phunspell
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

nltk.download('stopwords')
nlp = spacy.load('ro_core_news_sm')
pspell = phunspell.Phunspell('ro_RO')
stemmer = SnowballStemmer("romanian")

[nltk_data] Downloading package stopwords to /Users/petru-
[nltk_data]     liviubouruc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
data = pd.read_excel('data.xlsx')
train_data, test_data = train_test_split(data, test_size=0.2)
train_data.head()

Unnamed: 0,Text,Response
1528,Minunat și înduioșător! Atitudinea acestui bie...,0
3633,Doar dvs ce mai amintiți de cei care au murit ...,0
717,Acuma vă stă pe creier himenul Mariei. Cei car...,0
1480,"Stimate domnule Cristian Tudor Popescu,vă mulț...",0
3968,Nu mai încape nici o îndoială că statul fură c...,0


In [55]:
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub('[^\w\s]', '', text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [s for s in tokens if not s.isspace()]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    #tokens = [token for token in tokens if token not in string.punctuation]

    # corrected_words = []
    # for token in tokens:
    #     if pspell.lookup(token):
    #         for suggestion in pspell.suggest(token):
    #             corrected_words.append(suggestion)
    #             break
    #     else:
    #         corrected_words.append(stemmer.stem(token))
    # corrected_words = [unidecode.unidecode(word) for word in corrected_words]
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

# Exemplu de folosire
input_text = "Stimate d'le CTPdintotdeauna am știut că geniul și nebunia sunt părți ale aceluiași întreg. Cum așa? Simplu! Teoria enunțată aici de dvs nici măcar nu este originală! Am auzit pt prima dată teoria asta a dezvirginării din interior, precum și ideea imposibilitățiiinseminării pe cale aerieană la mama mea, ex-asistentă medicală, undeva prin anii '70, într-o dispută cu tata pe teme de religie! (Mama o persoanå școlită cam pe vremea Regelui Mihai, absolventă de Postliceală Sanitară; tata mai tânăr cu 7 ani decât ea, țăran din Bărăgan, lăcătuș la bază, dar avsolvent de Ștefan Gheorghiu grație ... \"originii sănătoase\"). Între timp, tata, un om simplu, dar probabil bun și credincios (nu am avut ocazia să îl cunosc prea bine) a murit (56 de ani, infarct); iar mama ....\"trăiește\" (89 de ani) într-un azil psihiatric, fiind diagnosticată (1999) cu schizofrenie paranoidă.P.S. Nu doresc să vă... \"prezic\" nimic (nu-s Mama Omida și oricum detest genul), doar am expus o speță."
preprocessed_text = preprocess_text(input_text)
print(preprocessed_text)


['stimat', 'dle', 'ctpdintotdeaun', 'stiut', 'gen', 'nebun', 'part', 'aceluias', 'intreg', 'simplu', 'teor', 'enunt', 'dvs', 'macar', 'original', 'auzit', 'pt', 'dat', 'teor', 'dezvirginar', 'interior', 'precum', 'ide', 'imposibilitatiiinseminar', 'cal', 'aeriean', 'mam', 'exasistent', 'medic', 'ani', '70', 'intro', 'disput', 'tat', 'tem', 'relig', 'mam', 'persoan', 'scolit', 'vrem', 'regel', 'miha', 'absolvent', 'postliceal', 'sanitar', 'tat', 'tanar', '7', 'ani', 'decat', 'taran', 'baragan', 'lacatus', 'baz', 'avsolvent', 'stefan', 'gheorghiu', 'grat', 'origin', 'sanat', 'timp', 'tat', 'om', 'simplu', 'probabil', 'bun', 'credinc', 'ocaz', 'cunosc', 'bin', 'murit', '56', 'ani', 'infarct', 'mam', 'traiest', '89', 'ani', 'intrun', 'azil', 'psihiatr', 'fiind', 'diagnostic', '1999', 'schizofren', 'paranoidaps', 'doresc', 'prezic', 'nus', 'mam', 'omid', 'detest', 'gen', 'expus', 'spet']


In [49]:
tfidf_vectorizer = TfidfVectorizer(lowercase=False, tokenizer=preprocess_text, token_pattern=None)
train_tfidf  = tfidf_vectorizer.fit_transform(train_data['Text'])
test_tfidf  = tfidf_vectorizer.transform(test_data['Text'])

In [50]:
model = SVC(C=3.5)
model.fit(train_tfidf, train_data['Response'])
print(metrics.classification_report(test_data['Response'], model.predict(test_tfidf)))

              precision    recall  f1-score   support

           0       0.69      0.74      0.72       800
           1       0.61      0.56      0.59       593

    accuracy                           0.66      1393
   macro avg       0.65      0.65      0.65      1393
weighted avg       0.66      0.66      0.66      1393



In [25]:
bow_vectorizer = CountVectorizer(lowercase=False, tokenizer=preprocess_text, token_pattern=None)
train_bow  = bow_vectorizer.fit_transform(train_data['Text'])
test_bow  = bow_vectorizer.transform(test_data['Text'])

In [26]:
model = SVC(C=3.5)
model.fit(train_bow, train_data['Response'])
print(metrics.classification_report(test_data['Response'], model.predict(test_bow)))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       818
           1       0.63      0.55      0.59       575

    accuracy                           0.68      1393
   macro avg       0.67      0.66      0.66      1393
weighted avg       0.68      0.68      0.68      1393



In [51]:
from gensim.models import KeyedVectors

wv_from_bin = KeyedVectors.load_word2vec_format("model_lwcase_no_diac.bin", binary=True)

In [52]:
train_data['Text'] = train_data['Text'].apply(preprocess_text)
test_data['Text'] = test_data['Text'].apply(preprocess_text)

In [54]:
import pandas as pd
import numpy as np

import json
from nltk.stem import PorterStemmer

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, GlobalMaxPool1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_data['Text']))
total_words = len(tokenizer.word_index) + 1

#%%
train_input_seq = tokenizer.texts_to_sequences(train_data['Text'])
test_input_seq = tokenizer.texts_to_sequences(test_data['Text'])

max_sequence_length = max(len(seq) for seq in train_input_seq)

train_padded_seq = pad_sequences(train_input_seq, maxlen=max_sequence_length)
test_padded_seq = pad_sequences(test_input_seq, maxlen=max_sequence_length)

#%%
embedding_dim = 300

embedding_matrix = np.zeros((total_words, embedding_dim))

for word, token in tokenizer.word_index.items():
        if word in wv_from_bin:
                embedding_matrix[token] = wv_from_bin[word]

print("Embedding Matrix Shape:", embedding_matrix.shape)

#%%
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length))
model.add(Bidirectional(LSTM(30, return_sequences=True)))
model.add(Bidirectional(LSTM(30, return_sequences=True)))
model.add(Conv1D(30, 5, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=3),
             ModelCheckpoint("best_model_weights.keras", monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)]

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_padded_seq, train_data['Response'], epochs=10, verbose=1, validation_data=(test_padded_seq, test_data['Response']), callbacks=callbacks)
model.evaluate(test_padded_seq, test_data['Response'])

Embedding Matrix Shape: (14325, 300)
Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.64752, saving model to best_model_weights.keras
Epoch 2/10
Epoch 2: val_accuracy improved from 0.64752 to 0.66547, saving model to best_model_weights.keras
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.66547
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.66547
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.66547


[1.4890155792236328, 0.6346015930175781]

In [37]:
model.load_weights("best_model_weights.keras")
print(metrics.classification_report(test_data['Response'], (model.predict(test_padded_seq) >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.69      0.87      0.77       818
           1       0.70      0.45      0.55       575

    accuracy                           0.69      1393
   macro avg       0.70      0.66      0.66      1393
weighted avg       0.69      0.69      0.68      1393



def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    stemmed_tokens = [stemmer.stem(token) for token in text.split()]
    return stemmed_tokens

TF-IDF SVM 0.63, 0.63
BoW SVM 0.63, 0.62
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

TF-IDF SVM 0.65, 0.65
BoW SVM 0.64, 0.63
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]

    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return tokens
    
TF-IDF SVM 0.63, 0.63
BoW SVM 0.64, 0.62
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]

    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return tokens
    
TF-IDF SVM 0.65, 0.64
BoW SVM 0.65, 0.63
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]

    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens 
    
TF-IDF SVM 0.66, 0.66
BoW SVM 0.66, 0.65
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    tokens = [token for token in tokens if token not in string.punctuation]

    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

TF-IDF SVM 0.65, 0.65
BoW SVM 0.65, 0.65
--------------------------------------------
def preprocess_text(text):
    text = text.lower()
    #text = unidecode.unidecode(text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    tokens = [token for token in tokens if token not in string.punctuation]

    corrected_words = []
    for token in tokens:
        if pspell.lookup(token):
            for suggestion in pspell.suggest(token):
                corrected_words.append(suggestion)
                break
        else:
            corrected_words.append(stemmer.stem(token))
 
 TF-IDF SVM 0.63, 0.63 (50 min)
 ----------------------------------------------
 def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub('[^\w\s]', '', text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    #tokens = [token for token in tokens if token not in string.punctuation]

    # corrected_words = []
    # for token in tokens:
    #     if pspell.lookup(token):
    #         for suggestion in pspell.suggest(token):
    #             corrected_words.append(suggestion)
    #             break
    #     else:
    #         corrected_words.append(stemmer.stem(token))
    # corrected_words = [unidecode.unidecode(word) for word in corrected_words]
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
    
TF-IDF SVM 0.68, 0.68 (cu lemma 0.66)
BoW SVM 0.68 0.67

-------------------
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub('[^\w\s]', '', text)
    
    #tokens = word_tokenize(text)
    tokens = [token.text for token in nlp(text)]
    tokens = [s for s in tokens if not s.isspace()]
    tokens = [token for token in tokens if token not in stopwords.words('romanian')]
    #tokens = [token for token in tokens if token not in string.punctuation]

    # corrected_words = []
    # for token in tokens:
    #     if pspell.lookup(token):
    #         for suggestion in pspell.suggest(token):
    #             corrected_words.append(suggestion)
    #             break
    #     else:
    #         corrected_words.append(stemmer.stem(token))
    # corrected_words = [unidecode.unidecode(word) for word in corrected_words]
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

Word2Vec nisioi + RNN 0.69

cu lemma si 300 la bilstm -> 0.67
cu lemma si 30 la bilstm -> 0.66
