In [326]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import gensim
import nlpaug
import nlpaug.augmenter.word as naw
import keras
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import Adam

In [272]:
stop = list()
file = open('stopwords_pt.txt',encoding='utf-8')
for word in file.read().splitlines():
    stop.append(word)

In [295]:
data = pd.read_csv('OffComBR3.csv', sep=',')

In [274]:
aug_w2v = naw.WordEmbsAug(
    model_type='fasttext', model_path='cbow_s100.txt',
    action="substitute")

In [275]:
text = data.iloc[6]['tweet_text']
text

'Sejam honestos aprovem o projeto original vamos acabar com esta farra no Brasil'

In [279]:
aug_w2v.aug_p=0.2
print("Augmented Text:")
for ii in range(5):
    augmented_text = aug_w2v.augment(text)
    print(augmented_text)

Augmented Text:
Sejam honestos aprovem o projeto original -vamos acabar com quauficada farra no Brasil
Sejam honestos aprovem o projeto logográfica precisávamos acabar com esta farra no Brasil
Sejam honestos aprovem o projeto arquivístico-musical vamos grassar com esta farra no Brasil
Sejam honestos aprovem o projeto origial vamos acabar com esta palhaça no Brasil
Sejam conscienciosos aprovem o projeto original vamos acabar com esta farra outubro.num Brasil


In [276]:
data.shape

(1033, 2)

In [277]:
data.target.value_counts()

0    831
1    202
Name: target, dtype: int64

In [294]:
train,valid=train_test_split(data,test_size=0.20)
print('Shape of Train',train.shape)
print("Shape of Validation ",valid.shape)

Shape of Train (1306, 2)
Shape of Validation  (327, 2)


In [280]:
from sklearn.utils import shuffle

def augment_text(df,samples=300,pr=0.2):
    aug_w2v.aug_p=pr
    new_text=[]
    
    ##dropping samples from validation
    df_n=df[df.target==1].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet_text']
            augmented_text = aug_w2v.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet_text':new_text,'target':1})
    df=shuffle(df.append(new).reset_index(drop=True))
    return df
    

In [296]:
train = augment_text(train,samples=800)
data = train.append(valid).reset_index(drop=True)

100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [02:10<00:00,  4.60it/s]


In [282]:
data.target.value_counts()

0    831
1    802
Name: target, dtype: int64

In [297]:
def create_corpus(data):
    corpus=[]
    for tweet in tqdm(data['tweet_text']):
        words=[word.lower() for word in word_tokenize(tweet, language='portuguese') if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

In [298]:
corpus=create_corpus(data)

100%|████████████████████████████████████████████████████████████████████████████| 2233/2233 [00:00<00:00, 3787.11it/s]


In [299]:
embeddings_index = {}
f = open('cbow_s100.txt',encoding='utf-8')
for line in tqdm(f):
   	values = line.strip().rsplit(' ')
   	word = values[0]
   	coefs = np.asarray(values[1:], dtype='float32')
   	embeddings_index[word] = coefs
f.close()

929606it [01:33, 9896.47it/s] 


In [300]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [301]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 4651


In [302]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embeddings_index.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████████████████████████████████████████████████████████████████████| 4651/4651 [00:00<00:00, 155134.58it/s]


In [303]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [304]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 100)           465200    
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 545,701
Trainable params: 80,501
Non-trainable params: 465,200
_________________________________________________________________


In [305]:
X_train,y_train = tweet_pad[:train.shape[0]],data['target'][:train.shape[0]]
X_test,y_test= tweet_pad[train.shape[0]:],data['target'][train.shape[0]:]

In [309]:
history=model.fit(X_train,y_train,batch_size=8,epochs=10,validation_data=(X_test,y_test),verbose=2, shuffle=True)

Epoch 1/10
239/239 - 44s - loss: 0.5875 - accuracy: 0.6973 - val_loss: 0.6122 - val_accuracy: 0.6575
Epoch 2/10
239/239 - 41s - loss: 0.5570 - accuracy: 0.7219 - val_loss: 0.5781 - val_accuracy: 0.7064
Epoch 3/10
239/239 - 42s - loss: 0.5497 - accuracy: 0.7298 - val_loss: 0.5581 - val_accuracy: 0.7217
Epoch 4/10
239/239 - 44s - loss: 0.5358 - accuracy: 0.7282 - val_loss: 0.5460 - val_accuracy: 0.7401
Epoch 5/10
239/239 - 52s - loss: 0.5384 - accuracy: 0.7361 - val_loss: 0.5363 - val_accuracy: 0.7462
Epoch 6/10
239/239 - 44s - loss: 0.5352 - accuracy: 0.7429 - val_loss: 0.5280 - val_accuracy: 0.7492
Epoch 7/10
239/239 - 46s - loss: 0.5258 - accuracy: 0.7413 - val_loss: 0.5234 - val_accuracy: 0.7584
Epoch 8/10
239/239 - 42s - loss: 0.5229 - accuracy: 0.7466 - val_loss: 0.5134 - val_accuracy: 0.7676
Epoch 9/10
239/239 - 48s - loss: 0.5097 - accuracy: 0.7476 - val_loss: 0.5082 - val_accuracy: 0.7645
Epoch 10/10
239/239 - 44s - loss: 0.5068 - accuracy: 0.7602 - val_loss: 0.5038 - val_accura

In [311]:
y_pre=model.predict(X_test)
y_pre=np.round(y_pre).astype(int).reshape(327)

In [312]:
print(roc_auc_score(y_pre,y_test))

0.7748299319727892


In [327]:
print(cohen_kappa_score(y_test, y_pre))

0.5425549773375861
