# Analiza sentymentu
Implementacja wzorowana na:
https://medium.com/@alyafey22/sentiment-classification-from-keras-to-the-browser-7eda0d87cdc6

In [270]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, RepeatVector, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import save_model
import csv 
import pandas as pd 
from IPython.display import display, HTML
import h5py
from random import shuffle

import re

import operator

### Wczytywanie danych

In [271]:
def load_dataset(file = 'dane_treningowe.csv'):
    with open(file, 'r', encoding='utf-8') as f:
        labels = []
        text = []

        lines = f.readlines()
    shuffle(lines)
    for line in lines:
        data = line.split(',')
        if len(data) == 2:
            labels.append(data[1])
            text.append(data[0].rstrip())
    return text,labels

In [272]:
x_train_text, y_train = load_dataset()
x_train_text = x_train_text[:5000]
y_train = y_train[:5000]
data_text = x_train_text

print(x_train_text[3026], "value: ", y_train[3026])

@brydielonie i miss you girlies lots value:  0



### Preprocessing
Eliminujemy:
* znaki interpunkcyjne (".", "?", itp), ponieważ nie niosą ze sobą wartości emocjonalnej

Zostawiamy:
* "#", "@" ponieważ słowa używane jako twitter handler/hasztag mogą mieć inne znaczenie niż same słowa

In [273]:
def process(txt):
    out = txt
    out = re.sub(r'[.,"!?:*_-]', '', txt)
    out = re.sub('&quot;', '', txt)
    out = out.split()
    out = [word.lower() for word in out]
    return out

In [274]:
print(process('"a" a! a. a, a? a:a a* &quot;a&quot;'))

['"a"', 'a!', 'a.', 'a,', 'a?', 'a:a', 'a*', 'a']


In [275]:
def tokenize(thresh = 2):
    count  = dict()
    idx = 1
    word_index = dict()
    for txt in data_text:
        words = process(txt)
        for word in words:
            if word in count.keys():
                count[word] += 1
            else:
                count[word]  = 1
    most_counts = [word for word in count.keys() if count[word]>=thresh]
    
    sorted_words = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    
    for word in most_counts:
        word_index[word] = idx
        idx+=1
    return word_index, sorted_words

In [276]:
def getMax(data):
    max_tokens = 0 
    for txt in data:
        if max_tokens < len(txt.split()):
            max_tokens = len(txt.split())
    return max_tokens

In [277]:
max_tokens = getMax(x_train_text)

In [278]:
def create_sequences(data):
    tokens = []
    for txt in data:
        words = process(txt)
        seq = [0] * max_tokens
        i = 0 
        for word in words:
            start = max_tokens-len(words)
            if word.lower() in word_index.keys():
                seq[i+start] = word_index[word]
            i+=1
        tokens.append(seq)        
    return np.array(tokens)

In [279]:
word_index, sorted_words = tokenize()
num_words = len(word_index) + 1
print('length of the dictionary ', len(word_index))

#print(sorted_words)

length of the dictionary  3506


In [280]:
x_train_tokens = create_sequences(x_train_text)

In [281]:
model = Sequential()
embedding_size = 8
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

model.add(LSTM(units=16, name = "gru_1",return_sequences=True))
model.add(LSTM(units=8, name = "gru_32" ,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=8, name = "gru_2" ,return_sequences=True))
model.add(LSTM(units=4, name= "gru_3"))
model.add(Dense(1, activation='sigmoid',name="dense_1"))
optimizer = Adam(lr=1e-3)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [282]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 33, 8)             28056     
_________________________________________________________________
gru_1 (LSTM)                 (None, 33, 16)            1600      
_________________________________________________________________
gru_32 (LSTM)                (None, 33, 8)             800       
_________________________________________________________________
dropout_4 (Dropout)          (None, 33, 8)             0         
_________________________________________________________________
gru_2 (LSTM)                 (None, 33, 8)             544       
_________________________________________________________________
gru_3 (LSTM)                 (None, 4)                 208       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total para

In [283]:
y_train_int = [int(i) for i in y_train]

In [284]:
model.fit(x_train_tokens, y_train_int, validation_split=0.05, epochs=5, batch_size=32)

Train on 4750 samples, validate on 250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1c857ef6470>

In [285]:
txt = ["I enjoyed the TV series breaking bad.","Terrible movie","that movie really sucks","I like that movie"]
print(create_sequences(txt)[0])
pred = model.predict(create_sequences(txt))
print('\n prediction for \n',pred[:,0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0   59 1243
   36  989 3326 2711 1278]

 prediction for 
 [0.69567317 0.52344674 0.75153387 0.37607855]
