In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping





In [4]:
data = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1')

In [5]:
data = data.rename(columns={'0':'Sentiment',
                     '1467810369': 'Tweet ID',
                     'Mon Apr 06 22:19:45 PDT 2009':'Date',
                     'NO_QUERY':'Query',
                     '_TheSpecialOne_':'Username',
                     "@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D":'Tweet'})

In [6]:
def Sentiment_remap(x):
    if x == 0:
        return 0
    else:
        return 1

In [7]:
data['Sentiment'] = data['Sentiment'].apply(Sentiment_remap)

In [8]:
def username_removal(string):
    
    for i in range(string.count('@')): 
        username_begin_index = string.find('@')
        username_end_index = string[username_begin_index::].find(' ') + username_begin_index
    
        string = string.replace(string[username_begin_index:username_end_index+1],' ')
    
    return string                    

In [9]:
data['Tweet'] = data['Tweet'].apply(username_removal)

In [10]:
x = data['Tweet']
y = data['Sentiment']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.01, test_size = 0.01, stratify=y, random_state=7)

In [12]:
tokenizer = Tokenizer(num_words=10000)

In [13]:
tokenizer.fit_on_texts(x_train)

In [14]:
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [15]:
x_train_seq

[[43, 24, 320, 2, 278, 394, 8, 2394, 99, 3885, 433, 21, 47, 83, 41],
 [38, 449, 61, 7483, 7, 306, 389, 26],
 [38,
  159,
  61,
  7484,
  1,
  25,
  86,
  13,
  3,
  213,
  1573,
  9,
  25,
  20,
  1670,
  18,
  373,
  10,
  7,
  223,
  88,
  211,
  410,
  2746],
 [162, 41, 336, 23, 39, 18, 48, 1175, 10, 9, 1463, 247, 92, 220, 7485],
 [7486],
 [972, 192, 8, 42, 31, 133, 1, 66],
 [134, 1047, 401, 29, 24, 42, 5078, 7487, 13, 146, 89, 109, 4, 217, 153],
 [1,
  182,
  341,
  76,
  166,
  2,
  1952,
  7,
  775,
  71,
  4,
  131,
  478,
  52,
  4,
  314,
  12,
  304,
  5079],
 [1, 68, 19, 361, 166, 21, 227],
 [66, 177, 6, 720, 18, 7488, 49, 68, 155, 8, 11, 1301, 6, 73, 7489, 3213],
 [3886,
  23,
  3,
  868,
  311,
  10,
  2,
  35,
  97,
  1,
  48,
  11,
  260,
  2,
  1574,
  107,
  6,
  1,
  75,
  221,
  107],
 [1464, 29, 73, 1575, 663, 1, 172, 9, 21, 4, 3887],
 [8, 311, 10, 3, 7490, 2, 1573, 11, 352],
 [3, 457, 2, 7491, 105, 109, 1953, 1, 57, 16, 308, 105, 4, 895, 10, 373],
 [79, 1, 85, 17, 

In [16]:
max_len = 100
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding='post')


In [17]:
x_train_pad

array([[ 43,  24, 320, ...,   0,   0,   0],
       [ 38, 449,  61, ...,   0,   0,   0],
       [ 38, 159,  61, ...,   0,   0,   0],
       ...,
       [ 80,   0,   0, ...,   0,   0,   0],
       [ 48,  89,  52, ...,   0,   0,   0],
       [  3, 510,   8, ...,   0,   0,   0]])

# MODEL 1 - Potpuno povezane neuronske mreze



In [None]:
# import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.4),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
history = model.fit(x_train_pad, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

In [None]:
loss, accuracy = model.evaluate(x_test_pad, y_test)

In [None]:
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')