In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Flatten
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


All tweets 

In [2]:
data_embeddings1 = pd.read_csv("tweets_50k_all.csv", header = None, index_col = False)
data_embeddings1.columns = ["index","text"]

data_embeddings2 = pd.read_csv("tweets_70k_all.csv", header = None, index_col = False)
data_embeddings2.columns = ["index","text"]

In [3]:
data_embeddings = pd.concat([data_embeddings1,data_embeddings2.iloc[1:]])

Labeled tweets

In [4]:
data = pd.read_csv("tweets_50k_labeled.csv", header = None, index_col = False)
data.columns = ["text","label"]

Cleaning all tweets

In [5]:
def cleaning1(tweet):
    return ' '.join(word for word in tweet.split(' ') if not word.startswith('#') and not word.startswith('@') and not "&gt" in word and not 'http' in word and not word.startswith('rt') and not "&amp" in word)

def cleaning2(tweet):
    return tweet.replace(",", " ").replace("..."," ").replace("\""," ").replace("/"," ").replace("."," ").replace(":"," ").replace("!"," ").replace("?"," ").replace(";"," ").replace("-"," ").replace("\r\n"," ").replace("  "," ")

def cleaning3(tweet):
    return tweet.replace("č","c").replace("ć","c").replace("š","s").replace("đ","dj").replace("ž","z")

def cleaning4(tweet):
    return tweet.replace("а","a").replace("б","b").replace("в","v").replace("г","g").replace("д","d").replace("ђ","dj").replace("е","e").replace("ж","z").replace("з","z").replace("и","i").replace("ј","j").replace("к","k").replace("л","l").replace("љ","lj").replace("м","m").replace("н","n").replace("њ","nj").replace("о","o").replace("п","p").replace("р","r").replace("с","s").replace("т","t").replace("ћ","c").replace("у","u").replace("ф","f").replace("х","h").replace("ц","c").replace("ч","c").replace("џ","dz").replace("ш","s")

def cleaning5(tweet):
    if "ā" in tweet or "y" in tweet or "ç" in tweet or "ы" in tweet or "й" in tweet or "ę" in tweet or "ż" in tweet or "ý" in tweet or "ě" in tweet or "á" in tweet or "щ" in tweet or "ь" in tweet or "ą" in tweet or "ю" in tweet or "w" in tweet or "ø" in tweet or "æ" in tweet or "å" in tweet or "я" in tweet:
        return("")
    else:
        return(tweet)
    
def tweet_cleaning(tweet):
    return cleaning5(cleaning4(cleaning3(cleaning2(cleaning1(tweet)))))

In [6]:
data.text = [tweet_cleaning(x.lower()) for x in np.array(data.text)]

In [7]:
data_embeddings.text = [tweet_cleaning(x.lower()) for x in np.array(data_embeddings.text)]

Learning Word2vec model (CBOW based)

In [8]:
sentences = list()
for tweet in data_embeddings['text']:
    sentences.append(tweet.split(sep = " "))

In [9]:
tweet_w2v_cbow = Word2Vec(
        sentences,
        size=300,
        window=5,
        min_count=1,
        workers=10)

In [10]:
tweet_w2v_cbow.most_similar("vucic")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('predsednik', 0.5282397270202637),
 ('on', 0.44468262791633606),
 ('busas', 0.44397565722465515),
 ('ejjj', 0.42289167642593384),
 ('vuci', 0.41923248767852783),
 ('teniserima', 0.40883034467697144),
 ('pojedinac', 0.40017959475517273),
 ('bonus', 0.3976319134235382),
 ('pegovaranja', 0.3960815966129303),
 ('konfliktom', 0.3928396999835968)]

In [11]:
tweet_w2v_cbow.most_similar("djilas")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('vuk', 0.5676205158233643),
 ('jankovic', 0.530922532081604),
 ('tadic', 0.5125159025192261),
 ('fra', 0.4520730972290039),
 ('vidljivo', 0.4389224648475647),
 ('pricali', 0.4286049008369446),
 ('#koridor10', 0.42621392011642456),
 ('pobjegao', 0.42338794469833374),
 ('jeremic', 0.42196008563041687),
 ('dragoljub', 0.41910508275032043)]

In [30]:
tweet_w2v_cbow.predict_output_word("granice ce biti")

[('sad', 0.00018446485),
 ('onda', 0.0001317382),
 ('mozda', 0.000107097316),
 ('dalje', 0.00010132005),
 ('opet', 9.2706265e-05),
 ('to', 8.89448e-05),
 ('e', 7.846637e-05),
 ('pricao', 7.54253e-05),
 ('sada', 7.265975e-05),
 ('cak', 7.237755e-05)]

In [21]:
tweet_w2v_cbow.most_similar("kosovo")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('kim', 0.5982123613357544),
 ('kosova', 0.5309165716171265),
 ('albanci', 0.47003114223480225),
 ('nezavisnost', 0.4641479551792145),
 ('slobodu', 0.460249125957489),
 ('on', 0.4456906318664551),
 ('presevo', 0.42784178256988525),
 ('nadje', 0.42449072003364563),
 ('suverenitet', 0.42308902740478516),
 ('izjavom', 0.4158027172088623)]

In [22]:
tweet_w2v_cbow.corpus_count

124022

Learning Word2vec model - Skipgram based

In [23]:
tweet_w2v_skipgram = Word2Vec(
        sentences,
        size=300,
        window=5,
        min_count=1,
        workers=10,
        sg = 1)

In [24]:
tweet_w2v_skipgram.most_similar("vucic")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('aleksandar', 0.4614197015762329),
 ('hrabrosti', 0.42275193333625793),
 ('namerom', 0.40772008895874023),
 ('cestitao', 0.40470972657203674),
 ('iznese', 0.4021710455417633),
 ('decji', 0.4000522792339325),
 ('neuporedivo', 0.3986503779888153),
 ('ugnjaviti', 0.398624062538147),
 ('setio', 0.39699786901474),
 ('ciljevi', 0.39560672640800476)]

In [25]:
tweet_w2v_skipgram.most_similar("djilas")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('tajkun', 0.5409117341041565),
 ('jeremic', 0.5390247702598572),
 ('pacoli', 0.5037994384765625),
 ('obradovic', 0.49901652336120605),
 ('ljoticevac', 0.49853822588920593),
 ('gojkovica', 0.49484890699386597),
 ('svestenika', 0.48786216974258423),
 ('stefanovic', 0.4867296516895294),
 ('muljator', 0.483015239238739),
 ('milenko', 0.4827648997306824)]

In [27]:
pretrained_weights = tweet_w2v_cbow.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [28]:
max_features = vocab_size
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X)

Y = pd.get_dummies(data['label']).values

Training recurrent neural network

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1281, 59) (1281, 2)
(631, 59) (631, 2)


In [34]:
model = Sequential()
model.add(Embedding(input_dim = vocab_size,output_dim= embedding_size,input_length = X.shape[1], weights=[pretrained_weights]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(5, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [35]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 20, batch_size=batch_size, verbose = 2)

Epoch 1/20
 - 22s - loss: 0.7285 - acc: 0.4945
Epoch 2/20
 - 20s - loss: 0.6987 - acc: 0.5406
Epoch 3/20
 - 20s - loss: 0.6928 - acc: 0.5390
Epoch 4/20
 - 20s - loss: 0.6726 - acc: 0.6007
Epoch 5/20
 - 21s - loss: 0.6634 - acc: 0.5913
Epoch 6/20
 - 20s - loss: 0.6510 - acc: 0.6272
Epoch 7/20
 - 20s - loss: 0.6337 - acc: 0.6577
Epoch 8/20
 - 20s - loss: 0.5977 - acc: 0.7045
Epoch 9/20
 - 20s - loss: 0.5769 - acc: 0.7123
Epoch 10/20
 - 21s - loss: 0.5329 - acc: 0.7521
Epoch 11/20
 - 20s - loss: 0.4974 - acc: 0.7818
Epoch 12/20
 - 21s - loss: 0.4337 - acc: 0.8138
Epoch 13/20
 - 21s - loss: 0.3661 - acc: 0.8599
Epoch 14/20
 - 21s - loss: 0.3492 - acc: 0.8630
Epoch 15/20
 - 21s - loss: 0.2979 - acc: 0.8880
Epoch 16/20
 - 21s - loss: 0.2520 - acc: 0.9075
Epoch 17/20
 - 21s - loss: 0.2264 - acc: 0.9184
Epoch 18/20
 - 22s - loss: 0.1909 - acc: 0.9325
Epoch 19/20
 - 21s - loss: 0.1637 - acc: 0.9465
Epoch 20/20
 - 20s - loss: 0.1630 - acc: 0.9387


<keras.callbacks.History at 0x10ce6080>

In [36]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

In [37]:
score, acc

(0.6171741875718022, 0.7369255141108614)

In [7]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 59, 300)           14569800  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 59, 300)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 5)                 6120      
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 12        
Total params: 14,575,932
Trainable params: 14,575,932
Non-trainable params: 0
_________________________________________________________________
None


In [52]:

model.save('model.h5') 

In [3]:
model = load_model('model.h5')


In [23]:
new_tweet=np.array(['Zemlja napreduje uz naseg predsednika','Ova vlast je katastrofa','Zuti lopovi opet hoce vlast'])
new_tweet=(tokenizer.texts_to_sequences(new_tweet))
new_tweet=pad_sequences(new_tweet,maxlen=X.shape[1])
new_tweet.shape

(3, 59)

In [25]:
model.predict(new_tweet)

array([[0.00764186, 0.99235815],
       [0.83877677, 0.16122325],
       [0.12373045, 0.8762695 ]], dtype=float32)

Training neural network using mean summarization

In [24]:
X1_mean_summarized = pd.DataFrame()
for sentence in X:
    X1_mean_summarized = X1_mean_summarized.append(pd.DataFrame(pretrained_weights[sentence[sentence > 0]].mean(axis = 0)).transpose())
X_mean_summarized = X1_mean_summarized.dropna()

  This is separate from the ipykernel package so we can avoid doing imports until
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [26]:
X_mean_summarized_train, X_mean_summarized_test, Y_mean_summarized_train, Y_mean_summarized_test = train_test_split(X_mean_summarized,Y[X1_mean_summarized.max(axis=1)>0], test_size = 0.33, random_state = 42)

In [27]:
model = Sequential()
model.add(Dense(20,activation = "relu", input_dim = 300))
model.add(Dense(2,activation = "softmax"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [28]:
batch_size = 32
model.fit(X_mean_summarized_train, Y_mean_summarized_train, epochs = 20, batch_size=batch_size, verbose = 2)

Epoch 1/20
 - 0s - loss: 0.6659 - acc: 0.5789
Epoch 2/20
 - 0s - loss: 0.6189 - acc: 0.6484
Epoch 3/20
 - 0s - loss: 0.5944 - acc: 0.6744
Epoch 4/20
 - 0s - loss: 0.5812 - acc: 0.6894
Epoch 5/20
 - 0s - loss: 0.5620 - acc: 0.7084
Epoch 6/20
 - 0s - loss: 0.5503 - acc: 0.7170
Epoch 7/20
 - 0s - loss: 0.5379 - acc: 0.7313
Epoch 8/20
 - 0s - loss: 0.5252 - acc: 0.7368
Epoch 9/20
 - 0s - loss: 0.5227 - acc: 0.7313
Epoch 10/20
 - 0s - loss: 0.5109 - acc: 0.7407
Epoch 11/20
 - 0s - loss: 0.5039 - acc: 0.7486
Epoch 12/20
 - 0s - loss: 0.4924 - acc: 0.7589
Epoch 13/20
 - 0s - loss: 0.4917 - acc: 0.7502
Epoch 14/20
 - 0s - loss: 0.4764 - acc: 0.7794
Epoch 15/20
 - 0s - loss: 0.4715 - acc: 0.7715
Epoch 16/20
 - 0s - loss: 0.4682 - acc: 0.7597
Epoch 17/20
 - 0s - loss: 0.4588 - acc: 0.7865
Epoch 18/20
 - 0s - loss: 0.4494 - acc: 0.7928
Epoch 19/20
 - 0s - loss: 0.4445 - acc: 0.7873
Epoch 20/20
 - 0s - loss: 0.4387 - acc: 0.7920


<keras.callbacks.History at 0x1a7b6ac8>

In [29]:
score,acc = model.evaluate(X_mean_summarized_test, Y_mean_summarized_test, verbose = 2, batch_size = batch_size)

In [30]:
score, acc

(0.5935092767715454, 0.7008000004768371)

Training neural network using max summarization

In [31]:
X1_max_summarized = pd.DataFrame()
for sentence in X[X1_mean_summarized.max(axis=1)>0]:
    X1_max_summarized = X1_max_summarized.append(pd.DataFrame(pretrained_weights[sentence[sentence > 0]].max(axis = 0)).transpose())
X_max_summarized = X1_max_summarized.dropna()

In [32]:
X_max_summarized_train, X_max_summarized_test, Y_max_summarized_train, Y_max_summarized_test = train_test_split(X_max_summarized,Y[X1_mean_summarized.max(axis=1)>0], test_size = 0.33, random_state = 42)

In [12]:
model = Sequential()
model.add(Dense(20,activation = "relu", input_dim = 300))
model.add(Dense(2,activation = "softmax"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [14]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 20)                6020      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 42        
Total params: 6,062
Trainable params: 6,062
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
batch_size = 32
model.fit(X_max_summarized_train, Y_max_summarized_train, epochs = 100, batch_size=batch_size, verbose = 2)

Epoch 1/100
 - 0s - loss: 0.4827 - acc: 0.7778
Epoch 2/100
 - 0s - loss: 0.4811 - acc: 0.7699
Epoch 3/100
 - 0s - loss: 0.4749 - acc: 0.7810
Epoch 4/100
 - 0s - loss: 0.4717 - acc: 0.7849
Epoch 5/100
 - 0s - loss: 0.4623 - acc: 0.7952
Epoch 6/100
 - 0s - loss: 0.4833 - acc: 0.7778
Epoch 7/100
 - 0s - loss: 0.4648 - acc: 0.7810
Epoch 8/100
 - 0s - loss: 0.4760 - acc: 0.7747
Epoch 9/100
 - 0s - loss: 0.4707 - acc: 0.7794
Epoch 10/100
 - 0s - loss: 0.4684 - acc: 0.7881
Epoch 11/100
 - 0s - loss: 0.4545 - acc: 0.7857
Epoch 12/100
 - 0s - loss: 0.4586 - acc: 0.7897
Epoch 13/100
 - 0s - loss: 0.4697 - acc: 0.7762
Epoch 14/100
 - 0s - loss: 0.4483 - acc: 0.7936
Epoch 15/100
 - 0s - loss: 0.4690 - acc: 0.7668
Epoch 16/100
 - 0s - loss: 0.4380 - acc: 0.8086
Epoch 17/100
 - 0s - loss: 0.4370 - acc: 0.8039
Epoch 18/100
 - 0s - loss: 0.4566 - acc: 0.7865
Epoch 19/100
 - 0s - loss: 0.4467 - acc: 0.7904
Epoch 20/100
 - 0s - loss: 0.4346 - acc: 0.8062
Epoch 21/100
 - 0s - loss: 0.4395 - acc: 0.8047
E

<keras.callbacks.History at 0x415b93c8>

In [38]:
score,acc = model.evaluate(X_max_summarized_test, Y_max_summarized_test, verbose = 2, batch_size = batch_size)

In [39]:
score,acc

(0.7004182025909423, 0.7008000005722046)