In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Flatten
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


All tweets 

In [2]:
data_embeddings1 = pd.read_csv("tweets_50k_all.csv", header = None, index_col = False)
data_embeddings1.columns = ["index","text"]

data_embeddings2 = pd.read_csv("tweets_70k_all.csv", header = None, index_col = False)
data_embeddings2.columns = ["index","text"]

In [3]:
data_embeddings = pd.concat([data_embeddings1,data_embeddings2.iloc[1:]])

Labeled tweets

In [4]:
data = pd.read_csv("tweets_50k_labeled.csv", header = None, index_col = False)
data.columns = ["text","label"]

Cleaning all tweets

In [5]:
def cleaning1(tweet):
    return ' '.join(word for word in tweet.split(' ') if not word.startswith('#') and not word.startswith('@') and not "&gt" in word and not 'http' in word and not word.startswith('rt') and not "&amp" in word)

def cleaning2(tweet):
    return tweet.replace(",", " ").replace("..."," ").replace("\""," ").replace("/"," ").replace("."," ").replace(":"," ").replace("!"," ").replace("?"," ").replace(";"," ").replace("-"," ").replace("\r\n"," ").replace("  "," ")

def cleaning3(tweet):
    return tweet.replace("č","c").replace("ć","c").replace("š","s").replace("đ","dj").replace("ž","z")

def cleaning4(tweet):
    return tweet.replace("а","a").replace("б","b").replace("в","v").replace("г","g").replace("д","d").replace("ђ","dj").replace("е","e").replace("ж","z").replace("з","z").replace("и","i").replace("ј","j").replace("к","k").replace("л","l").replace("љ","lj").replace("м","m").replace("н","n").replace("њ","nj").replace("о","o").replace("п","p").replace("р","r").replace("с","s").replace("т","t").replace("ћ","c").replace("у","u").replace("ф","f").replace("х","h").replace("ц","c").replace("ч","c").replace("џ","dz").replace("ш","s")

def cleaning5(tweet):
    if "ā" in tweet or "y" in tweet or "ç" in tweet or "ы" in tweet or "й" in tweet or "ę" in tweet or "ż" in tweet or "ý" in tweet or "ě" in tweet or "á" in tweet or "щ" in tweet or "ь" in tweet or "ą" in tweet or "ю" in tweet or "w" in tweet or "ø" in tweet or "æ" in tweet or "å" in tweet or "я" in tweet:
        return("")
    else:
        return(tweet)
    
def tweet_cleaning(tweet):
    return cleaning5(cleaning4(cleaning3(cleaning2(cleaning1(tweet)))))

In [6]:
data.text = [tweet_cleaning(x.lower()) for x in np.array(data.text)]

In [7]:
data_embeddings.text = [tweet_cleaning(x.lower()) for x in np.array(data_embeddings.text)]

Learning Word2vec model (CBOW based)

In [8]:
sentences = list()
for tweet in data_embeddings['text']:
    sentences.append(tweet.split(sep = " "))

In [9]:
tweet_w2v_cbow = Word2Vec(
        sentences,
        size=300,
        window=5,
        min_count=1,
        workers=10)

In [10]:
tweet_w2v_cbow.most_similar("vucic")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('predsednik', 0.5322154760360718),
 ('on', 0.4874170422554016),
 ('veber', 0.4455837309360504),
 ('srbe', 0.4424019157886505),
 ('si…', 0.4303600490093231),
 ('omalovazavati', 0.4233503043651581),
 ('resenje', 0.41691774129867554),
 ('ribi', 0.4141949713230133),
 ('neko', 0.4055980145931244),
 ('mu', 0.39992427825927734)]

In [11]:
tweet_w2v_cbow.most_similar("djilas")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('vuk', 0.5657670497894287),
 ('jankovic', 0.5298805832862854),
 ('tadic', 0.46134862303733826),
 ('lesinari', 0.44987910985946655),
 ('doslovno', 0.44896531105041504),
 ('uteraju', 0.44756633043289185),
 ('rodoljubivi', 0.4253556430339813),
 ('trikovima', 0.42190003395080566),
 ('bosko', 0.4213244915008545),
 ('haradinaj', 0.4182480573654175)]

In [12]:
tweet_w2v_cbow.predict_output_word("granice ce biti")

[('sad', 0.00019485086),
 ('onda', 0.00012481447),
 ('medija', 0.00011449565),
 ('mozda', 8.968371e-05),
 ('dalje', 8.467133e-05),
 ('oruzjem', 8.122055e-05),
 ('zamrzavanjem', 7.9763886e-05),
 ('to', 7.756317e-05),
 ('zenu', 7.2660136e-05),
 ('pricao', 6.8405534e-05)]

In [13]:
tweet_w2v_cbow.most_similar("kosovo")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('kim', 0.586980938911438),
 ('presevo', 0.5148137211799622),
 ('kosova', 0.48257994651794434),
 ('lideralne', 0.4640258848667145),
 ('on', 0.46144890785217285),
 ('resenje', 0.4501580595970154),
 ('izjavom', 0.4423031806945801),
 ('tvorevina', 0.4342890679836273),
 ('moramo', 0.4251520037651062),
 ('predao', 0.4216660261154175)]

In [14]:
tweet_w2v_cbow.corpus_count

124022

Learning Word2vec model - Skipgram based

In [15]:
tweet_w2v_skipgram = Word2Vec(
        sentences,
        size=300,
        window=5,
        min_count=1,
        workers=10,
        sg = 1)

In [16]:
tweet_w2v_skipgram.most_similar("vucic")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('aleksandar', 0.451048880815506),
 ('pozeleo', 0.4146824777126312),
 ('odmogne', 0.4079938232898712),
 ('kukavica', 0.3963949978351593),
 ('@markodjuric', 0.3907366394996643),
 ('imao', 0.39064130187034607),
 ('istrajno', 0.39010918140411377),
 ('iznese', 0.3899855613708496),
 ('selakovic', 0.3899197280406952),
 ('spava', 0.3898516893386841)]

In [17]:
tweet_w2v_skipgram.most_similar("djilas")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('prevarant', 0.5462846159934998),
 ('pacoli', 0.5268086791038513),
 ('muljator', 0.5240795016288757),
 ('jeremic', 0.5074059963226318),
 ('unustili', 0.5064142942428589),
 ('svestenika', 0.5063611268997192),
 ('ljoticevac', 0.49912264943122864),
 ('ljimaj', 0.4910419285297394),
 ('tajkun', 0.49070703983306885),
 ('fra', 0.48984336853027344)]

In [18]:
pretrained_weights = tweet_w2v_cbow.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [19]:
max_features = vocab_size
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X)

Y = pd.get_dummies(data['label']).values

Training recurrent neural network

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1281, 59) (1281, 2)
(631, 59) (631, 2)


In [21]:
model = Sequential()
model.add(Embedding(input_dim = vocab_size,output_dim= embedding_size,input_length = X.shape[1], weights=[pretrained_weights]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(5, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 20, batch_size=batch_size, verbose = 2)

Epoch 1/20
 - 9s - loss: 0.7013 - acc: 0.5180
Epoch 2/20
 - 9s - loss: 0.6698 - acc: 0.5843
Epoch 3/20
 - 9s - loss: 0.6358 - acc: 0.6124
Epoch 4/20
 - 8s - loss: 0.6033 - acc: 0.6959
Epoch 5/20
 - 8s - loss: 0.5614 - acc: 0.7342
Epoch 6/20
 - 8s - loss: 0.5052 - acc: 0.7639
Epoch 7/20
 - 9s - loss: 0.4499 - acc: 0.8146
Epoch 8/20
 - 8s - loss: 0.3988 - acc: 0.8404
Epoch 9/20
 - 8s - loss: 0.3586 - acc: 0.8630
Epoch 10/20
 - 8s - loss: 0.2905 - acc: 0.8872
Epoch 11/20
 - 8s - loss: 0.2495 - acc: 0.9145
Epoch 12/20
 - 8s - loss: 0.2185 - acc: 0.9270
Epoch 13/20
 - 8s - loss: 0.1556 - acc: 0.9582
Epoch 14/20
 - 8s - loss: 0.1375 - acc: 0.9528
Epoch 15/20
 - 8s - loss: 0.1203 - acc: 0.9684
Epoch 16/20
 - 8s - loss: 0.0898 - acc: 0.9762
Epoch 17/20
 - 8s - loss: 0.0755 - acc: 0.9840
Epoch 18/20
 - 9s - loss: 0.0705 - acc: 0.9848
Epoch 19/20
 - 8s - loss: 0.0635 - acc: 0.9856
Epoch 20/20
 - 8s - loss: 0.0518 - acc: 0.9848


<keras.callbacks.History at 0x29005f4f358>

In [None]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

In [None]:
score, acc

In [None]:
print(model.summary())

In [None]:

model.save('model.h5') 

In [None]:
model = load_model('model.h5')


In [None]:
new_tweet=np.array(['Zemlja napreduje uz naseg predsednika','Ova naprednjacka vlast je katastrofa','Zuti lopovi opet hoce vlast'])
new_tweet=(tokenizer.texts_to_sequences(new_tweet))
new_tweet=pad_sequences(new_tweet,maxlen=X.shape[1])
new_tweet.shape

In [None]:
model.predict(new_tweet)

Training neural network using mean summarization

In [None]:
X1_mean_summarized = pd.DataFrame()
for sentence in X:
    X1_mean_summarized = X1_mean_summarized.append(pd.DataFrame(pretrained_weights[sentence[sentence > 0]].mean(axis = 0)).transpose())
X_mean_summarized = X1_mean_summarized.dropna()

In [None]:
X_mean_summarized_train, X_mean_summarized_test, Y_mean_summarized_train, Y_mean_summarized_test = train_test_split(X_mean_summarized,Y[X1_mean_summarized.max(axis=1)>0], test_size = 0.33, random_state = 42)

In [None]:
model = Sequential()
model.add(Dense(20,activation = "relu", input_dim = 300))
model.add(Dense(2,activation = "softmax"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
batch_size = 32
model.fit(X_mean_summarized_train, Y_mean_summarized_train, epochs = 100, batch_size=batch_size, verbose = 2)

In [None]:
score,acc = model.evaluate(X_mean_summarized_test, Y_mean_summarized_test, verbose = 2, batch_size = batch_size)

In [None]:
score, acc

Training neural network using max summarization

In [None]:
X1_max_summarized = pd.DataFrame()
for sentence in X[X1_mean_summarized.max(axis=1)>0]:
    X1_max_summarized = X1_max_summarized.append(pd.DataFrame(pretrained_weights[sentence[sentence > 0]].max(axis = 0)).transpose())
X_max_summarized = X1_max_summarized.dropna()

In [None]:
X_max_summarized_train, X_max_summarized_test, Y_max_summarized_train, Y_max_summarized_test = train_test_split(X_max_summarized,Y[X1_mean_summarized.max(axis=1)>0], test_size = 0.33, random_state = 42)

In [None]:
model = Sequential()
model.add(Dense(20,activation = "relu", input_dim = 300))
model.add(Dense(2,activation = "softmax"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
print(model.summary())

In [None]:
batch_size = 32
model.fit(X_max_summarized_train, Y_max_summarized_train, epochs = 100, batch_size=batch_size, verbose = 2)

In [None]:
score,acc = model.evaluate(X_max_summarized_test, Y_max_summarized_test, verbose = 2, batch_size = batch_size)

In [None]:
score,acc