In [161]:
import json
import tensorflow as tf
import csv
import random
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers


In [162]:
embedding_dim = 100
max_length = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size=3000
test_portion=.1

corpus = []

In [163]:
df = pd.read_csv('3000tweet.csv',sep=';',encoding="utf8" ,error_bad_lines=False)
df.head()

Unnamed: 0,tweet,durum
0,dun Turkcelle tepkilerimizden sonra bugün Turk...,1
1,girmezmiyim.. Turkcell kartim bile var.. Yarin...,1
2,tam tünelden gecerken 3g cekiyordu :D türkcell...,1
3,turkcell superonline fiber internet veya ADSL ...,1
4,bence Gnçtrkcll Ark Winterfest 2012'de 1.olur ...,1


In [164]:
num_sentences = 0
with open("3000tweet.csv", encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    for row in reader:
        list_item=[]
        list_item.append(row[0])
        this_label=row[1]
        if this_label=='1':
            list_item.append(0)
        else:
            list_item.append(1)
        num_sentences = num_sentences + 1
        corpus.append(list_item)

In [165]:
print(num_sentences)
print(len(corpus))
print(corpus[1])

3001
3001
['dun Turkcelle tepkilerimizden sonra bugün Turkcell twittera sponsor olmuş. Ne tesadüf değil mi ? :)', 0]


In [166]:
sentences=[]
labels=[]

random.shuffle(corpus)
for x in range(training_size):
    sentences.append(corpus[x][0])
    labels.append(corpus[x][1])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size=len(word_index)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
split = int(test_portion * training_size)
test_sequences = padded[0:split]
training_sequences = padded[split:training_size]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

In [167]:
print(vocab_size)
print(word_index['i'])

12415
116


In [168]:
embeddings_index = {};
with open('glove.6B.100d.txt' , encoding='utf-8') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

In [169]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 16, 100)           1241600   
_________________________________________________________________
dropout_9 (Dropout)          (None, 16, 100)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 12, 64)            32064     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 3, 64)             0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 1,306,753
Trainable params: 65,153
Non-trainable params: 1,241,600
_______________________________________

In [170]:
num_epochs = 50
history=model.fit(np.array(training_sequences), np.array(training_labels), epochs=num_epochs, validation_data=(np.array(test_sequences), np.array(test_labels)), verbose=2)

print("Training Complete")

Train on 2700 samples, validate on 300 samples
Epoch 1/50
2700/2700 - 3s - loss: 0.5729 - accuracy: 0.7422 - val_loss: 0.5507 - val_accuracy: 0.7467
Epoch 2/50
2700/2700 - 0s - loss: 0.5452 - accuracy: 0.7515 - val_loss: 0.5343 - val_accuracy: 0.7600
Epoch 3/50
2700/2700 - 0s - loss: 0.5157 - accuracy: 0.7600 - val_loss: 0.5374 - val_accuracy: 0.7567
Epoch 4/50
2700/2700 - 0s - loss: 0.4794 - accuracy: 0.7763 - val_loss: 0.5354 - val_accuracy: 0.7733
Epoch 5/50
2700/2700 - 0s - loss: 0.4223 - accuracy: 0.8030 - val_loss: 0.6065 - val_accuracy: 0.7733
Epoch 6/50
2700/2700 - 0s - loss: 0.3806 - accuracy: 0.8289 - val_loss: 0.5868 - val_accuracy: 0.7400
Epoch 7/50
2700/2700 - 0s - loss: 0.3324 - accuracy: 0.8556 - val_loss: 0.6859 - val_accuracy: 0.7500
Epoch 8/50
2700/2700 - 0s - loss: 0.2808 - accuracy: 0.8811 - val_loss: 0.7442 - val_accuracy: 0.7300
Epoch 9/50
2700/2700 - 0s - loss: 0.2433 - accuracy: 0.8944 - val_loss: 0.8044 - val_accuracy: 0.6700
Epoch 10/50
2700/2700 - 0s - loss: 