In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import openpyxl
from pathlib import Path
import numpy as np
from gensim.models import KeyedVectors
from spacy.tokenizer import Tokenizer
from spacy.lang.tr import Turkish

def cleaner(text):
    """Strip mention-, hashtag-, retweet- and link-like tokens from ``text``.

    The text is split on whitespace. A token is discarded when it starts with
    '@', 'RT', '#', 'pic', 'http' or 'www', or when it ends with 'com' or
    'org'. Note that the prefix test is purely textual, so ordinary words that
    happen to start with one of those prefixes are discarded as well.

    Returns the surviving tokens, each followed by a single space (so any
    non-empty result ends with a trailing space); returns '' when nothing
    survives.
    """
    dropped_prefixes = ('@', 'RT', '#', 'pic', 'http', 'www')
    dropped_suffixes = ('com', 'org')

    kept_tokens = [
        token
        for token in text.split()
        if not token.startswith(dropped_prefixes)
        and not token.endswith(dropped_suffixes)
    ]
    # Every kept token carries its own trailing space.
    return ''.join(token + ' ' for token in kept_tokens)

# The model file lives on Google Drive, so the drive has to be mounted before
# the path below can be read. Mounting here keeps the notebook runnable from a
# fresh runtime; calling mount on an already-mounted drive only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# Pre-trained word2vec vectors stored in the binary word2vec format.
# NOTE(review): other cells assume 400-dimensional vectors — confirm for this model.
word_vectors = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab Notebooks/models/trmodel', binary=True)

def feature_extraction(text):
    """Return the sum of the word2vec vectors of the tokens in ``text``.

    ``text`` is tokenized with the module-level ``nlp`` callable, each token
    is lower-cased, and tokens missing from ``word_vectors`` are ignored. The
    result is a length-400 NumPy array; it is all zeros when no token is found.

    NOTE(review): ``nlp`` is not defined in the visible part of the notebook —
    confirm it is created before this function is called.
    """
    total = np.zeros(400)
    lowered_tokens = (tok.text.lower() for tok in nlp(text))
    for key in lowered_tokens:
        if key not in word_vectors:
            continue
        total += word_vectors[key]
    return total

# Spreadsheet with the dataset; later cells read its "tweet" and "label" columns.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/models/cleaned_2650.xlsx"
df = pd.read_excel(dataset_path)

In [None]:
# Mount Google Drive inside the Colab runtime. Cells above read files from
# this mount point, so the drive has to be mounted by the time they run.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Inputs (tweet texts) and targets (labels) as arrays.
sentences, y = df["tweet"].values, df["label"].values

# Two-stage split with a fixed seed so the partitions are reproducible:
#   1. hold out 15% of all rows as the test set;
#   2. hold out 18% of the remaining rows as the validation set.
split_seed = 1000
X_rest, X_test, y_rest, y_test = train_test_split(
    sentences, y, test_size=0.15, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.18, random_state=split_seed
)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization


# Text -> integer-sequence layer: vocabulary capped at 20000 tokens, every
# output sequence fixed to length 200.
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=200,
)

# The vocabulary is learned from the training texts only, fed in batches of 128.
train_texts = tf.data.Dataset.from_tensor_slices(X_train)
text_ds = train_texts.batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Vocabulary learned by the TextVectorization layer, and a token -> index map
# where the index is the token's position in that vocabulary.
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

# `word_vectors` is already a KeyedVectors object, so its arrays are read
# directly instead of through the deprecated `.wv` self-alias (which gensim 4
# removed together with `.vocab`).
w2vec_embeddings = word_vectors.vectors

# Words in the same order as the rows of `w2vec_embeddings`. Using the explicit
# index list avoids relying on dict ordering of the old `.vocab` mapping.
if hasattr(word_vectors, "index_to_key"):
    # gensim >= 4
    w2vec_vocab = list(word_vectors.index_to_key)
else:
    # gensim 3.x
    w2vec_vocab = list(word_vectors.index2word)

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [None]:
# Word -> embedding lookup. Each value is the corresponding row of
# `w2vec_embeddings` itself (not a Python-list copy of it): converting every
# row of a full word2vec model to a list is very slow and memory hungry, and
# the rows are only ever copied into `embedding_matrix` below.
embedding_index = dict(zip(w2vec_vocab, w2vec_embeddings))

# Two extra rows on top of the vectorizer vocabulary size.
num_tokens = len(voc) + 2
# Must match the width of the word2vec vectors.
embedding_dim = 400
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the pre-trained vector of the token
# whose vectorizer index is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
from tensorflow.keras.layers import Embedding
# Bind `keras` to the Keras that ships with TensorFlow so the initializer
# below, and the `keras.Input` / `keras.Model` calls in later cells, come from
# the same package as the `tensorflow.keras` layers used throughout.
from tensorflow import keras

# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False), so training does not update the word vectors.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
from tensorflow.keras import layers

# Variable-length sequences of integer token ids go in; the frozen embedding
# layer turns them into sequences of word vectors.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two Conv1D(128 filters, width 5) + MaxPooling1D(5) stages ...
x = embedded_sequences
for stage in range(2):
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)

# ... then a third convolution whose output is max-pooled over the whole
# sequence axis.
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Classification head: dense layer, dropout, then a 3-way softmax.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(3, activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 400)         3369200   
                                                                 
 conv1d_9 (Conv1D)           (None, None, 128)         256128    
                                                                 
 max_pooling1d_6 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 conv1d_10 (Conv1D)          (None, None, 128)         82048     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, None, 128)        0         
 1D)                                                       

In [None]:
# Wrap every text in its own single-element row, run the batch through the
# vectorizer, and pull the resulting integer sequences out as NumPy arrays.
train_column = np.array([[text] for text in X_train])
val_column = np.array([[text] for text in X_val])
x_train = vectorizer(train_column).numpy()
x_val = vectorizer(val_column).numpy()

# Labels as NumPy arrays for model.fit.
y_train, y_val = np.array(y_train), np.array(y_val)

In [None]:
# RMSprop optimizer with sparse categorical cross-entropy, tracking accuracy.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# Three epochs in batches of 128, evaluated on the validation split after
# each epoch; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_data=(x_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

In [None]:
# Vectorize the held-out test texts the same way as the train/validation
# texts: one single-element row per text, result converted to a NumPy array.
test_column = np.array([[text] for text in X_test])
x_test = vectorizer(test_column).numpy()

In [None]:
# Model outputs for the test set, one row per example.
predict_x = model.predict(x_test)
# Predicted class = index of the largest value in each row.
prediction = np.argmax(predict_x, axis=1)

In [None]:
from sklearn import metrics

# Confusion matrix of true test labels against predicted classes.
confusion = metrics.confusion_matrix(y_test, prediction)
print(confusion)

# Print the precision and recall, among other metrics, to three decimals.
report = metrics.classification_report(y_test, prediction, digits=3)
print(report)

[[84 43  6]
 [67 89 22]
 [17 25 39]]
              precision    recall  f1-score   support

           0      0.500     0.632     0.558       133
           1      0.567     0.500     0.531       178
           2      0.582     0.481     0.527        81

    accuracy                          0.541       392
   macro avg      0.550     0.538     0.539       392
weighted avg      0.547     0.541     0.540       392

