In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import seaborn as sns
from sklearn.pipeline import Pipeline



In [None]:
# Read the training data into a DataFrame.
# Later cells read its 'content', 'score' and 'upvotes' columns.
TRAIN_CSV_PATH = 'train.csv'
dataset = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
import re

# Emoji Cleaner Regex
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad: every code point from U+24C2 upward,
                             # which also covers CJK, Hangul, etc.
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    Args:
        text: String to clean, or ``None``.

    Returns:
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed, or ``None`` when ``text`` is ``None``. Latin letters and the
        Azerbaijani/Turkish letters (all below U+24C2) are left untouched.
    """
    if text is None:
        return None
    return _EMOJI_PATTERN.sub('', text)

def clean_text(text):
    """Reduce a review to lowercase Azerbaijani/Turkish letters and spaces.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (``http(s)://...`` and ``www....``);
      3. collapse every whitespace run (newlines, tabs, repeated spaces) into
         one space, so words on separate lines are not glued together when
         the next step deletes non-letter characters;
      4. delete every character that is not ``a-z``, one of ``ğüşıöçə``
         (or their uppercase forms), or a space.

    Args:
        text: String to clean, or ``None``.

    Returns:
        The cleaned string (not stripped), or ``None`` when ``text`` is
        ``None`` or only whitespace is left after cleaning.
    """
    if text is None:  # Same None contract as the other cleaning helpers
        return None
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zğüşıöçəĞÜŞİÖÇƏ ]', '', text)
    return text if text.strip() != '' else None

# Define the stopwords removal function
_TURKISH_STOPWORDS = None  # Filled on first use by _get_turkish_stopwords()


def _get_turkish_stopwords():
    """Return NLTK's Turkish stopword list as a frozenset, loading it only once."""
    global _TURKISH_STOPWORDS
    if _TURKISH_STOPWORDS is None:
        _TURKISH_STOPWORDS = frozenset(stopwords.words('turkish'))
    return _TURKISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a whitespace-separated string.

    Args:
        text: String to filter, or ``None``.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            ``'turkish'`` stopword list is used; it is loaded once and cached
            rather than re-read for every row.
            NOTE(review): ``clean_text`` keeps the Azerbaijani letter ``ə``,
            so the corpus may be Azerbaijani while this list is Turkish;
            confirm that is intended.

    Returns:
        The remaining words joined by single spaces, or ``None`` when
        ``text`` is ``None`` or no words remain.
    """
    if text is None:  # Check if the text is None
        return None
    if stop_words is None:
        stop_words = _get_turkish_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Set membership is O(1); a list would be scanned for every word.
        stop_words = frozenset(stop_words)
    filtered_text = ' '.join(word for word in text.split() if word not in stop_words)
    # A join of split() tokens has no surrounding whitespace, so an empty
    # string is the only "nothing left" case.
    return filtered_text if filtered_text != '' else None



# Drop rows whose content is missing before converting to str: str(NaN) is
# the literal text "nan", which would otherwise pass the filter below and be
# kept as a review.
dataset = dataset.dropna(subset=['content']).copy()

# Strip emojis and lowercase every review
dataset['content'] = dataset['content'].apply(lambda x: remove_emojis(str(x)).lower())

# Keep only rows that still contain at least one letter after clean_text()
# (clean_text returns None when nothing but whitespace is left)
has_text = dataset['content'].apply(lambda x: clean_text(x) is not None)

# Filter and renumber the index in one step, without in-place mutation, so
# re-running the cell is safe
dataset = dataset[has_text].reset_index(drop=True)

In [None]:
# Pull the three columns of interest out of the DataFrame as plain lists
contents, scores, upvotes = (
    dataset[column].values.tolist() for column in ('content', 'score', 'upvotes')
)

# Use the ggplot look for all following figures
plt.style.use('ggplot')

# One row, two panels: scores on the left, upvotes on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
score_ax, upvote_ax = axes

# Left panel: histogram of 'score' with bin edges 1..6
score_ax.hist(scores, bins=range(1, 7), edgecolor='black', color='skyblue')
score_ax.set(title='Distribution of Sentiment Scores', xlabel='Score', ylabel='Frequency')

# Right panel: histogram of 'upvotes' with bin edges 1..99
# (values outside that range are not drawn)
upvote_ax.hist(upvotes, bins=range(1, 100), edgecolor='black', color='lightgreen')
upvote_ax.set(title='Distribution of Upvotes', xlabel='Upvotes', ylabel='Frequency')

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

In [None]:
# Mean number of upvotes for each sentiment score
score_upvotes_means = dataset.groupby('score')['upvotes'].mean()

# Draw the bar chart on an explicit Axes
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=score_upvotes_means.index,
    y=score_upvotes_means.values,
    palette='viridis',
    ax=bar_ax,
)
bar_ax.set_title('Average Upvotes by Sentiment Score')
bar_ax.set_xlabel('Sentiment Score')
bar_ax.set_ylabel('Average Upvotes')
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

# Text-cleaning pipeline. Each step maps a pandas Series of strings to a
# Series of strings (or None where nothing is left after that step).
pipeline = Pipeline([
    ('cleaner', FunctionTransformer(lambda x: x.apply(clean_text))),
    ('stopwords_remover', FunctionTransformer(lambda x: x.apply(remove_stopwords))),
    ('cleaner_emojies', FunctionTransformer(lambda x: x.apply(remove_emojis))),
])

# Assuming 'dataset' is your DataFrame and it contains the 'content' column
processed_data = pipeline.fit_transform(dataset['content'])

# Filter out rows the pipeline emptied (None).
# The mask must come from processed_data, not from the raw dataset['content']
# column, which never holds None, so deriving it from there filters nothing.
valid_mask = processed_data.notna().to_numpy()
valid_indices = np.flatnonzero(valid_mask).tolist()
X = processed_data.iloc[valid_indices]  # Cleaned texts that survived
y = dataset['score'].iloc[valid_indices]  # Matching targets, same positions
assert len(X) == len(y)

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)




def tokenize_text_nltk(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` using ``language='turkish'``.

    Args:
        text: String to tokenize, or ``None``.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``, or ``None`` when
        ``text`` is ``None``.
    """
    return None if text is None else word_tokenize(text, language='turkish')

# Tokenize the training and test texts; tokenize_text_nltk passes None through
X_train_tokenized = X_train.apply(tokenize_text_nltk)
X_test_tokenized = X_test.apply(tokenize_text_nltk)

# Drop the None entries, using one mask per split so texts and labels stay aligned
train_has_tokens = X_train_tokenized.notnull()
test_has_tokens = X_test_tokenized.notnull()
X_train_filtered = X_train_tokenized[train_has_tokens]
y_train_filtered = y_train[train_has_tokens]
X_test_filtered = X_test_tokenized[test_has_tokens]
y_test_filtered = y_test[test_has_tokens]

# Show the first few tokenized examples
print("Tokenized Training Data Examples:", X_train_filtered.head())
print("Tokenized Test Data Examples:", X_test_filtered.head())

# Build the vocabulary from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_filtered)

# Map tokens to integer ids for both splits
X_train_sequences = tokenizer.texts_to_sequences(X_train_filtered)
X_test_sequences = tokenizer.texts_to_sequences(X_test_filtered)

# Pad at the end; the test split is padded/truncated to the training width
X_train_padded = pad_sequences(X_train_sequences, padding='post')
train_width = X_train_padded.shape[1]
X_test_padded = pad_sequences(X_test_sequences, padding='post', maxlen=train_width)

# Word -> id mapping, and vocabulary size plus one extra row for id 0
word_index = tokenizer.word_index
num_words = 1 + len(word_index)



# Implement models using pre-trained GloVe embeddings
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any other machine. Consider a configurable data directory.
# NOTE(review): glove.6B is presumably trained on English text, while the
# corpus is cleaned as Azerbaijani/Turkish; confirm how many vocabulary words
# actually get a vector (rows without a match stay all-zero).
GLOVE_PATH = '/Users/merturhan/Downloads/glove/glove.6B.100d.txt'
EMBEDDING_DIM = 100

# Load the GloVe embeddings: each line is "<word> <v1> <v2> ..."
embeddings_index = {}
with open(GLOVE_PATH, encoding='utf-8') as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Create an embedding matrix: row i holds the vector for the word with id i
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row >= num_words:
        continue  # id outside the matrix; skip
    vector = embeddings_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

# Embedding layer initialised from embedding_matrix and frozen (trainable=False)
glove_embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=X_train_padded.shape[1],
    trainable=False,
)

# Create a Sequential model: embedding -> two stacked GRUs -> dense head
model = Sequential()
for layer in (
    glove_embedding,
    GRU(128, return_sequences=True),  # emits the full sequence for the next GRU
    GRU(128),
    Dense(128, activation='relu'),
    # 6 softmax units; presumably scores are integers 1-5 so unit 0 is unused (TODO confirm)
    Dense(6, activation='softmax'),
):
    model.add(layer)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Display the model summary
model.summary()

# Train the model; the held-out test split doubles as validation data
history = model.fit(
    X_train_padded,
    y_train_filtered,
    epochs=3,
    batch_size=32,
    validation_data=(X_test_padded, y_test_filtered),
)

# Plot training vs. validation curves, one figure per metric
# (accuracy first, then loss)
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(12, 6))
    plt.plot(history.history[metric_key], label=f'Training {metric_label}', color='blue')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}', color='red')
    plt.title(f'Training and Validation {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

