In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Read the reviews from 'train.csv' into a DataFrame and keep only the rows
# whose 'content' column (the review text) is not missing.
dataset = pd.read_csv('train.csv').dropna(subset=['content'])


In [None]:
# Normalise case: convert every review text in 'content' to lower case
lowercased_content = dataset['content'].str.lower()
dataset['content'] = lowercased_content

In [None]:
# Remove Emojis
import re

# Compiled once when the cell runs instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional-indicator (flag) letters
    "\U00002500-\U00002BEF"  # box drawing ... miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE(review): the next range runs from U+24C2 up to U+1F251, so it also
    # matches every BMP character above U+24C2 (CJK, Hangul, ...). Latin and
    # Azerbaijani letters lie below it and are not affected.
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"
    "\u200d"                 # zero width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Strip emoji and pictographic characters from ``text``.

    Parameters
    ----------
    text : str or None
        Text to clean. ``None`` and float NaN (how pandas represents a
        missing value) are treated as "no text".

    Returns
    -------
    str or None
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed, or ``None`` when the input was missing. An input that
        consists only of emojis yields an empty string.
    """
    # `text != text` is only true for NaN; avoids needing the math module here.
    if text is None or (isinstance(text, float) and text != text):
        return None
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Clean stopwords
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's 'stopwords' corpus, which stopwords.words(...) reads from
nltk.download('stopwords')

_AZ_STOPWORDS = None  # filled on first use by _azerbaijani_stopwords()


def _azerbaijani_stopwords():
    """Return NLTK's Azerbaijani stop words as a frozenset, loading them once."""
    global _AZ_STOPWORDS
    if _AZ_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; the corpus file is
        # read only on the first call instead of once per review.
        _AZ_STOPWORDS = frozenset(stopwords.words('azerbaijani'))
    return _AZ_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Remove stop words from a whitespace-separated text.

    Parameters
    ----------
    text : str or None
        Text to filter. ``None`` and float NaN (how pandas represents a
        missing value) are treated as "no text".
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's Azerbaijani stop-word list is
        used (loaded once and cached).

    Returns
    -------
    str or None
        The remaining words joined by single spaces, or ``None`` when the
        input was missing or no word survived the filtering.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return None
    if stop_words is None:
        stop_words = _azerbaijani_stopwords()
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept_words) if kept_words else None

In [None]:
# Clean text
import math

def clean_text(text):
    """Reduce a review to lower-case Latin/Azerbaijani letters and single spaces.

    Steps, in order:
      1. repair the two-code-point form of a lower-cased 'İ';
      2. delete URLs (``http(s)://...`` and ``www....``);
      3. replace every character that is not a-z, an Azerbaijani letter or
         whitespace with a space (this also removes digits and punctuation);
      4. collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : str, None or float NaN
        Raw review text; ``None`` / NaN mean "missing".

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the input was missing or nothing
        is left after cleaning.
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return None
    # Python lower-cases 'İ' (U+0130) to 'i' followed by COMBINING DOT ABOVE
    # (U+0307). Without this repair the letter filter below would turn the
    # combining mark into a space and split the word in two.
    text = text.replace('i\u0307', 'i')
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zğüşıöçəĞÜŞİÖÇƏ\s]', ' ', text)  # Replace punctuations with whitespace
    # No separate digit-removal pass is needed: the substitution above has
    # already replaced every digit with a space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text or None



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


# Text-cleaning pipeline. Every step maps a Series of texts to a Series with
# the SAME index, so the result can be written back to the matching rows.
# Rows are deliberately not dropped inside the pipeline.
pipeline = Pipeline([
    ('cleaner', FunctionTransformer(lambda x: x.apply(clean_text))),
    ('stopwords_remover', FunctionTransformer(lambda x: x.apply(remove_stopwords))),
    ('cleaner_emojies', FunctionTransformer(lambda x: x.apply(remove_emojis))),
])

cleaned_content = pipeline.fit_transform(dataset['content'])

# Column assignment aligns on index labels. The cleaned Series still carries
# the DataFrame's own index here, so each text stays attached to its own row.
# Dropping NaNs and resetting the index BEFORE this assignment would shift
# texts onto other rows.
dataset['content'] = cleaned_content

# Reviews that became empty during cleaning are missing values now; drop them
# together with any other incomplete row, then renumber the rows.
# NOTE(review): dropna() without `subset` also drops rows that have a missing
# value in any other column - confirm that this is intended.
dataset = dataset.dropna().reset_index(drop=True)

# The cleaned, non-empty texts on their own with a fresh 0..n-1 index.
processed_data = cleaned_content.dropna().reset_index(drop=True)

In [None]:
# Number of rows left after cleaning
print(dataset.shape[0])

In [None]:
def duplicate_rows(row):
    """Concatenate ``row['upvotes']`` copies of ``row`` end to end.

    The copies are joined with ``pd.concat(..., ignore_index=True)``, so the
    result carries a fresh 0..n-1 index.

    NOTE(review): this helper is not called in the visible cells. With zero
    copies there is nothing to concatenate and ``pd.concat`` raises
    ``ValueError``.
    """
    copies = [row for _ in range(row['upvotes'])]
    return pd.concat(copies, ignore_index=True)

print(len(dataset))

# Weight reviews by their upvotes: each row ends up in the frame
# (upvotes + 1) times, i.e. a row with 0 upvotes is kept once and a row with
# k upvotes appears k + 1 times.
repeated_indices = dataset.index.repeat(dataset['upvotes'].add(1))
dataset = dataset.loc[repeated_indices].reset_index(drop=True)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Divide the 'score' column by 5
scaled_score = dataset['score'] / 5

# Turn the scaled score into a binary label: below 0.5 -> 0, otherwise -> 1
dataset['sentiment'] = scaled_score.apply(lambda value: 0 if value < 0.5 else 1)

In [None]:
# Plain Python lists: the labels ('sentiment') and the review texts ('content')
ratings, reviews = (
    dataset[column].values.tolist() for column in ('sentiment', 'content')
)

In [None]:
# Positional hold-out split (no shuffling): the first 80% of the samples are
# used for training, the remainder for testing.
split_point = int(0.80 * len(reviews))
train_reviews, train_ratings = reviews[:split_point], ratings[:split_point]
test_reviews, test_ratings = reviews[split_point:], ratings[split_point:]

In [None]:
# Tokenize the text data
# num_words caps the vocabulary the tokenizer uses when converting texts to
# sequences; it is also the Embedding layer's input_dim.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
# Learn the vocabulary from the training reviews only, so that word
# frequencies of the held-out test reviews do not leak into preprocessing.
tokenizer.fit_on_texts(train_reviews)

In [None]:
# Display the word index (the tokenizer's word -> integer id dictionary).
# This renders the whole dictionary as the cell output.
tokenizer.word_index

In [None]:
# Convert each training review into its list of integer token ids
train_tokens = tokenizer.texts_to_sequences(train_reviews)

In [None]:
# Show one training review next to its token ids (sample at position 800)
sample_index = 800
print(train_reviews[sample_index])
print(train_tokens[sample_index])

In [None]:
# Convert each test review into its list of integer token ids,
# using the same fitted tokenizer as for the training reviews
test_tokens = tokenizer.texts_to_sequences(test_reviews)

In [None]:
import numpy as np

# Sequence-length statistics over all reviews (training ones first, then test)
token_counts = [len(sequence) for sequence in train_tokens + test_tokens]
num_tokens = np.array(token_counts)
for label, statistic in (
    ("Mean number of tokens:", np.mean),
    ("Max number of tokens:", np.max),
    ("Index of max tokens:", np.argmax),
):
    print(label, statistic(num_tokens))

In [None]:
# Sequence length used for padding/truncation: mean length plus two standard
# deviations, truncated to an integer.
mean_length = num_tokens.mean()
std_length = num_tokens.std()
max_tokens = int(mean_length + 2 * std_length)

In [None]:
# Show the chosen maximum sequence length
max_tokens

In [None]:
# Bring every token sequence to exactly max_tokens entries
train_tokens_pad, test_tokens_pad = (
    pad_sequences(sequences, maxlen=max_tokens)
    for sequences in (train_tokens, test_tokens)
)

In [None]:
# Report the shapes of the padded arrays
train_shape, test_shape = train_tokens_pad.shape, test_tokens_pad.shape
print("Train tokens shape:", train_shape)
print("Test tokens shape:", test_shape)

In [None]:
# Invert the tokenizer vocabulary: integer token id -> word
word_index = tokenizer.word_index
inverse_map = {token_id: word for word, token_id in word_index.items()}

In [None]:
# Define a function to convert tokens back to text
def tokens_to_string(tokens):
    """Rebuild a text from token ids.

    Ids equal to 0 are skipped; every other id is looked up in the
    module-level ``inverse_map`` and the words are joined with single spaces.
    """
    return ' '.join(inverse_map[token] for token in tokens if token != 0)

In [None]:
# Round-trip check: a training review vs. the text rebuilt from its token ids
example_position = 800
print(train_reviews[example_position])
print(tokens_to_string(train_tokens[example_position]))

In [None]:
# Build the sequential model: start from an empty Sequential container;
# layers are appended to it with model.add(...)
model = Sequential()

In [None]:
# Set embedding size: length of the vector the Embedding layer produces
# for each token (passed to it as output_dim)
embedding_size = 50

In [None]:
# Add embedding layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

# Embedding layer: maps each of the num_words token ids to a vector of
# embedding_size values
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_size,
    name='embedding_layer',
)
model.add(embedding_layer)

In [None]:
# Recurrent part: three stacked GRU layers with 16, 8 and 4 units. The first
# two return their full output sequence so the following GRU receives one
# input per time step; the last one returns only its final output.
for recurrent_layer in (
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
):
    model.add(recurrent_layer)

# Output layer: a single sigmoid unit for the binary label
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss,
# accuracy reported as metric
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Display the model summary (prints Keras' overview of the layers added so far)
model.summary()

In [None]:
# Fit the model on the padded training sequences:
# 10 epochs, mini-batches of 256 samples
train_ratings = np.array(train_ratings)  # labels as a NumPy array
model.fit(x=train_tokens_pad, y=train_ratings, batch_size=256, epochs=10)