In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load your dataset into a pandas DataFrame
# Assuming your dataset is in a CSV file named 'dataset.csv'
dataset = pd.read_csv('train.csv')
dataset = dataset.dropna(subset=['content'])

glove_path = 'glove.6B.300d.txt'

In [124]:
# Lowercase all the words
dataset['content'] = dataset['content'].str.lower()

In [125]:
# Remove Emojis
import re

def remove_emojis(text):
    if text is None:
        return None
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002500-\U00002BEF"
                           u"\U00002702-\U000027B0"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"  # dingbats
                           u"\u3030"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [126]:
# Clean stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    if text is None:
        return None
    words = text.split()
    az_stopwords = stopwords.words('azerbaijani')
    filtered_text = ' '.join([word for word in words if word not in az_stopwords])
    return filtered_text if filtered_text.strip() != '' else None

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/merturhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
# Clean text
import math

def clean_text(text):
    if text is None or isinstance(text, float) and math.isnan(text):
        return None
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zğüşıöçəĞÜŞİÖÇƏ\s]', ' ', text)  # Replace punctuations with whitespace
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b\d+\b', ' ', text)
    return text.strip() if text.strip() != '' else None



In [128]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


pipeline = Pipeline([
    ('cleaner', FunctionTransformer(lambda x: x.apply(clean_text))),
    ('stopwords_remover', FunctionTransformer(lambda x: x.apply(remove_stopwords))),
    ('cleaner_emojies', FunctionTransformer(lambda x: x.apply(remove_emojis))),
    ('dropna', FunctionTransformer(lambda x: x.dropna().reset_index(drop=True)))
])

processed_data = pipeline.fit_transform(dataset['content']).dropna().reset_index(drop=True)

dataset['content'] = processed_data
dataset = dataset.dropna().reset_index(drop=True)

In [129]:
print(len(dataset))

114934


In [130]:
def duplicate_rows(row):
    return pd.concat([row] * row['upvotes'], ignore_index=True)

print(len(dataset))

repeated_indices = dataset.index.repeat(dataset['upvotes'] + 1) 

# Rows with more than 0 upvotes will be repeated 'upvotes' times
dataset = dataset.loc[repeated_indices].reset_index(drop=True) 

114934


In [131]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [132]:
# 'score' sütununu 5'e bölelim
dataset['sentiment'] = dataset['score'] / 5

# Sentiment değerlerini güncelleyelim
dataset['sentiment'] = dataset['sentiment'].apply(lambda x: 0 if x < 0.5 else 1)

print(dataset['sentiment'][:20])

0     1
1     0
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    0
11    1
12    1
13    1
14    0
15    1
16    1
17    1
18    1
19    1
Name: sentiment, dtype: int64


In [133]:
ratings = dataset['sentiment'].values.tolist()
reviews = dataset['content'].values.tolist()

In [134]:
# Split the dataset into training and testing sets
split_point = int(len(reviews) * 0.80)
train_reviews, test_reviews = reviews[:split_point], reviews[split_point:]
train_ratings, test_ratings = ratings[:split_point], ratings[split_point:]

In [135]:
# Tokenize the text data
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)

In [136]:
# Display the word index
tokenizer.word_index

{'əla': 1,
 'super': 2,
 'cox': 3,
 'ela': 4,
 'gözəl': 5,
 'salam': 6,
 'i': 7,
 'pis': 8,
 'proqramdır': 9,
 'praqram': 10,
 'gozel': 11,
 'edirəm': 12,
 'proqram': 13,
 'yaxsidi': 14,
 'olmur': 15,
 'əsəb': 16,
 'oldum': 17,
 'əladı': 18,
 'gicdiyir': 19,
 'sdemir': 20,
 'kankiret': 21,
 'yarata': 22,
 'bilmirsuzse': 23,
 'yaratmiyinda': 24,
 'bəyəndim': 25,
 'ama': 26,
 'rahat': 27,
 'pul': 28,
 'yaxsi': 29,
 'eladi': 30,
 'əladır': 31,
 'superdi': 32,
 've': 33,
 'deyir': 34,
 'verir': 35,
 'bank': 36,
 'yoxdu': 37,
 'ələ': 38,
 'bilmirəm': 39,
 'men': 40,
 'edirem': 41,
 'her': 42,
 'proqramdi': 43,
 'ulduz': 44,
 'yaxşıdı': 45,
 'kart': 46,
 'olmasa': 47,
 'tez': 48,
 'proqramdı': 49,
 'olsa': 50,
 'ne': 51,
 'edin': 52,
 'daxil': 53,
 'bəyədim': 54,
 'yaxşıdır': 55,
 'şey': 56,
 'təşəkkürlər': 57,
 'mükəmməl': 58,
 'işləmir': 59,
 'bilmirem': 60,
 'allah': 61,
 'qeydiyyatdan': 62,
 'ola': 63,
 'necə': 64,
 'gəlmir': 65,
 'normal': 66,
 'nece': 67,
 'internet': 68,
 'edə': 69,
 

In [137]:
# Convert text data to sequences
train_tokens = tokenizer.texts_to_sequences(train_reviews)

In [138]:
# Display tokenized data
print(train_reviews[800])
print(train_tokens[800])

super
[2]


In [139]:
# Tokenize test data
test_tokens = tokenizer.texts_to_sequences(test_reviews)

In [140]:
import numpy as np

# Calculate and display token statistics
num_tokens = [len(tokens) for tokens in train_tokens + test_tokens]
num_tokens = np.array(num_tokens)
print("Mean number of tokens:", np.mean(num_tokens))
print("Max number of tokens:", np.max(num_tokens))
print("Index of max tokens:", np.argmax(num_tokens))

Mean number of tokens: 3.7606986415742387
Max number of tokens: 177
Index of max tokens: 222932


In [141]:
# Set the maximum number of tokens based on mean and standard deviation
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))

In [142]:
max_tokens

13

In [143]:
# Pad the sequences
train_tokens_pad = pad_sequences(train_tokens, maxlen=max_tokens)
test_tokens_pad = pad_sequences(test_tokens, maxlen=max_tokens)

In [144]:
# Display padded data shapes
print("Train tokens shape:", train_tokens_pad.shape)
print("Test tokens shape:", test_tokens_pad.shape)

Train tokens shape: (202175, 13)
Test tokens shape: (50544, 13)


In [145]:
# Create a mapping from index to word
word_index = tokenizer.word_index
inverse_map = dict(zip(word_index.values(), word_index.keys()))

In [146]:
# Define a function to convert tokens back to text
def tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token != 0]
    text = ' '.join(words)
    return text

In [147]:
# Display an example of converting tokens to text
print(train_reviews[800])
print(tokens_to_string(train_tokens[800]))

super
super


In [148]:
# Build the sequential model
model = Sequential()

In [149]:
# Set embedding size
embedding_size = 50

In [150]:
# Add embedding layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    name='embedding_layer'))

In [151]:
# Add GRU layers
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))

In [152]:
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [153]:
# Display the model summary
model.summary()

In [154]:
# Train the model
train_ratings = np.array(train_ratings)
model.fit(train_tokens_pad, train_ratings, epochs=10, batch_size=256)

Epoch 1/10
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - accuracy: 0.8117 - loss: 0.4750
Epoch 2/10
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.8524 - loss: 0.3690
Epoch 3/10
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8617 - loss: 0.3414
Epoch 4/10
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8676 - loss: 0.3248
Epoch 5/10
[1m259/790[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m7s[0m 14ms/step - accuracy: 0.8721 - loss: 0.3103

In [None]:
# Evaluate the model on the test set
test_ratings = np.array(test_ratings)
evaluation_result = model.evaluate(test_tokens_pad, test_ratings)
print("Test accuracy:", evaluation_result[1])

[1m1580/1580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7788 - loss: 0.5417
Test accuracy: 0.7791429162025452


In [None]:
# Make predictions on a sample of data
sample_predictions = model.predict(x=test_tokens_pad[0:1000]).T[0]
predicted_classes = np.array([1.0 if p > 0.5 else 0.0 for p in sample_predictions])
true_classes = np.array(test_ratings[0:1000])

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [None]:
# Identify incorrect predictions
incorrect_predictions = np.where(predicted_classes != true_classes)[0]
print("Number of incorrect predictions:", len(incorrect_predictions))

Number of incorrect predictions: 220


In [None]:
# Display an example of incorrect prediction
sample_index = incorrect_predictions[0]
print("Index of incorrect prediction:", sample_index)
print("Text:", test_reviews[sample_index])
print("Predicted Rating:", sample_predictions[sample_index])
print("True Rating:", true_classes[sample_index])

Index of incorrect prediction: 17
Text: ela super
Predicted Rating: 0.71103257
True Rating: 0


In [122]:
# Make predictions on new sample texts
new_texts = ["Proqrama girmek olmur donub qalib bu ne meseledi?"]
new_tokens = tokenizer.texts_to_sequences(new_texts)
new_tokens_pad = pad_sequences(new_tokens, maxlen=max_tokens)
print("Predictions for new texts:\n", model.predict(new_tokens_pad))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Predictions for new texts:
 [[0.9297277]]


# GloVe

In [None]:
import tensorflow as tf

# Load the GloVe embeddings
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix
embedding_matrix = np.zeros((num_words, 300))
for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector



glove_model = Sequential([
    Embedding(num_words, 300, 
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              input_length=train_tokens_pad.shape[1], trainable=False),
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
    Dense(1, activation='sigmoid')
])

# Compile the model
glove_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_ratings = np.array(train_ratings)
glove_model.fit(train_tokens_pad, train_ratings, epochs=5, batch_size=256)

# Word2Vec

In [None]:
import scipy
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from gensim.models import KeyedVectors
 
# Load Word2Vec embeddings
model_path = 'GoogleNews-vectors-negative300.bin'
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)
 
embedding_matrix = np.zeros((num_words, word_vectors.vector_size))
for word, i in word_index.items():
    if i < num_words:
        if word in word_vectors:
            embedding_matrix[i] = word_vectors[word]

 
# Define the model
model = Sequential([
    Embedding(num_words, word_vectors.vector_size,
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              input_length=train_tokens_pad.shape[1], trainable=False),
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
    Dense(1, activation='sigmoid')
])
 
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
 
# Assuming train_tokens_pad and train_ratings are defined and properly shaped
train_ratings = np.array(train_ratings)
history = model.fit(train_tokens_pad, train_ratings, epochs=5, batch_size=256)

# SVD