In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [4]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from snowballstemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd

# Function for text preprocessing
def preprocess_text_turkish(text):
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text, language='turkish')  # Tokenize in Turkish
    stop_words = set(stopwords.words('turkish'))  # Turkish stopwords
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    stemmer = TurkishStemmer()  # Turkish stemmer
    tokens = [stemmer.stemWord(word) for word in tokens]  # Stemming
    return ' '.join(tokens)

# Load the dataset (adjust the path as needed)
data_path = 'Data.xlsx'  # Replace with your file path
data = pd.read_excel(data_path, sheet_name='Data')

# Preprocess the dataset
data['Clean_Comment_Turkish'] = data['Comment'].apply(preprocess_text_turkish)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Clean_Comment_Turkish'])
y = data['Topic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM classifier
svm_classifier = SVC()

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Classifier: Accuracy = {accuracy:.2f}")


SVM Classifier: Accuracy = 0.88


In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Assuming 'data' is your DataFrame with raw text and labels
X_raw = data['Comment']  # Replace with your column name
y_raw = data['Topic']  # Replace with your column name

# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_raw)
X_seq = tokenizer.texts_to_sequences(X_raw)
X_padded = pad_sequences(X_seq, padding='post', maxlen=50)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_raw)
y_categorical = to_categorical(y_encoded)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

# Neural Network Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=50),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_categorical.shape[1], activation='softmax')  # Use 'sigmoid' for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Use 'binary_crossentropy' for binary

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Neural Network Accuracy: {accuracy:.2f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.89


In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Türkçe için metin ön işleme adımları
def preprocess_text_turkish(text):
    text = text.lower()  # Küçük harfe dönüştürme
    text = text.translate(str.maketrans('', '', string.punctuation))  # Noktalama işaretlerini kaldırma
    tokens = word_tokenize(text, language='turkish')  # Tokenleme için Türkçe
    stop_words = set(stopwords.words('turkish'))  # Türkçe stop-word'leri alma
    tokens = [word for word in tokens if word not in stop_words]  # Stop-word'leri kaldırma
    stemmer = TurkishStemmer()  # Türkçe kök çıkarma
    tokens = [stemmer.stemWord(word) for word in tokens]  # Kök çıkarma
    return ' '.join(tokens)

# Load the dataset
file_path = 'Data.xlsx'
data = pd.read_excel(file_path, sheet_name='Data')

# Preprocess the dataset
data['Clean_Comment_Turkish'] = data['Comment'].apply(preprocess_text_turkish)

# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(data['Clean_Comment_Turkish'])
X_seq = tokenizer.texts_to_sequences(data['Clean_Comment_Turkish'])
X_padded = pad_sequences(X_seq, padding='post', maxlen=50)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['Topic'])
y_categorical = to_categorical(y_encoded)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

# Neural Network Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=50),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_categorical.shape[1], activation='softmax')  # Use 'sigmoid' for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Use 'binary_crossentropy' for binary

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# User input for prediction
new_comment = input("Enter a new comment: ")

# Preprocess the new comment
new_comment_clean = preprocess_text_turkish(new_comment)

# Tokenize and pad the new comment
new_comment_seq = tokenizer.texts_to_sequences([new_comment_clean])
new_comment_padded = pad_sequences(new_comment_seq, padding='post', maxlen=50)

# Predict with the model
probabilities = model.predict(new_comment_padded)[0]

# Display probabilities for each category
all_categories = label_encoder.classes_
for i, category in enumerate(all_categories):
    print(f"{category}: {probabilities[i]*100:.2f}%")

# Find the most likely category
most_likely_category = all_categories[np.argmax(probabilities)]
print(f"The probable category of the comment: {most_likely_category}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Enter a new comment: ürünün sesi çok fazla çıkmıyordu ve ebatı da çok küçültü iade ettim
ağırlık: 0.14%
hediye: 0.00%
kalite: 0.06%
kalıp/boy/ölçü: 23.01%
kullanımı kolay: 0.01%
renk: 15.84%
ses: 60.94%
çeyiz: 0.00%
The probable category of the comment: ses


In [38]:
import pandas as pd
import os
import contextlib
from openpyxl import load_workbook
from tensorflow.keras.preprocessing.sequence import pad_sequences

# İlk attığınız kodda tanımlanan modeli ve diğer gerekli nesneleri yükleme
# model, tokenizer, label_encoder gibi nesnelerin tanımlandığı varsayılıyor

# Input ve Output path'lerini belirleme
input_path = 'amazon_reviews.xlsx'
output_path = 'amazon_reviews_categorized.xlsx'

# Input Excel dosyasını okuma (başlık yoksa)
data = pd.read_excel(input_path, header=None)

# Yorumların bulunduğu sütunun indeksi (örneğin, ilk sütun için 0)
comment_column_index = 0

# Yorumları modelle etiketleme
def label_comment(comment):
    new_comment_clean = preprocess_text_turkish(comment)
    new_comment_seq = tokenizer.texts_to_sequences([new_comment_clean])
    new_comment_padded = pad_sequences(new_comment_seq, padding='post', maxlen=50)
    probabilities = model.predict(new_comment_padded)[0]

    # Olasılığı %35'un üstünde olan kategorileri bulma
    threshold = 0.35
    likely_categories = [label_encoder.classes_[i] for i, prob in enumerate(probabilities) if prob > threshold]

    # Kategorilere ait olasılıkları alarak yüzdeye çevirme
    category_probabilities = [f"{prob * 100:.2f}%" for i, prob in enumerate(probabilities) if label_encoder.classes_[i] in likely_categories]

    return likely_categories, category_probabilities

# Her yorum için etiket ekleme
data['Predicted_Topics'], data['Probabilities'] = zip(*data[comment_column_index].apply(label_comment))

# Excel'e yazdırma işlemi
with open(os.devnull, 'w') as nullfile:
    with contextlib.redirect_stdout(nullfile):
        writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
        for label in data['Predicted_Topics'].explode().unique():
            if isinstance(label, str):  # Check if label is already a string
                cleaned_label = label.replace("/", "_")  # Replace "/" with "_"
            else:
                cleaned_label = str(label)  # Convert non-string types to strings and then replace
                cleaned_label = cleaned_label.replace("/", "_")  # Replace "/" with "_"

            labeled_data = data[data['Predicted_Topics'].apply(lambda x: label in x)]
            labeled_data.to_excel(writer, sheet_name=cleaned_label, index=False, columns=[comment_column_index])

        # 'All' kısmında Comments ve Predicted Topics kolonları olsun
        all_data = data.explode('Predicted_Topics')[[comment_column_index, 'Predicted_Topics', 'Probabilities']]
        all_data.to_excel(writer, sheet_name='All', index=False, header=['Comments', 'Predicted Topics', 'Probabilities'])

        # Excel dosyasını kaydetme ve kapatma
        writer.save()

print(f"{output_path} dosyası başarıyla oluşturuldu ve kaydedildi.")


amazon_reviews_categorized.xlsx dosyası başarıyla oluşturuldu ve kaydedildi.


  writer.save()
