In [1]:
import pandas as pd
import re, string
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset

# Sadece gerekli olan NLTK kaynaklarını indiriyoruz
nltk.download('stopwords')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sagla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sagla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

# CSV dosyasını yükle
df = pd.read_csv("sentiment140_dataset.csv", header=None, encoding='latin-1')

# Orijinal sütunlara geçici isimler verelim
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'review']

# Sadece sentiment ve review sütunlarını al
df = df[['sentiment', 'review']]

# sentiment sütununu string'den integer'a çevir
df['sentiment'] = df['sentiment'].astype(int)

# İlk 5 satırı kontrol et
print(df.head())


   sentiment                                             review
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          0  is upset that he can't update his Facebook by ...
2          0  @Kenichan I dived many times for the ball. Man...
3          0    my whole body feels itchy and like its on fire 
4          0  @nationwideclass no, it's not behaving at all....


In [3]:
import re

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [11]:
import re
import string

def clean_text_basic(text):
    # Küçük harf yap
    text = text.lower()
    
    # HTML etiketlerini kaldır
    text = re.sub(r'<.*?>', '', text)
    
    # URL'leri kaldır
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # 🔹 Mentionları kaldır (@username)
    text = re.sub(r'@\w+', '', text)
    
    # Noktalama işaretlerini kaldır
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Emoji temizleme
    emoji_pattern = re.compile(
        "[" u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0" "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Fazla boşlukları temizle
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# POS bilgisi olmadan varsayılan isim (noun) kabul ederek sade lemmatization
def lemmatize_simple(text):
    words = text.split()  # burada tokenizer yerine split kullanıyoruz
    cleaned = [
        lemmatizer.lemmatize(word) for word in words
        if word.lower() not in stop_words and word.isalpha()
    ]
    return ' '.join(cleaned)

# Full temizlik + lemmatizasyon
def full_clean(text):
    text = clean_text_basic(text)
    text = lemmatize_simple(text)
    return text

# Uygula
df['cleaned_review'] = df['review'].apply(full_clean)

In [13]:
slang_dict = {"omg": "oh my god", "lol": "laugh out loud", "u": "you", "r": "are", "gr8": "great", "b4": "before", "asap": "as soon as possible", "btw": "", "fyi": "for your information", "idk": "i don't know", "imo": "in my opinion", "jk": "just kidding", "lmao": "laughing my ass off", "lmk": "let me know", "nvm": "nevermind", "np": "no problem", "rofl": "rolling on the floor laughing", "smh": "shaking my head", "tbh": "to be honest", "thx": "thanks", "ty": "thank you", "wth": "what the heck", "wtf": "what the f***", "yolo": "you only live once", "brb": "be right back", "gtg": "got to go", "btw": "by the way", "ttyl": "talk to you later", "ily": "I love you", "g2g": "got to go", "h8": "hate", "jk": "just kidding", "thx": "thanks", "ttyl": "talk to you later", "cya": "see you", "gg": "good game", "afk": "away from keyboard", "ez": "easy", "wb": "welcome back", "idc": "i don't care", "rn": "right now", "lmk": "let me know", "ikr": "i know right", "tmi": "too much information", "smh": "shaking my head", "w/e": "whatever", 
                 "bff": "best friends forever", "tfw": "that feeling when", "np": "no problem", "nvm": "nevermind", "fyi": "for your information", "cuz": "because", "gonna": "going to", "wanna": "want to", "gotta": "got to", "kinda": "kind of", "sorta": "sort of", "dunno": "don't know", "ain't": "is not", "gimme": "give me", "lemme": "let me", "im": "I'm", "hes": "he's", "shes": "she's", "theyre": "they're", "youre": "you're", "havent": "have not", "cant": "cannot",
                  "couldnt": "could not", "didnt": "did not", "doesnt": "does not", "dont": "do not", "isnt": "is not", "mightnt": "might not", "mustnt": "must not", "shouldnt": "should not", "wasnt": "was not", "werent": "were not", "wouldnt": "would not", "tho": "though", "thru": "through", "nite": "night", "neva": "never", "sum": "some", "dat": "that", "dis": "this", "dem": "them", "dey": "they", "em": "them", "whered": "where did", "whod": "who did", "whos": "who's", "yall": "you all", "dam": "damn", "hell": "hell", "pissed": "pissed", "fck": "fuck", "fuk": "fuck", "effing": "effing", "bs": "bullshit", "crap": "crap", "shitty": "shitty", "wtf": "what the fuck", "stfu": "shut the fuck up", "gtfo": "get the fuck out", "irl": "in real life", "roflmao": "rolling on the floor laughing my ass off", "smol": "small", "big oof": "huge mistake", "yeet": "throw something forcefully", "pog": "play of the game", "sus": "suspicious", "cap": "lie", "no cap": "no lie", "bet": "okay", "lit": "amazing", "fr": "for real", "bruh": "bro", "fam": "family",
                    "goat": "greatest of all time", "lowkey": "somewhat", "highkey": "very", "vibe": "mood", "drip": "fashionable", "slay": "do something well", "tea": "gossip", "sksksk": "excited reaction", "tf": "the fuck", "fomo": "fear of missing out", "tldr": "too long didn't read", "hmu": "hit me up", "wyd": "what you doing", "wym": "what you mean", "wdym": "what do you mean", "btw": "by the way", "imo": "in my opinion", "idc": "i don't care", 
                  "idgaf": "i don't give a f***", "frfr": "for real for real", "mf": "motherfucker", "rn": "right now", "lolz": "laughs", "hmu": "hit me up", "hbu": "how about you", "rn": "right now", "lmk": "let me know", "ikr": "i know right", "jk": "just kidding", "fr": "for real", "ffs": "for fuck's sake", "btw": "by the way", "nfs": "not for sale", "dm": "direct message", "af": "as fuck", "idk": "i don't know"}

def correct_slangs(text, slang_dict):
    for slang, replacement in slang_dict.items():
        text = re.sub(rf'\b{slang}\b', replacement, text)
    return text

# Slang düzeltmeyi orijinal review'e uygula (henüz temizlenmemiş)
df['cleaned_review'] = df['cleaned_review'].apply(lambda x: correct_slangs(x.lower(), slang_dict))

In [4]:
# Etiketleri sayıya çevir: 0 → negatif, 4 → pozitif
df['label'] = df['sentiment'].map({0: 0, 4: 1})

# NaN olanları temizle (her ihtimale karşı)
df = df.dropna(subset=['cleaned_review', 'label'])

In [None]:
df.to_csv("cleaned_sentiment140.csv")

NameError: name 'df' is not defined

In [4]:
import pandas as pd
df=pd.read_csv("cleaned_sentiment140.csv")

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Özellik ve etiketleri ayır
X = df['cleaned_review'].values
y = df['label'].values

# %80 train, %20 test
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Kontrol
print("Veri şekilleri:")
print(f"Train: {x_train.shape}, Test: {x_test.shape}")


Veri şekilleri:
Train: (1280000,), Test: (320000,)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score

# TfidfVectorizer (Binary + max 5000 feature)
vectorizer = TfidfVectorizer(max_features=5000)

x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

# Model eğitme ve değerlendirme fonksiyonu
def train_and_evaluate_model(model, model_name):
    print(f"\n🔹 {model_name}")
    model.fit(x_train_vec, y_train)
    preds = model.predict(x_test_vec)
    # Skorlar
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    

In [7]:
train_and_evaluate_model(LogisticRegression(max_iter=1000), "Logistic Regression")


🔹 Logistic Regression
Accuracy: 0.7903
F1 Score: 0.7936


In [None]:
train_and_evaluate_model(KNeighborsClassifier(n_neighbors=3), "K-Nearest Neighbors")

In [None]:
# Random Forest
train_and_evaluate_model(RandomForestClassifier(n_estimators=100), "Random Forest")

In [None]:
# Multinomial Naive Bayes
train_and_evaluate_model(MultinomialNB(), "Multinomial Naive Bayes")

In [7]:
# Linear Support Vector Classifier (SVC'den çok daha hızlıdır)
from sklearn.svm import LinearSVC
train_and_evaluate_model(LinearSVC(), "Linear SVC")


🔹 Linear SVC
Accuracy: 0.7898
F1 Score: 0.7938


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenizer ayarları
max_words = 10000     # En sık kullanılan 10.000 kelime
max_len = 200         # Cümle uzunluğu sabitlenecek

# Tokenizer eğitimi
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

# Metinleri dizilere çevir
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Pad (doldurma)
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding='post', truncating='post')

# Etiketleri array yap
y_train_pad = np.array(y_train)
y_test_pad = np.array(y_test)

In [None]:
import tensorflow as tf

def create_model(model_type='rnn', 
                        embedding_dim=64, 
                        hidden_dim=64, 
                        dropout_rate=0.2):
    """
    Basit derin öğrenme modelleri oluşturmak için esnek fonksiyon.
    -------------------------------------------------------------
    model_type : 'rnn', 'lstm', 'gru', 'cnn'
    embedding_dim : Embedding katmanındaki vektör boyutu
    hidden_dim : RNN/GRU/LSTM katmanlarındaki gizli boyut 
                 veya CNN Conv1D 'filters' sayısı
    dropout_rate : Katman sonlarında Dropout oranı
    """

    model = tf.keras.Sequential()

    # Embedding katmanı (tüm modeller için ortak)
    model.add(tf.keras.layers.Embedding(
        input_dim=max_words,     # Global değişken, örn. 10000
        output_dim=embedding_dim,
        input_length=max_len     # Global değişken, örn. 200
    ))

    # Tek katmanlı RNN/LSTM/GRU/CNN
    if model_type == 'rnn':
        model.add(tf.keras.layers.SimpleRNN(hidden_dim, dropout=dropout_rate))
    elif model_type == 'lstm':
        model.add(tf.keras.layers.LSTM(hidden_dim, dropout=dropout_rate))
    elif model_type == 'gru':
        model.add(tf.keras.layers.GRU(hidden_dim, dropout=dropout_rate))
    elif model_type == 'cnn':
        model.add(tf.keras.layers.Conv1D(
            filters=hidden_dim,
            kernel_size=3,
            activation='relu'
        ))
        model.add(tf.keras.layers.GlobalMaxPooling1D())
    else:
        raise ValueError("model_type must be one of: 'rnn', 'lstm', 'gru', 'cnn'")

    # Opsiyonel Dropout (son katmandan sonra)
    model.add(tf.keras.layers.Dropout(dropout_rate))

    # Tek Dense katmanı
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    # Çıkış
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Model derleme
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [12]:
import tensorflow as tf
from sklearn.metrics import classification_report

def train_and_evaluate(model_type):
    print(f"\n📚 Training {model_type.upper()} model...")

    # Modeli oluştur
    model = create_model(model_type)

    # En iyi modeli kaydetmek için callback
    # 'val_accuracy' metrikte en iyi (maksimum) değeri gördüğünde kaydeder
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=f'best_{model_type}.keras',  # Dosya adı
        monitor='val_accuracy',           # izlenecek metrik
        mode='max',                           # en yüksek metrik değerini seç
        save_best_only=True,             # sadece en iyi modeli sakla
        verbose=1
    )

    # Eğitim
    history = model.fit(
        x_train_pad, y_train,
        epochs=5,
        batch_size=128,
        validation_split=0.2,
        verbose=1,
        callbacks=[checkpoint_callback]   # callback listesi
    )

    # Eğitim bittiğinde 'best_{model_type}.h5' dosyasında en iyi epoch modeli var
    # Şimdi o en iyi modeli yükleyelim
    best_model = tf.keras.models.load_model(f'best_{model_type}.keras')

    # Test verisinde tahmin
    y_pred = (best_model.predict(x_test_pad) > 0.5).astype("int32")

    # Rapor
    print(f"\n📊 {model_type.upper()} Classification Report (Best Epoch):")
    print(classification_report(y_test, y_pred))


In [None]:
for m in ['cnn', 'lstm','gru']:
    train_and_evaluate(m)


📚 Training CNN model...
Epoch 1/5
[1m7962/7962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.7592 - loss: 0.4887
Epoch 1: val_accuracy improved from -inf to 0.79129, saving model to best_cnn.keras
[1m7962/7962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 55ms/step - accuracy: 0.7592 - loss: 0.4887 - val_accuracy: 0.7913 - val_loss: 0.4444
Epoch 2/5
[1m7961/7962[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - accuracy: 0.8016 - loss: 0.4266
Epoch 2: val_accuracy improved from 0.79129 to 0.79489, saving model to best_cnn.keras
[1m7962/7962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m367s[0m 46ms/step - accuracy: 0.8016 - loss: 0.4266 - val_accuracy: 0.7949 - val_loss: 0.4379
Epoch 3/5
[1m7961/7962[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - accuracy: 0.8148 - loss: 0.4028
Epoch 3: val_accuracy did not improve from 0.79489
[1m7962/7962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 46ms/



Epoch 1/5
[1m3651/7962[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m42:56[0m 598ms/step - accuracy: 0.4993 - loss: 0.6933