In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Add a machine-specific directory to the locations NLTK searches for data.
nltk.data.path.append('C:\\Users\\Fatih\\Desktop\\EDU\\NLP\\NLTK')

# Download each NLTK resource listed below (same order as before).
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Read the review dataset from the working directory and preview the first rows.
df = pd.read_csv('IMDB_Dataset.csv')
df.head()



In [None]:

# English stopword list held as a set; preprocess_text uses it for membership tests.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused by preprocess_text for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw review string into a space-joined string of lemmas.

    Steps, in order: lowercase; replace HTML tags and ``http...`` URLs with a
    space; delete ASCII punctuation; replace remaining non-word characters and
    digits with a space; tokenize; drop English stopwords; lemmatize.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: The raw review text (must support ``.lower()``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    lowered = text.lower()

    # Strip markup first, then links; each match becomes a single space.
    without_markup = re.sub(r'http\S+', ' ', re.sub(r'<.*?>', ' ', lowered))

    # Punctuation is deleted outright (not replaced by a space), so words that
    # were separated only by punctuation end up joined together.
    punctuation_table = str.maketrans('', '', string.punctuation)
    unpunctuated = without_markup.translate(punctuation_table)

    # Any remaining non-word character or digit is turned into a space.
    letters_only = re.sub(r'\W|\d', ' ', unpunctuated)

    kept_lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

# Apply preprocessing: run preprocess_text on every entry of the 'review' column
# and store the result in a new 'cleaned_review' column.
df['cleaned_review'] = df['review'].apply(preprocess_text)


In [None]:
# Encode the label column as integers: 'positive' -> 1, anything else -> 0.
# The comparison is vectorised; int64 is requested explicitly so the dtype is
# the same on every platform.
df['sentiment_numeric'] = (df['sentiment'] == 'positive').astype('int64')

def evaluate_model_lstm(true_labels, predicted_probs, threshold=0.5):
    """Print accuracy, precision, recall and F1 for thresholded probabilities.

    Args:
        true_labels: Ground-truth binary labels (1 is the positive class).
        predicted_probs: Predicted probabilities; must support ``>=`` against a
            scalar and ``.astype`` on the result (e.g. a NumPy array).
        threshold: Probabilities at or above this value are labelled 1,
            everything else 0. Defaults to 0.5.

    Returns:
        None. The four metrics are printed, each rounded to two decimals.
    """
    # Anything at or above the cut-off counts as the positive class.
    hard_predictions = (predicted_probs >= threshold).astype(int)

    positive_class = {"pos_label": 1}
    acc = accuracy_score(true_labels, hard_predictions)
    prec = precision_score(true_labels, hard_predictions, **positive_class)
    rec = recall_score(true_labels, hard_predictions, **positive_class)
    f1_value = f1_score(true_labels, hard_predictions, **positive_class)

    for report_line in (
        f"Accuracy: {acc:.2f}",
        f"Precision: {prec:.2f}",
        f"Recall: {rec:.2f}",
        f"F1-Score: {f1_value:.2f}",
    ):
        print(report_line)
    


In [None]:
# GHC SUGGESTION Bow Unigram 3000
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Unigram (default): CountVectorizer's default ngram_range is (1, 1).
vectorizer_bow_unigram = CountVectorizer(max_features=3000)

# Bag-of-words count matrix, converted from sparse to a dense ndarray.
# `.toarray()` is used rather than `.todense()` because the latter returns a
# `numpy.matrix`, which NumPy no longer recommends.
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review']).toarray()

# 50/50 train/test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

# LSTM model:
# NOTE(review): Embedding looks up every input value as an integer index, so
# the per-term counts are used as indices here (they must stay below
# input_dim=5000) and the LSTM runs over one step per vocabulary column.
# Confirm this is the intended design.
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
lstm_model_bow.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.0))  # Set recurrent_dropout to 0
lstm_model_bow.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary label

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=128)

# Score the held-out half using the default 0.5 threshold.
print("LSTM with BoW using unigrams with 3000 features:")
predicted_probs = lstm_model_bow.predict(X_test)
evaluate_model_lstm(y_test, predicted_probs)

In [None]:
# GHC SUGGESTION Bow Bigram 3000
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Bigram: ngram_range=(1, 2) keeps both unigrams and bigrams, capped at the
# 3000 most frequent features.
vectorizer_bow_bigram = CountVectorizer(max_features=3000, ngram_range=(1, 2))

# Bag-of-words count matrix, converted from sparse to a dense ndarray.
# `.toarray()` is used rather than `.todense()` because the latter returns a
# `numpy.matrix`, which NumPy no longer recommends.
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review']).toarray()

# 50/50 train/test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

# LSTM model:
# NOTE(review): Embedding looks up every input value as an integer index, so
# the per-term counts are used as indices here (they must stay below
# input_dim=5000) and the LSTM runs over one step per vocabulary column.
# Confirm this is the intended design.
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
lstm_model_bow.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.0))  # Set recurrent_dropout to 0
lstm_model_bow.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary label

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=128)

# Score the held-out half using the default 0.5 threshold.
print("LSTM with BoW using bigrams with 3000 features:")
predicted_probs = lstm_model_bow.predict(X_test)
evaluate_model_lstm(y_test, predicted_probs)

In [None]:
# GHC SUGGESTION Bow Bigram 3000 - copy

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Bigram: ngram_range=(1, 2) keeps both unigrams and bigrams, capped at the
# 3000 most frequent features.
# NOTE(review): this cell repeats the bigram experiment with the same settings
# and seed; consider removing it if the repetition is not intentional.
vectorizer_bow_bigram = CountVectorizer(max_features=3000, ngram_range=(1, 2))

# Bag-of-words count matrix, converted from sparse to a dense ndarray.
# `.toarray()` is used rather than `.todense()` because the latter returns a
# `numpy.matrix`, which NumPy no longer recommends.
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review']).toarray()

# 50/50 train/test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

# LSTM model:
# NOTE(review): Embedding looks up every input value as an integer index, so
# the per-term counts are used as indices here (they must stay below
# input_dim=5000) and the LSTM runs over one step per vocabulary column.
# Confirm this is the intended design.
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
lstm_model_bow.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.0))  # Set recurrent_dropout to 0
lstm_model_bow.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary label

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=128)

# Score the held-out half using the default 0.5 threshold.
print("LSTM with BoW using bigrams with 3000 features:")
predicted_probs = lstm_model_bow.predict(X_test)
evaluate_model_lstm(y_test, predicted_probs)

In [None]:
# GHC SUGGESTION Bow Trigram 3000 
from sklearn.metrics import accuracy_score, precision_score, recall_score
  

# Trigram: ngram_range=(1, 3) keeps unigrams, bigrams and trigrams, capped at
# the 3000 most frequent features.
vectorizer_bow_trigram = CountVectorizer(max_features=3000, ngram_range=(1, 3))

# Bag-of-words count matrix, converted from sparse to a dense ndarray.
# `.toarray()` is used rather than `.todense()` because the latter returns a
# `numpy.matrix`, which NumPy no longer recommends.
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review']).toarray()

# Split the *trigram* features. The previous version split X_bow_bigram (a
# leftover from an earlier cell), so the trigram matrix was never used, and it
# then called .todense() on that already-dense matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

# LSTM model:
# NOTE(review): Embedding looks up every input value as an integer index, so
# the per-term counts are used as indices here (they must stay below
# input_dim=5000) and the LSTM runs over one step per vocabulary column.
# Confirm this is the intended design.
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
lstm_model_bow.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.0))  # Set recurrent_dropout to 0
lstm_model_bow.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary label

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=128)

# Score the held-out half using the default 0.5 threshold.
print("LSTM with BoW using trigrams with 3000 features:")
predicted_probs = lstm_model_bow.predict(X_test)
evaluate_model_lstm(y_test, predicted_probs)