In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Register an extra directory in NLTK's resource search path.
# NOTE(review): this is an absolute path to one user's Windows desktop; it will not
# exist on other machines -- consider a relative or configurable location.
nltk.data.path.append('C:\\Users\\Fatih\\Desktop\\EDU\\NLP\\NLTK')
# Download the NLTK resources for this notebook.
# 'punkt_tab' is presumably the tokenizer data needed by word_tokenize -- TODO confirm.
nltk.download('punkt_tab')
# Stop-word lists, read via stopwords.words('english') in the cleaning cell.
nltk.download('stopwords')
# WordNet data, presumably required by WordNetLemmatizer -- TODO confirm.
nltk.download('wordnet')
# NOTE(review): no code visible in this section uses a POS tagger; confirm this
# download is still needed.
nltk.download('averaged_perceptron_tagger')

In [1]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see in this environment.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  0


In [None]:
# Read the IMDB reviews CSV (looked up relative to the notebook's working directory)
# and preview the first rows.
imdb_csv_path = 'IMDB_Dataset.csv'
df = pd.read_csv(imdb_csv_path)
df.head()

2. Data Cleaning: Perform standard text preprocessing: remove stop words, punctuation, and special
characters; lowercase the text; tokenize the reviews; and apply stemming or lemmatization.

In [None]:
# Shared helpers for preprocess_text: one lemmatizer instance and the English
# stop-word list held as a set, so each per-token membership test is a hash lookup.
lemmatizer = WordNetLemmatizer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def preprocess_text(text):
    """Clean one raw review and return it as a space-joined string of lemmas.

    Steps, in order: lowercase, replace HTML tags and URLs with a space, replace
    every run of punctuation / special characters / digits / underscores with a
    single space, tokenize, drop English stop words, lemmatize the rest.

    Args:
        text: The raw review. Non-string values (e.g. a NaN from a missing CSV
            cell) are treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; '' when nothing survives.
    """
    # A missing cell arrives as a float NaN; return an empty document instead of
    # failing on .lower().
    if not isinstance(text, str):
        return ''

    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Replace punctuation, special characters, digits and underscores with a space.
    # They must become whitespace rather than be deleted: deleting them glues
    # neighbouring words together ("great.the" -> "greatthe") and turns "don't"
    # into "dont", which the stop-word filter below does not recognise.
    text = re.sub(r'[\W\d_]+', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Run the cleaning function over every raw review and keep the result in a new column
cleaned_reviews = df['review'].map(preprocess_text)
df['cleaned_review'] = cleaned_reviews

In [None]:
# Encode the sentiment label as an integer column: 'positive' -> 1, anything else -> 0
df['sentiment_numeric'] = df['sentiment'].eq('positive').astype('int64')

In [None]:
# ALREADY DEFINED IN TASK 8
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels, pos_label='positive'):
    """Print accuracy, precision, recall and F1 for a binary classifier.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Predicted class labels (not probabilities), using the
            same label values as ``true_labels``.
        pos_label: The label value counted as the positive class for precision,
            recall and F1. Defaults to 'positive' (the string labels in
            ``df['sentiment']``); pass ``pos_label=1`` when scoring the 0/1
            ``sentiment_numeric`` encoding.

    Returns:
        None. The four metrics are printed, each rounded to two decimals.
    """
    metrics = {
        "Accuracy": accuracy_score(true_labels, predicted_labels),
        "Precision": precision_score(true_labels, predicted_labels, pos_label=pos_label),
        "Recall": recall_score(true_labels, predicted_labels, pos_label=pos_label),
        "F1-Score": f1_score(true_labels, predicted_labels, pos_label=pos_label),
    }
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.2f}")

In [None]:
import tensorflow as tf
# Count the GPUs TensorFlow detects before training the LSTM below.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
num_gpus = len(visible_gpus)
print("Num GPUs Available: ", num_gpus)

In [None]:
# LSTM trial on BoW features

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Unigram (default) bag-of-words counts, converted to a dense array before being
# handed to Keras.
vectorizer_bow_unigram = CountVectorizer(max_features=3000)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review']).toarray()

# The network is trained on the 0/1 encoding of the sentiment label.
X_train, X_test, y_train, y_test = train_test_split(X_bow_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42)


# LSTM model:
# NOTE(review): each input row is a vector of per-term counts, so the Embedding
# layer looks up count values rather than token ids, and the LSTM unrolls over all
# 3000 vocabulary columns. Confirm this is the intended experiment; the Tokenizer /
# pad_sequences imports above suggest an index-sequence input may have been planned.
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
lstm_model_bow.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))
lstm_model_bow.add(Dense(1, activation='sigmoid'))

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=128)

# predict() returns one sigmoid probability per review, not a class label, and
# evaluate_model scores string labels with 'positive' as the positive class.
# Threshold the probabilities at 0.5 and express both sides as string labels.
lstm_probabilities = lstm_model_bow.predict(X_test).ravel()
lstm_predicted_labels = ['positive' if p > 0.5 else 'negative' for p in lstm_probabilities]
y_test_labels = y_test.map({1: 'positive', 0: 'negative'})

print("LSTM with BoW:")
evaluate_model(y_test_labels, lstm_predicted_labels)



In [None]:
#BOW PART 1

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words baselines with a 3000-term vocabulary: three n-gram settings, each
# scored with Logistic Regression, a linear-kernel SVM and a Random Forest.
# NOTE(review): every vectorizer is fit on all reviews before the train/test split,
# so test documents influence which terms enter the vocabulary; fit on the training
# half only if a strict hold-out is required.
bow_max_features = 3000

# Unigram (default ngram_range)
vectorizer_bow_unigram = CountVectorizer(max_features=bow_max_features)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

# Trigram: ngram_range=(1, 3) keeps unigrams, bigrams and trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

# Check the feature matrix shapes
print("BoW Unigram Feature Shape:", X_bow_unigram.shape)
print("BoW Bigram Feature Shape:", X_bow_bigram.shape)
print("BoW Trigram Feature Shape:", X_bow_trigram.shape)

# Train and score the same three classifiers on each feature matrix. After the
# loop, X_train/X_test/y_train/y_test and clf_bow/svm_bow/rf_bow hold the trigram
# run, exactly as the last hand-written section did.
for ngram_label, X_bow in (
    ("unigrams", X_bow_unigram),
    ("bigrams", X_bow_bigram),
    ("trigrams", X_bow_trigram),
):
    # Split data into training and testing (same seed, so the same rows each time)
    X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment'], test_size=0.5, random_state=42)
    setting = f"with BoW using {ngram_label} with {bow_max_features} features:"

    # Train a Logistic Regression model
    clf_bow = LogisticRegression(max_iter=1000)
    clf_bow.fit(X_train, y_train)
    print(f"Logistic Regression {setting}")
    evaluate_model(y_test, clf_bow.predict(X_test))

    # Train a Support Vector Machine (SVM)
    svm_bow = SVC(kernel='linear')
    svm_bow.fit(X_train, y_train)
    print(f"SVM {setting}")
    evaluate_model(y_test, svm_bow.predict(X_test))

    # Train a Random Forest classifier; seeded so reruns give the same scores
    rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_bow.fit(X_train, y_train)
    print(f"Random Forest {setting}")
    evaluate_model(y_test, rf_bow.predict(X_test))


In [None]:
#BOW PART 2

# Bag-of-words baselines with a 5000-term vocabulary: three n-gram settings, each
# scored with Logistic Regression, a linear-kernel SVM and a Random Forest.
# The count matrices are kept sparse (no .toarray()).
# NOTE(review): every vectorizer is fit on all reviews before the train/test split,
# so test documents influence which terms enter the vocabulary; fit on the training
# half only if a strict hold-out is required.
bow_max_features = 5000

# Unigram (default ngram_range)
vectorizer_bow_unigram = CountVectorizer(max_features=bow_max_features)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

# Trigram: ngram_range=(1, 3) keeps unigrams, bigrams and trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

# Check the feature matrix shapes
print("BoW Unigram Feature Shape:", X_bow_unigram.shape)
print("BoW Bigram Feature Shape:", X_bow_bigram.shape)
print("BoW Trigram Feature Shape:", X_bow_trigram.shape)

# Train and score the same three classifiers on each feature matrix. After the
# loop, X_train/X_test/y_train/y_test and clf_bow/svm_bow/rf_bow hold the trigram
# run, exactly as the last hand-written section did.
for ngram_label, X_bow in (
    ("unigrams", X_bow_unigram),
    ("bigrams", X_bow_bigram),
    ("trigrams", X_bow_trigram),
):
    # Split data into training and testing (same seed, so the same rows each time)
    X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment'], test_size=0.5, random_state=42)
    setting = f"with BoW using {ngram_label} with {bow_max_features} features:"

    # Train a Logistic Regression model
    clf_bow = LogisticRegression(max_iter=1000)
    clf_bow.fit(X_train, y_train)
    print(f"Logistic Regression {setting}")
    evaluate_model(y_test, clf_bow.predict(X_test))

    # Train a Support Vector Machine (SVM)
    svm_bow = SVC(kernel='linear')
    svm_bow.fit(X_train, y_train)
    print(f"SVM {setting}")
    evaluate_model(y_test, svm_bow.predict(X_test))

    # Train a Random Forest classifier; seeded so reruns give the same scores
    rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_bow.fit(X_train, y_train)
    print(f"Random Forest {setting}")
    evaluate_model(y_test, rf_bow.predict(X_test))


In [None]:
#BOW PART 3

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words baselines with a 7000-term vocabulary: three n-gram settings, each
# scored with Logistic Regression, a linear-kernel SVM and a Random Forest.
# NOTE(review): every vectorizer is fit on all reviews before the train/test split,
# so test documents influence which terms enter the vocabulary; fit on the training
# half only if a strict hold-out is required.
bow_max_features = 7000

# Unigram (default ngram_range)
vectorizer_bow_unigram = CountVectorizer(max_features=bow_max_features)
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 2))
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

# Trigram: ngram_range=(1, 3) keeps unigrams, bigrams and trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=bow_max_features, ngram_range=(1, 3))
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

# Check the feature matrix shapes
print("BoW Unigram Feature Shape:", X_bow_unigram.shape)
print("BoW Bigram Feature Shape:", X_bow_bigram.shape)
print("BoW Trigram Feature Shape:", X_bow_trigram.shape)

# Train and score the same three classifiers on each feature matrix. After the
# loop, X_train/X_test/y_train/y_test and clf_bow/svm_bow/rf_bow hold the trigram
# run, exactly as the last hand-written section did.
for ngram_label, X_bow in (
    ("unigrams", X_bow_unigram),
    ("bigrams", X_bow_bigram),
    ("trigrams", X_bow_trigram),
):
    # Split data into training and testing (same seed, so the same rows each time)
    X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment'], test_size=0.5, random_state=42)
    setting = f"with BoW using {ngram_label} with {bow_max_features} features:"

    # Train a Logistic Regression model
    clf_bow = LogisticRegression(max_iter=1000)
    clf_bow.fit(X_train, y_train)
    print(f"Logistic Regression {setting}")
    evaluate_model(y_test, clf_bow.predict(X_test))

    # Train a Support Vector Machine (SVM)
    svm_bow = SVC(kernel='linear')
    svm_bow.fit(X_train, y_train)
    print(f"SVM {setting}")
    evaluate_model(y_test, svm_bow.predict(X_test))

    # Train a Random Forest classifier; seeded so reruns give the same scores
    rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_bow.fit(X_train, y_train)
    print(f"Random Forest {setting}")
    evaluate_model(y_test, rf_bow.predict(X_test))



