In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import Word2Vec

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Load data
df = pd.read_csv('IMDB_Dataset.csv') 
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


2- Data Cleaning: Perform standard text preprocessing tasks, including: Removing stop words, punctuation, and special
characters, Lowercasing the text, Tokenizing the reviews, Stemming or lemmatization.

In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove special characters and digits
    text = re.sub(r'\W|\d', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatizing
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [8]:
# ALREADY DEFINED IN TASK 7

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing to all reviews
df['tokens'] = df['cleaned_review'].apply(tokenize_text)

In [9]:
# ALREADY DEFINED IN TASK 8

# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, pos_label='positive')
    recall = recall_score(true_labels, predicted_labels, pos_label='positive')
    f1 = f1_score(true_labels, predicted_labels, pos_label='positive')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [None]:
#WORD2VEC PART 1 - TUNING WINDOWS SIZE

# Pencere boyutu 3
word2vec_model_window_3 = Word2Vec(sentences=df['tokens'], vector_size=100, window=3, min_count=5, workers=4, epochs=10)
X_word2vec_window_3 = np.array([np.mean([word2vec_model_window_3.wv[word] for word in words if word in word2vec_model_window_3.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with window size 3:", X_word2vec_window_3.shape)

# Pencere boyutu 5 (varsayılan)
word2vec_model_window_5 = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)
X_word2vec_window_5 = np.array([np.mean([word2vec_model_window_5.wv[word] for word in words if word in word2vec_model_window_5.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with window size 5:", X_word2vec_window_5.shape)

# Pencere boyutu 7
word2vec_model_window_7 = Word2Vec(sentences=df['tokens'], vector_size=100, window=7, min_count=5, workers=4, epochs=10)
X_word2vec_window_7 = np.array([np.mean([word2vec_model_window_7.wv[word] for word in words if word in word2vec_model_window_7.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with window size 7:", X_word2vec_window_7.shape)


# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_window_3, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with window size 3")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_window_5, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with window size 5")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_window_7, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with window size 7")
evaluate_model(y_test, clf_word2vec.predict(X_test))


Word2Vec Feature Shape with window size 3: (50000, 100)
Word2Vec Feature Shape with window size 5: (50000, 100)
Word2Vec Feature Shape with window size 7: (50000, 100)
Logistic Regression model with word2vec with window size 3
Accuracy: 0.86
Precision: 0.86
Recall: 0.87
F1-Score: 0.87
Logistic Regression model with word2vec with window size 5
Accuracy: 0.87
Precision: 0.87
Recall: 0.88
F1-Score: 0.87
Logistic Regression model with word2vec with window size 7
Accuracy: 0.87
Precision: 0.87
Recall: 0.88
F1-Score: 0.87


'"\n# Train Word2Vec model\nword2vec_model = Word2Vec(sentences=df[\'tokens\'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)\nX_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in df[\'tokens\']])\n\n\n# Check Word2Vec features\nprint("Word2Vec Feature Shape:", X_word2vec.shape)\n\n# Split data into training and testing\nX_train, X_test, y_train, y_test = train_test_split(X_word2vec, df[\'sentiment\'], test_size=0.2, random_state=42)\n\n# Train a Logistic Regression model\nclf_word2vec = LogisticRegression(max_iter=1000)\nclf_word2vec.fit(X_train, y_train)\nprint("Logistic Regression model with word2vec")\nevaluate_model(y_test, clf_word2vec.predict(X_test))\n'

In [14]:
#WORD2VEC PART 2 - TUNING EMBEDDING DIMENTSION

# Gömme boyutu 50
word2vec_model_dim_50 = Word2Vec(sentences=df['tokens'], vector_size=50, window=5, min_count=5, workers=4, epochs=10)
X_word2vec_dim_50 = np.array([np.mean([word2vec_model_dim_50.wv[word] for word in words if word in word2vec_model_dim_50.wv] or [np.zeros(50)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with embedding dimension 50:", X_word2vec_dim_50.shape)

# Gömme boyutu 100 (varsayılan)
word2vec_model_dim_100 = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)
X_word2vec_dim_100 = np.array([np.mean([word2vec_model_dim_100.wv[word] for word in words if word in word2vec_model_dim_100.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with embedding dimension 100:", X_word2vec_dim_100.shape)

# Gömme boyutu 200
word2vec_model_dim_200 = Word2Vec(sentences=df['tokens'], vector_size=200, window=5, min_count=5, workers=4, epochs=10)
X_word2vec_dim_200 = np.array([np.mean([word2vec_model_dim_200.wv[word] for word in words if word in word2vec_model_dim_200.wv] or [np.zeros(200)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with embedding dimension 200:", X_word2vec_dim_200.shape)


# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_dim_50, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with embedding dimension 50")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_dim_100, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with embedding dimension 100")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_dim_200, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with embedding dimension 200")
evaluate_model(y_test, clf_word2vec.predict(X_test))



Word2Vec Feature Shape with embedding dimension 50: (50000, 50)
Word2Vec Feature Shape with embedding dimension 100: (50000, 100)
Word2Vec Feature Shape with embedding dimension 200: (50000, 200)
Logistic Regression model with word2vec with embedding dimension 50
Accuracy: 0.85
Precision: 0.85
Recall: 0.86
F1-Score: 0.86
Logistic Regression model with word2vec with embedding dimension 100
Accuracy: 0.87
Precision: 0.86
Recall: 0.88
F1-Score: 0.87
Logistic Regression model with word2vec with embedding dimension 200
Accuracy: 0.87
Precision: 0.87
Recall: 0.88
F1-Score: 0.88


In [15]:
#WORD2VEC PART 3 - TUNING TRAINING EPOCHS

# Eğitim epoch'ları 5
word2vec_model_epochs_5 = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=5)
X_word2vec_epochs_5 = np.array([np.mean([word2vec_model_epochs_5.wv[word] for word in words if word in word2vec_model_epochs_5.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with 5 epochs:", X_word2vec_epochs_5.shape)

# Eğitim epoch'ları 10 (varsayılan)
word2vec_model_epochs_10 = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)
X_word2vec_epochs_10 = np.array([np.mean([word2vec_model_epochs_10.wv[word] for word in words if word in word2vec_model_epochs_10.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with 10 epochs:", X_word2vec_epochs_10.shape)

# Eğitim epoch'ları 20
word2vec_model_epochs_20 = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=20)
X_word2vec_epochs_20 = np.array([np.mean([word2vec_model_epochs_20.wv[word] for word in words if word in word2vec_model_epochs_20.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])
print("Word2Vec Feature Shape with 20 epochs:", X_word2vec_epochs_20.shape)


# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_epochs_5, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with 5 epochs")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_epochs_10, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with 10 epochs")
evaluate_model(y_test, clf_word2vec.predict(X_test))

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_epochs_20, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec with 20 epochs")
evaluate_model(y_test, clf_word2vec.predict(X_test))

Word2Vec Feature Shape with 5 epochs: (50000, 100)
Word2Vec Feature Shape with 10 epochs: (50000, 100)
Word2Vec Feature Shape with 20 epochs: (50000, 100)
Logistic Regression model with word2vec with 5 epochs
Accuracy: 0.86
Precision: 0.86
Recall: 0.87
F1-Score: 0.86
Logistic Regression model with word2vec with 10 epochs
Accuracy: 0.87
Precision: 0.87
Recall: 0.87
F1-Score: 0.87
Logistic Regression model with word2vec with 20 epochs
Accuracy: 0.87
Precision: 0.87
Recall: 0.88
F1-Score: 0.87
