In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Fetch the NLTK resources used in this notebook; packages that are already
# installed are reported as up to date.
nltk.download('punkt')                       # tokenizer models
nltk.download('stopwords')                   # stop-word lists (read via stopwords.words)
nltk.download('wordnet')                     # WordNet data for WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger; not used by the cells visible here

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load the IMDB review dataset from a CSV file given by a relative path.
df = pd.read_csv('IMDB_Dataset.csv') 
# Preview the first rows (columns: review text and sentiment label).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


2. Data Cleaning: Perform standard text preprocessing, including removing stop words, punctuation, and special
characters; lowercasing the text; tokenizing the reviews; and stemming or lemmatization.

In [3]:
# English stop words held in a set; preprocess_text tests tokens against it.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase; replace HTML tags and URLs with a space;
    delete ASCII punctuation; replace remaining non-word characters and
    digits with a space; tokenize; drop stop words; lemmatize.

    Args:
        text: The raw review string.

    Returns:
        The surviving, lemmatized tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Tags and URLs are replaced by a space so neighbouring words stay apart.
    for pattern in (r'<.*?>', r'http\S+'):
        cleaned = re.sub(pattern, ' ', cleaned)

    # Punctuation is deleted outright (no space inserted), so "don't" -> "dont".
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    # Whatever non-word characters and digits remain become spaces.
    cleaned = re.sub(r'\W|\d', ' ', cleaned)

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Clean every review with preprocess_text and store the result in a new
# 'cleaned_review' column; the raw 'review' column is not modified.
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [4]:
# NOTE: already defined in Task 8; repeated here so the cells below can use it.
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1-score for a set of predictions.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Predicted labels to score against ``true_labels``.

    Precision, recall and F1 are computed with 'positive' as the positive
    class. Each value is printed with two decimals; nothing is returned.
    """
    pair = (true_labels, predicted_labels)

    # The three class-sensitive metrics all use the same positive label.
    precision, recall, f1 = (
        metric(*pair, pos_label='positive')
        for metric in (precision_score, recall_score, f1_score)
    )
    accuracy = accuracy_score(*pair)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [8]:
# TF-IDF PART 1: compare n-gram ranges with a 5000-term vocabulary cap

# Single source of truth for the vocabulary cap; the printed labels below are
# derived from it so they cannot drift from the value actually used.
max_features = 5000

# NOTE(review): every vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test
# reviews. Fitting on the training split only would avoid that, but it changes
# the matrix row counts and the reported numbers, so it is left as-is here.

# Unigram (default ngram_range)
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=max_features)
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review'])

# "Bigram": ngram_range=(1, 2), i.e. unigrams AND bigrams compete for the cap
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review'])

# "Trigram": ngram_range=(1, 3), i.e. unigrams, bigrams and trigrams together
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review'])

# (label, feature matrix) pairs, in the order they are reported
tfidf_variants = [
    ("Unigram", X_tfidf_unigram),
    ("Bigram", X_tfidf_bigram),
    ("Trigram", X_tfidf_trigram),
]

# Check the feature shapes
for variant, X_variant in tfidf_variants:
    print(f"TF-IDF {variant} Feature Shape:", X_variant.shape)

# Train and evaluate one Logistic Regression model per variant, always with
# the same split settings (20% test, random_state=42).
for variant, X_variant in tfidf_variants:
    X_train, X_test, y_train, y_test = train_test_split(
        X_variant, df['sentiment'], test_size=0.2, random_state=42
    )
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    print(f"Logistic Regression with TF-IDF using {variant.lower()}s with {max_features} features:")
    evaluate_model(y_test, clf.predict(X_test))


TF-IDF Unigram Feature Shape: (50000, 5000)
TF-IDF Bigram Feature Shape: (50000, 5000)
TF-IDF Trigram Feature Shape: (50000, 5000)
Logistic Regression with TF-IDF using unigrams with 5000 features:
Accuracy: 0.89
Precision: 0.88
Recall: 0.90
F1-Score: 0.89
Logistic Regression with TF-IDF using bigrams with 5000 features:
Accuracy: 0.89
Precision: 0.88
Recall: 0.90
F1-Score: 0.89
Logistic Regression with TF-IDF using trigrams with 5000 features:
Accuracy: 0.89
Precision: 0.88
Recall: 0.90
F1-Score: 0.89


In [9]:
# TF-IDF PART 2: compare n-gram ranges with a 2500-term vocabulary cap

# Single source of truth for the vocabulary cap; the printed labels below are
# derived from it so they cannot drift from the value actually used.
max_features = 2500

# NOTE(review): every vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test
# reviews. Fitting on the training split only would avoid that, but it changes
# the matrix row counts and the reported numbers, so it is left as-is here.

# Unigram (default ngram_range)
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=max_features)
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review'])

# "Bigram": ngram_range=(1, 2), i.e. unigrams AND bigrams compete for the cap
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review'])

# "Trigram": ngram_range=(1, 3), i.e. unigrams, bigrams and trigrams together
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review'])

# (label, feature matrix) pairs, in the order they are reported
tfidf_variants = [
    ("Unigram", X_tfidf_unigram),
    ("Bigram", X_tfidf_bigram),
    ("Trigram", X_tfidf_trigram),
]

# Check the feature shapes
for variant, X_variant in tfidf_variants:
    print(f"TF-IDF {variant} Feature Shape:", X_variant.shape)

# Train and evaluate one Logistic Regression model per variant, always with
# the same split settings (20% test, random_state=42).
for variant, X_variant in tfidf_variants:
    X_train, X_test, y_train, y_test = train_test_split(
        X_variant, df['sentiment'], test_size=0.2, random_state=42
    )
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    print(f"Logistic Regression with TF-IDF using {variant.lower()}s with {max_features} features:")
    evaluate_model(y_test, clf.predict(X_test))

TF-IDF Unigram Feature Shape: (50000, 2500)
TF-IDF Bigram Feature Shape: (50000, 2500)
TF-IDF Trigram Feature Shape: (50000, 2500)
Logistic Regression with TF-IDF using unigrams with 2500 features:
Accuracy: 0.88
Precision: 0.88
Recall: 0.90
F1-Score: 0.89
Logistic Regression with TF-IDF using bigrams with 2500 features:
Accuracy: 0.88
Precision: 0.87
Recall: 0.89
F1-Score: 0.88
Logistic Regression with TF-IDF using trigrams with 2500 features:
Accuracy: 0.88
Precision: 0.88
Recall: 0.90
F1-Score: 0.89


In [10]:
# TF-IDF PART 3: compare n-gram ranges with a 10000-term vocabulary cap

# Single source of truth for the vocabulary cap; the printed labels below are
# derived from it so they cannot drift from the value actually used.
max_features = 10000

# NOTE(review): every vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test
# reviews. Fitting on the training split only would avoid that, but it changes
# the matrix row counts and the reported numbers, so it is left as-is here.

# Unigram (default ngram_range)
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=max_features)
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review'])

# "Bigram": ngram_range=(1, 2), i.e. unigrams AND bigrams compete for the cap
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review'])

# "Trigram": ngram_range=(1, 3), i.e. unigrams, bigrams and trigrams together
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review'])

# (label, feature matrix) pairs, in the order they are reported
tfidf_variants = [
    ("Unigram", X_tfidf_unigram),
    ("Bigram", X_tfidf_bigram),
    ("Trigram", X_tfidf_trigram),
]

# Check the feature shapes
for variant, X_variant in tfidf_variants:
    print(f"TF-IDF {variant} Feature Shape:", X_variant.shape)

# Train and evaluate one Logistic Regression model per variant, always with
# the same split settings (20% test, random_state=42).
for variant, X_variant in tfidf_variants:
    X_train, X_test, y_train, y_test = train_test_split(
        X_variant, df['sentiment'], test_size=0.2, random_state=42
    )
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    print(f"Logistic Regression with TF-IDF using {variant.lower()}s with {max_features} features:")
    evaluate_model(y_test, clf.predict(X_test))


TF-IDF Unigram Feature Shape: (50000, 10000)
TF-IDF Bigram Feature Shape: (50000, 10000)
TF-IDF Trigram Feature Shape: (50000, 10000)
Logistic Regression with TF-IDF using unigrams with 10000 features:
Accuracy: 0.89
Precision: 0.89
Recall: 0.91
F1-Score: 0.90
Logistic Regression with TF-IDF using bigrams with 10000 features:
Accuracy: 0.89
Precision: 0.88
Recall: 0.91
F1-Score: 0.90
Logistic Regression with TF-IDF using trigrams with 10000 features:
Accuracy: 0.89
Precision: 0.88
Recall: 0.91
F1-Score: 0.90


In [11]:
# TF-IDF PART 4: compare n-gram ranges with a 1000-term vocabulary cap
# (the header previously repeated "PART 3"; this is the fourth configuration)

# Single source of truth for the vocabulary cap; the printed labels below are
# derived from it so they cannot drift from the value actually used.
max_features = 1000

# NOTE(review): every vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see the test
# reviews. Fitting on the training split only would avoid that, but it changes
# the matrix row counts and the reported numbers, so it is left as-is here.

# Unigram (default ngram_range)
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=max_features)
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review'])

# "Bigram": ngram_range=(1, 2), i.e. unigrams AND bigrams compete for the cap
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review'])

# "Trigram": ngram_range=(1, 3), i.e. unigrams, bigrams and trigrams together
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review'])

# (label, feature matrix) pairs, in the order they are reported
tfidf_variants = [
    ("Unigram", X_tfidf_unigram),
    ("Bigram", X_tfidf_bigram),
    ("Trigram", X_tfidf_trigram),
]

# Check the feature shapes
for variant, X_variant in tfidf_variants:
    print(f"TF-IDF {variant} Feature Shape:", X_variant.shape)

# Train and evaluate one Logistic Regression model per variant, always with
# the same split settings (20% test, random_state=42).
for variant, X_variant in tfidf_variants:
    X_train, X_test, y_train, y_test = train_test_split(
        X_variant, df['sentiment'], test_size=0.2, random_state=42
    )
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    print(f"Logistic Regression with TF-IDF using {variant.lower()}s with {max_features} features:")
    evaluate_model(y_test, clf.predict(X_test))


TF-IDF Unigram Feature Shape: (50000, 1000)
TF-IDF Bigram Feature Shape: (50000, 1000)
TF-IDF Trigram Feature Shape: (50000, 1000)
Logistic Regression with TF-IDF using unigrams with 1000 features:
Accuracy: 0.86
Precision: 0.86
Recall: 0.88
F1-Score: 0.87
Logistic Regression with TF-IDF using bigrams with 1000 features:
Accuracy: 0.86
Precision: 0.85
Recall: 0.88
F1-Score: 0.87
Logistic Regression with TF-IDF using trigrams with 1000 features:
Accuracy: 0.86
Precision: 0.85
Recall: 0.88
F1-Score: 0.87


In [None]:
# Baseline: unigram TF-IDF features evaluated on a 50/50 train/test split.
# This cell previously referenced `X_tfidf`, which is not defined in any cell
# visible in this notebook and would raise NameError on a fresh run;
# X_tfidf_unigram (TfidfVectorizer with its default n-gram range) is used
# instead. NOTE(review): confirm this is the matrix that was intended.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram, df['sentiment'], test_size=0.5, random_state=42)
# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with tf-idf:")
evaluate_model(y_test, clf_tfidf.predict(X_test))

# Same model on the (1, 3) n-gram range features with a 20% test split.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram, df['sentiment'], test_size=0.2, random_state=42)
# Train a Logistic Regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
# Report the matrix's actual column count: the label used to hard-code 10000,
# which is wrong whenever X_tfidf_trigram was built with a different cap.
print(f"Logistic Regression with TF-IDF using trigrams with {X_tfidf_trigram.shape[1]} features:")
evaluate_model(y_test, clf.predict(X_test))
