In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Keaton\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load data
df = pd.read_csv('IMDB_Dataset.csv') 
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


2- Data Cleaning: Perform standard text preprocessing tasks, including: Removing stop words, punctuation, and special
characters, Lowercasing the text, Tokenizing the reviews, Stemming or lemmatization.

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove special characters and digits
    text = re.sub(r'\W|\d', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatizing
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [4]:
# ALREADY DEFINED IN TASK 8
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, pos_label='positive')
    recall = recall_score(true_labels, predicted_labels, pos_label='positive')
    f1 = f1_score(true_labels, predicted_labels, pos_label='positive')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [5]:
#TF-IDF PART 1

# Unigram (varsayılan)
vectorizer_tfidf_unigram_3000_feat = TfidfVectorizer(max_features=3000)
X_tfidf_unigram_3000_feat = vectorizer_tfidf_unigram_3000_feat.fit_transform(df['cleaned_review'])

# Bigram
vectorizer_tfidf_bigram_3000_feat = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf_bigram_3000_feat = vectorizer_tfidf_bigram_3000_feat.fit_transform(df['cleaned_review'])

# Trigram
vectorizer_tfidf_trigram_3000_feat = TfidfVectorizer(max_features=3000, ngram_range=(1, 3))
X_tfidf_trigram_3000_feat = vectorizer_tfidf_trigram_3000_feat.fit_transform(df['cleaned_review'])

# Özellik şekillerini kontrol etme
print("TF-IDF Unigram Feature Shape:", X_tfidf_unigram_3000_feat.shape)
print("TF-IDF Bigram Feature Shape:", X_tfidf_bigram_3000_feat.shape)
print("TF-IDF Trigram Feature Shape:", X_tfidf_trigram_3000_feat.shape)


# For Unigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram_3000_feat, df['sentiment'], test_size=0.2, random_state=42)


TF-IDF Unigram Feature Shape: (50000, 3000)
TF-IDF Bigram Feature Shape: (50000, 3000)
TF-IDF Trigram Feature Shape: (50000, 3000)


In [None]:
# LSTM deneme

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model_lstm(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, pos_label=1)
    recall = recall_score(true_labels, predicted_labels, pos_label=1)
    f1 = f1_score(true_labels, predicted_labels, pos_label=1)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Tokenize the text data
tokenizer = Tokenizer(num_words=3000)  # Set a limit of 3000 most common words
tokenizer.fit_on_texts(df['cleaned_review'])

# Convert text to sequences
X_sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
X_padded = pad_sequences(X_sequences, maxlen=100)  # Padding sequences to a max length of 100

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, df['sentiment'], test_size=0.2, random_state=42)

# Label encode the target variable (sentiment)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # "negative" -> 0, "positive" -> 1
y_test_encoded = label_encoder.transform(y_test)

# LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=3000, output_dim=100, input_length=100))  # Embedding layer
lstm_model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))          # LSTM layer
lstm_model.add(Dense(1, activation='sigmoid'))                               # Output layer for binary classification
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train, y_train_encoded, epochs=5, batch_size=32)

# Tahminleri ikili değerlere (0 veya 1) dönüştürme
predicted_labels = (lstm_model.predict(X_test) > 0.5).astype("int32")

# Evaluate the model
print("LSTM with integer-encoded text sequences:")
evaluate_model_lstm(y_test_encoded, predicted_labels)


Epoch 1/5




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 95ms/step - accuracy: 0.7769 - loss: 0.4672
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 115ms/step - accuracy: 0.8737 - loss: 0.3083
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 105ms/step - accuracy: 0.8902 - loss: 0.2713
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 108ms/step - accuracy: 0.9065 - loss: 0.2358
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 112ms/step - accuracy: 0.9169 - loss: 0.2143
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 45ms/step
LSTM with integer-encoded text sequences:
Accuracy: 0.8726
Precision: 0.8881
Recall: 0.8549
F1 Score: 0.8712


In [None]:

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Bigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigram_3000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Trigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram_3000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################


In [None]:
#TF-IDF PART 2

# Unigram (varsayılan)
vectorizer_tfidf_unigram_5000_feat = TfidfVectorizer(max_features=5000)
X_tfidf_unigram_5000_feat = vectorizer_tfidf_unigram_5000_feat.fit_transform(df['cleaned_review'])

# Bigram
vectorizer_tfidf_bigram_5000_feat = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf_bigram_5000_feat = vectorizer_tfidf_bigram_5000_feat.fit_transform(df['cleaned_review'])

# Trigram
vectorizer_tfidf_trigram_5000_feat = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf_trigram_5000_feat = vectorizer_tfidf_trigram_5000_feat.fit_transform(df['cleaned_review'])

# Özellik şekillerini kontrol etme
print("TF-IDF Unigram Feature Shape:", X_tfidf_unigram_5000_feat.shape)
print("TF-IDF Bigram Feature Shape:", X_tfidf_bigram_5000_feat.shape)
print("TF-IDF Trigram Feature Shape:", X_tfidf_trigram_5000_feat.shape)



# For Unigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram_5000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Bigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigram_5000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Trigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram_5000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################



In [10]:
#TF-IDF PART 3

# Unigram (varsayılan)
vectorizer_tfidf_unigram_7000_feat = TfidfVectorizer(max_features=7000)
X_tfidf_unigram_7000_feat = vectorizer_tfidf_unigram_7000_feat.fit_transform(df['cleaned_review'])

# Bigram
vectorizer_tfidf_bigram_7000_feat = TfidfVectorizer(max_features=7000, ngram_range=(1, 2))
X_tfidf_bigram_7000_feat = vectorizer_tfidf_bigram_7000_feat.fit_transform(df['cleaned_review'])

# Trigram
vectorizer_tfidf_trigram_7000_feat = TfidfVectorizer(max_features=7000, ngram_range=(1, 3))
X_tfidf_trigram_7000_feat = vectorizer_tfidf_trigram_7000_feat.fit_transform(df['cleaned_review'])

# Özellik şekillerini kontrol etme
print("TF-IDF Unigram Feature Shape:", X_tfidf_unigram_7000_feat.shape)
print("TF-IDF Bigram Feature Shape:", X_tfidf_bigram_7000_feat.shape)
print("TF-IDF Trigram Feature Shape:", X_tfidf_trigram_7000_feat.shape)

# For Unigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram_7000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Bigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigram_7000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################

# For Trigram
#######################################################################################################################################
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram_7000_feat, df['sentiment'], test_size=0.2, random_state=42)

# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, clf_tfidf.predict(X_test))
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, svm_tfidf.predict(X_test))
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

""""
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))
lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)
print("LSTM with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))
"""
#######################################################################################################################################
