# Fine-Tuning NLP Feature Extraction Techniques
 
Task 11: In this task, you will focus on optimizing the NLP feature extraction
methods used in the sentiment classification models. Begin by exploring different configurations of the BoW and TF-IDF
techniques, such as adjusting the n-gram range, using unigrams, bigrams, or trigrams, and varying the maximum number of
features to include in the vocabulary. For word embeddings, experiment with fine-tuning Word2Vec, GloVe, or BERT by
adjusting parameters like the window size, embedding dimension, and training epochs for Word2Vec, or by applying BERT
fine-tuning with different learning rates and batch sizes. Once you have fine-tuned these feature extraction techniques, apply
them to the training dataset and evaluate how these changes impact the model's performance on the test dataset. You are
required to compare the optimized feature extraction methods with the baseline configurations used in Task 10. Evaluate the
results using metrics such as accuracy, precision, recall, and F1-score, and provide insights into how adjusting the NLP-based
features influences the overall sentiment prediction performance.

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize


In [None]:
# Read the preprocessed dataset from CSV and preview its first rows
DATA_PATH = 'IMDB_Dataset_Preprocessed.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Report how many rows the loaded DataFrame contains
print("Dataset Size:", len(df), sep="\n")

In [None]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1-score for a set of predictions.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Labels predicted by a classifier, aligned with
            ``true_labels``.

    Returns:
        None. The four metrics are written to stdout with two decimals.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    # Compute every score before printing anything, so a failing metric
    # produces no partial output.
    results = [
        (label, scorer(true_labels, predicted_labels))
        for label, scorer in scorers
    ]
    for label, value in results:
        print(f"{label}: {value:.2f}")

## 3000 Features

### Unigram

In [None]:
# Unigram (default): CountVectorizer's default ngram_range is (1, 1)
vectorizer_bow_unigram = CountVectorizer(max_features=3000)
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

print("BoW Unigram Feature Shape with 3000 features:", X_bow_unigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using unigrams with 3000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using unigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using unigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Bigram

In [None]:
# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=3000, ngram_range=(1, 2))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

print("BoW Bigram Feature Shape with 3000 features:", X_bow_bigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using bigrams with 3000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using bigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using bigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Trigram

In [None]:
# Trigram: ngram_range=(1, 3) keeps unigrams and bigrams as well as trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=3000, ngram_range=(1, 3))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

print("BoW Trigram Feature Shape with 3000 features:", X_bow_trigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:

# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using trigrams with 3000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using trigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using trigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

## 5000 Features

### Unigram

In [None]:
# Unigram (default): CountVectorizer's default ngram_range is (1, 1)
vectorizer_bow_unigram = CountVectorizer(max_features=5000)
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

print("BoW Unigram Feature Shape with 5000 features:", X_bow_unigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using unigrams with 5000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using unigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using unigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Bigram

In [None]:
# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=5000, ngram_range=(1, 2))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

print("BoW bigram Feature Shape with 5000 features:", X_bow_bigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using bigrams with 5000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using bigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using bigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Trigram

In [None]:
# Trigram: ngram_range=(1, 3) keeps unigrams and bigrams as well as trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=5000, ngram_range=(1, 3))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

print("BoW trigram Feature Shape with 5000 features:", X_bow_trigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using trigrams with 5000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using trigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using trigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))


## 7000 Features

### Unigram

In [None]:
# Unigram (default): CountVectorizer's default ngram_range is (1, 1)
vectorizer_bow_unigram = CountVectorizer(max_features=7000)
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_unigram = vectorizer_bow_unigram.fit_transform(df['cleaned_review'])

print("BoW unigram Feature Shape with 7000 features:", X_bow_unigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)

In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using unigrams with 7000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using unigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using unigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Bigram

In [None]:
# Bigram: ngram_range=(1, 2) keeps unigrams as well as bigrams
vectorizer_bow_bigram = CountVectorizer(max_features=7000, ngram_range=(1, 2))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_bigram = vectorizer_bow_bigram.fit_transform(df['cleaned_review'])

print("BoW bigram Feature Shape with 7000 features:", X_bow_bigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)


In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using bigrams with 7000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using bigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using bigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))


### Trigram

In [None]:
# Trigram: ngram_range=(1, 3) keeps unigrams and bigrams as well as trigrams
vectorizer_bow_trigram = CountVectorizer(max_features=7000, ngram_range=(1, 3))
# Keep the document-term matrix sparse instead of calling .toarray(): a dense
# copy stores every zero count explicitly, while train_test_split and the
# LogisticRegression / SGDClassifier / RandomForestClassifier models fitted on
# this split all accept SciPy sparse input.
X_bow_trigram = vectorizer_bow_trigram.fit_transform(df['cleaned_review'])

print("BoW trigram Feature Shape with 7000 features:", X_bow_trigram.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set text influences which features are kept — consider fitting on
# the training half only.
# Split data into training and testing (50/50 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42
)


In [None]:
# Fit Logistic Regression on the training split (fit returns the estimator),
# then report metrics on the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with BoW using trigrams with 7000 features:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# 'hinge' loss corresponds to a linear SVM; random_state seeds the data
# shuffling so the reported metrics are reproducible across runs.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with BoW using trigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state seeds the bootstrap and feature sampling so the reported
# metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW using trigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))