### Setup: imports, stopwords, and text cleaning

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK's stopword corpus, which stopwords.words() reads below
nltk.download('stopwords')
# Keep the English stopwords in a set; clean_text tests each word for membership in it
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize raw text into a lowercase, stopword-free string.

    Steps: lowercase the text, drop HTML tags such as ``<br />``, replace
    every character that is not ``a-z`` or whitespace with a space, then
    remove stopwords.

    Non-letters are replaced with a space rather than deleted so that words
    separated only by punctuation or markup are not fused together
    ("good.bad" -> "good bad", not "goodbad"), and so that contractions split
    into pieces ("don't" -> "don t") that can be matched against the
    stopword list.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The remaining words joined by single spaces (``""`` if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase the text
    text = text.lower()
    # Drop HTML tags; a tag must start with a letter so that "<5" is not treated as markup
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    # Replace special characters, numbers, and punctuation with a space
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Remove stopwords; split() also collapses the runs of spaces created above
    return ' '.join(word for word in text.split() if word not in stopword_set)

### Import tokenizer

In [None]:
from nltk.tokenize import word_tokenize

# Download NLTK's 'punkt' tokenizer resource for word_tokenize
# (a later cell additionally downloads 'punkt_tab')
nltk.download('punkt')

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


### Load embedding

In [3]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Args:
        file_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a line's components cannot be parsed as floats; the
            message names the file, the line number, and the token.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse
                continue
            word, *coefficients = values
            try:
                vector = np.asarray(coefficients, dtype='float32')
            except ValueError as exc:
                raise ValueError(
                    f"{file_path}:{line_number}: malformed embedding line for token {word!r}"
                ) from exc
            embeddings_index[word] = vector
    return embeddings_index

# Relative path to the GloVe vectors; the "50d" in the name is presumably the
# vector size and should match the embedding_dim used when embedding text
glove_path = 'glove.6B.50d.txt'  # Ensure this file is downloaded
# Parse the whole file into a {word: vector} dict when this cell runs
embeddings_index = load_glove_embeddings(glove_path)

def text_to_embedding(tokens, embeddings_index, embedding_dim=50):
    """Look up one embedding vector per token and stack them into a matrix.

    Args:
        tokens: Sequence of tokens to embed.
        embeddings_index: Mapping from token to its 1-D embedding vector.
        embedding_dim: Length of each vector; used for the all-zero row of
            out-of-vocabulary tokens and for the shape of an empty result.

    Returns:
        2-D array of shape ``(len(tokens), embedding_dim)``. Tokens missing
        from ``embeddings_index`` contribute a row of zeros. An empty
        ``tokens`` sequence yields an array of shape ``(0, embedding_dim)``.
    """
    # float32 zeros so unknown words do not upcast a float32 embedding matrix
    unknown_vector = np.zeros(embedding_dim, dtype='float32')
    rows = [embeddings_index.get(token, unknown_vector) for token in tokens]
    if not rows:
        # np.array([]) would be 1-D; keep the result 2-D for callers reducing over axis 0
        return np.zeros((0, embedding_dim), dtype='float32')
    return np.array(rows)


### Load IMDB movie reviews dataset

In [None]:
import pandas as pd

# Read the reviews CSV from the current working directory.
# NOTE(review): a later cell unpacks every row into (text, label), so the file
# is assumed to contain exactly two columns — confirm against the CSV.
dataset = pd.read_csv("IMDB_Dataset.csv")
print(dataset)

### Split comments and labels

In [None]:
# zip(*rows) turns the row-wise array into one tuple per column.
# Unpacking into two names only works when the frame has exactly two columns;
# the first is used as the review text and the second as the label.
texts, labels = zip(*dataset.values)

In [None]:
# Sanity check: zip() yields tuples, so both should be reported as tuple
print(type(texts), type(labels))

#### Test clean_text function

In [None]:
# Show the first five reviews next to their cleaned versions
for raw_review in texts[:5]:
    cleaned_review = clean_text(raw_review)
    print(raw_review, "--->", cleaned_review)

### An additional and essential package
Without this package, tokenization raises an error. How did I figure out that this package also needs to be downloaded? By running the notebook on Google Colab and debugging the error.

In [None]:
# Extra tokenizer resource; the note above reports an error when it is missing
nltk.download('punkt_tab')

### Clean and tokenize the text

In [None]:
# Clean every review first, then split each cleaned string into tokens
cleaned_texts = list(map(clean_text, texts))
tokenized_texts = list(map(tokenize_text, cleaned_texts))

### Convert to embeddings

In [None]:
def average_embedding(tokens, embeddings_index, embedding_dim=50):
    """Return the mean word vector of ``tokens`` as a 1-D array.

    A review whose token list is empty (everything was removed during
    cleaning) gets an all-zero vector. ``np.mean`` over an empty array would
    produce NaN instead, which breaks the later train/test split and model
    fitting, so the empty case is handled before averaging.

    Args:
        tokens: List of tokens for one review.
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Length of each embedding vector.

    Returns:
        1-D array of length ``embedding_dim``.
    """
    if not tokens:
        return np.zeros(embedding_dim, dtype='float32')
    return np.mean(text_to_embedding(tokens, embeddings_index, embedding_dim), axis=0)


embedded_texts = [average_embedding(tokens, embeddings_index) for tokens in tokenized_texts]
# Preview a few vectors only; printing the full list emits one array per review
print(embedded_texts[:3])

### Split data into trainset and testset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    embedded_texts,
    labels,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Extra: Check the sizes of the training and test sets

In [None]:
# Feature and label counts must match within each split
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(len(features), len(targets))

Extra: Conduct a series of tests to determine the correctness of the steps

In [None]:
# Per-feature mean over all training vectors. If every value is zero, the
# embeddings were not computed correctly.
print(np.mean(X_train, axis=0))

### Train a logistic regression model

In [None]:
# Fit a logistic regression classifier, constructed with no arguments, on the averaged embeddings
model = LogisticRegression()
model.fit(X_train, y_train)

Extra: Conduct a series of tests to determine the correctness of the steps (2)

In [None]:
# If the weights are near zero, the model isn't learning, which points to
# poor data quality or uninformative features
print(model.coef_)  # Learned feature weights
print(model.intercept_)  # Learned intercept

### Test the model

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out set and report the fraction predicted correctly
y_pred = model.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")