### Import libraries

In [7]:
import pandas as pd
import os
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### Load data function


In [None]:
def load_data(directory):
    texts, labels = [], []
    for label in ['pos', 'neg']:
        folder = f"{directory}/{label}"
        for file in os.listdir(folder):
            with open(f"{folder}/{file}", 'r', encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(1 if label == 'pos' else 0)
    return pd.DataFrame({'text': texts, 'label': labels})

##### Load only a quarter of the whole data for faster results

In [None]:
# the whole data with 50,000 files
""" data = pd.concat([
    load_data("aclImdb/train"),
    load_data("aclImdb/test")
], ignore_index=True) """

# a separate folder, with fewer data comprising only 12,500 files
data = load_data("aclImdb/elite")

### Text preprocessing

Download and add some stuff to start off

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Define a function for preprocessing the text

In [10]:
def preprocess(text):
    text = re.sub(r'<.*?>', '', text)  # حذف HTML
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # حذف URL
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # حذف کاراکترهای غیر الفبایی
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

### Apply the 'preprocess' function

In [None]:
# data['clean_text'] = data['text'].apply(preprocess)

from tqdm import tqdm

tqdm.pandas()

# The same 'apply' function but with showing the progress bar
data['clean_text'] = data['text'].progress_apply(preprocess)

100%|██████████| 12500/12500 [10:37<00:00, 19.60it/s]


### Train/test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'], data['label'], test_size=0.2, random_state=42
)

### Vectorize the data with TF-IDF method
I used this method due to applying a classic model, 'Logistic regression', on the dataset.

For small datasets and lightweight algorithms, it's better to use such faster and simpler embedding approaches

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Logistic regression

In [None]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

### Evaluate the model

In [None]:
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87      1215
           1       0.87      0.90      0.88      1285

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500

