In [6]:
import os
import glob
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Text cleaning function
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    text = text.lower()  # Convert to lowercase
    return text

# Load data
def load_reviews(path):
    texts, labels = [], []
    
    # Check if directory exists
    if not os.path.exists(path):
        print(f"Path {path} does not exist, please check!")
        return texts, labels
    
    print(f"Loading path: {path}")
    print(f"Subdirectories: {os.listdir(path)}")
    
    for label in ['pos', 'neg']:
        label_path = os.path.join(path, label)
        if not os.path.exists(label_path):
            print(f"Warning: Subdirectory {label_path} does not exist, skipping this category")
            continue
        files = glob.glob(os.path.join(label_path, '*.txt'))
        print(f"Number of files read for category {label}: {len(files)}")
        
        for file in files:
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
                cleaned = clean_text(text)
                texts.append(cleaned)
                labels.append(1 if label == 'pos' else 0)
    return texts, labels

# Main process
texts, labels = load_reviews('aclImdb/train')

if len(texts) == 0:
    print("No texts loaded, terminating program.")
else:
    # Split train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Feature extraction
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)

    # Train model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_tfidf, y_train)

    # Validate model
    y_pred = clf.predict(X_val_tfidf)
    print('Validation Accuracy:', accuracy_score(y_val, y_pred))


Loading path: aclImdb/train
Subdirectories: ['urls_unsup.txt', 'neg', 'urls_pos.txt', 'unsup', 'urls_neg.txt', 'pos', 'unsupBow.feat', 'labeledBow.feat']
Number of files read for category pos: 12500
Number of files read for category neg: 12500
Validation Accuracy: 0.8736


### What the above code does can be summarized as follows:

1. **Data Loading and Cleaning**

   * It loads all text files from the folders `pos` (positive reviews) and `neg` (negative reviews) inside the directory `aclImdb/train`.
   * Each review is cleaned by removing HTML tags, keeping only letters and spaces, and converting all text to lowercase.

2. **Data Preparation**

   * The cleaned texts are stored in a list called `texts`, and their corresponding labels (1 for positive, 0 for negative) are stored in `labels`.

3. **Data Splitting**

   * The data is split into training and validation sets, with 80% used for training and 20% for validation.

4. **Feature Extraction**

   * The text data is transformed into TF-IDF feature vectors using `TfidfVectorizer`, limited to the top 5000 features.

5. **Model Training**

   * A logistic regression model (`LogisticRegression`) is trained on the training set features and labels.

6. **Model Evaluation**

   * The model predicts labels for the validation set, and the accuracy is calculated and printed.

---

**Your output means:**

* The program successfully loaded 12,500 positive and 12,500 negative reviews (25,000 total).
* The logistic regression model achieved about 87.36% accuracy on the validation set, indicating good performance on the sentiment classification task.

In short, this code builds and evaluates a simple sentiment analysis model based on TF-IDF features and logistic regression.
