In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset('imdb')
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

# Settings for the artificial imbalance, named so a reader can tune them in one place
SEED = 42                      # shared by the random drop and both shuffles
POSITIVE_DROP_FRACTION = 0.80  # share of positive reviews removed from the training data

# Split the training data into features and labels
X_train_orig = train_df['text']
y_train_orig = train_df['label']

# Split the test data into features and labels
X_test = test_df['text']
y_test = test_df['label']

# Create an artificial imbalance by removing randomly selected positive reviews (label = 1) from the training data.
# Collect index *labels* rather than positional offsets (np.where): Series.drop()
# below removes rows by label, and the two only coincide for a default RangeIndex.
positive_indices = y_train_orig[y_train_orig == 1].index.to_numpy()
negative_indices = y_train_orig[y_train_orig == 0].index.to_numpy()

# Randomly choose the configured fraction of the positive index labels to remove
np.random.seed(SEED)  # for reproducibility
drop_indices = np.random.choice(
    positive_indices,
    size=int(POSITIVE_DROP_FRACTION * len(positive_indices)),
    replace=False,
)

# Create the unbalanced training data by dropping the selected positive reviews
X_train_unbal = X_train_orig.drop(drop_indices)
y_train_unbal = y_train_orig.drop(drop_indices)

# Sanity checks: only the selected positives were removed, negatives are untouched
assert (y_train_unbal == 1).sum() == len(positive_indices) - len(drop_indices)
assert (y_train_unbal == 0).sum() == len(negative_indices)

# Shuffle the training data (features and labels are permuted together)
X_train_orig, y_train_orig = shuffle(X_train_orig, y_train_orig, random_state=SEED)
X_train_unbal, y_train_unbal = shuffle(X_train_unbal, y_train_unbal, random_state=SEED)

# Features and labels must still be aligned row-for-row after shuffling
assert X_train_orig.index.equals(y_train_orig.index)
assert X_train_unbal.index.equals(y_train_unbal.index)

# Now you have original and unbalanced training sets, and a common test set.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Number of reviews per label in the artificially unbalanced training labels
y_train_unbal.value_counts()

label
0    12500
1     2500
Name: count, dtype: int64

In [7]:
# Number of reviews per label in the original (untouched) training labels
y_train_orig.value_counts()

label
0    12500
1    12500
Name: count, dtype: int64


### Exercises:

1. **Dataset Exploration**:
    - Explore the original training set and the unbalanced training set. What are the proportions of positive and negative reviews in both?

2. **Text Encoding**:
    - Encode the text data from both training sets using the TF-IDF vectorization method. 

3. **Model Training and Evaluation (Unbalanced Data)**:
    - Train a Logistic Regression classifier on the unbalanced training data.
    - Evaluate the model on the common test set using appropriate metrics such as accuracy, precision, recall, and F1-score.

4. **Data Balancing**:
    - Balance the unbalanced training data using an oversampling technique. Make sure the oversampling is applied only to the training data, never to the test set.
    
5. **Model Training and Evaluation (Balanced Data)**:
    - Train a Logistic Regression classifier on the balanced training data.
    - Evaluate the model on the common test set using the same metrics as above.
    - Compare the performance of the model trained on unbalanced data versus the model trained on balanced data.
    - Repeat the training and evaluation, this time using the original training data.

6. **Data Visualization**:
    - Use t-SNE to visualize the text encoding of both the original and balanced training data in 2D space. Color the points based on the labels.

7. **TF-IDF with PCA**:
    - Use PCA to reduce the dimensionality of the TF-IDF features of the balanced data. Standardize the feature matrix first, then keep the components that together explain at least 95% of the variance. Compare the results with those obtained without PCA.

8. **(Optional)**
    - Explore other machine learning models and compare their performance with Logistic Regression on this task.
    - Consider tuning the hyperparameters of your models for better performance.

In [8]:
# 1 - Dataset exploration: share of negative reviews (label 0) in the unbalanced training set

# Look the counts up by label instead of unpacking value_counts() positionally:
# value_counts() orders its rows by descending frequency, not by label, so
# positional unpacking would silently swap the two classes whenever label 1
# is the more frequent one.
unbal_counts = y_train_unbal.value_counts()
lab0_count = int(unbal_counts.get(0, 0))  # .get(..., 0) tolerates a class that is absent
lab1_count = int(unbal_counts.get(1, 0))
lab0_count/(lab0_count+lab1_count)

0.8333333333333334

NameError: name 'X_train' is not defined