# Imports

In [17]:
%load_ext autoreload
%autoreload 2
import os

while not os.getcwd().endswith("text-drift-generator"):
    os.chdir("..")
    print(os.getcwd())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import fetch_20newsgroups

from sentence_transformers import SentenceTransformer

from scripts.enums import WordErrorType
from scripts.preprocess import preprocess_text
from scripts.drift_generator import simulate_drift
from scripts.utils import generate_drifted_text, plot_cosine_similarity
from scripts.constants import EXAMPLE_STRING

# Test simulate error


In [19]:
# Example Usage
text = EXAMPLE_STRING
error_types = [
    WordErrorType.TYPOGRAPHICAL,
    WordErrorType.PHONETIC,
    WordErrorType.MIX,
]

levels = [0.1, 0.5, 0.9]

for error_type in error_types:
    print(f"\nSimulating {error_type} errors:")
    for level in levels:
        drifted_text = simulate_drift(text, error_type, level)
        print(f"Stage {level}: {drifted_text}")


Simulating typographical errors:
Stage 0.1: Artificial intellignce is revolutionizing the way we live and work. It is transforming industries yb automating tasks, impzproving efficiency, and enhancing decision-making. As AI technology evolves, it opens up new possibilities for businesses and individuals avlike. However, challenges remain in ensuring ethical use, data privacy, and fairness. As we move forward, it's crucial to balance innovation with responsibility to maximize the benefits while minimizing ridks.
Stage 0.5: Ahrtifuial intelligence is gevuolutonizing the ay we liv eagnd work. It ist ransforming industqries by automating taaks, improving efficeinc,y ad enhancijg decjsion-makngi. As Au technology evopve,si t opens up new posziblites for vbusinesses and iyndividualss azlgike. However, challengew remain iln ensuring ethical uet, data privacy, gand fairnecss. A we move forward, it's crucial to balance innovaion wigh respjonsibilityv to mazomize the benehfits wile minimizing r

# News dataset


In [20]:
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.med",
    "talk.politics.misc",
]

print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['comp.graphics', 'rec.autos', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.med', 'talk.politics.misc']


In [21]:
df = fetch_20newsgroups(
    subset="all",
    categories=categories,
    shuffle=False,
    remove=("headers", "footers", "quotes"),
)

In [30]:
levels = [0.1, 0.5, 0.9]
typo_full_dfs = []
pho_full_dfs = []
mix_full_dfs = []
error_types = [WordErrorType.TYPOGRAPHICAL, WordErrorType.PHONETIC, WordErrorType.MIX]

## Apply drift


In [31]:
for level in levels:
    drifted_df = generate_drifted_text(df=df, error_type=WordErrorType.TYPOGRAPHICAL, level=level, print_info=True)
    typo_full_dfs.append(drifted_df)
    drifted_df = generate_drifted_text(df=df, error_type=WordErrorType.PHONETIC, level=level, print_info=True)
    pho_full_dfs.append(drifted_df)
    drifted_df = generate_drifted_text(df=df, error_type=WordErrorType.MIX, level=level, print_info=True)
    mix_full_dfs.append(drifted_df)


Simulating typographical errors at level 0.1:
Done in 6.776171684265137 s

Simulating phonetic errors at level 0.1:
Done in 7.4782938957214355 s

Simulating mix errors at level 0.1:
Done in 7.919825792312622 s

Simulating typographical errors at level 0.5:
Done in 7.7730793952941895 s

Simulating phonetic errors at level 0.5:
Done in 8.411454916000366 s

Simulating mix errors at level 0.5:
Done in 8.244817018508911 s

Simulating typographical errors at level 0.9:
Done in 8.169755220413208 s

Simulating phonetic errors at level 0.9:
Done in 9.791260242462158 s

Simulating mix errors at level 0.9:
Done in 10.498154163360596 s


## Vectorize


In [33]:
# Initialize the model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare your data in batches for more efficient encoding
batch_size = 32  # You can adjust this based on your system's capacity

X_drifted_list = []
X_list = []

for i in range(len(levels)):
    print(f"start vectorizing typographical original {levels[i]}")
    # Encode in batches
    original_data = typo_full_dfs[i].original_preprocessed_data
    X = model.encode(original_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_list.append(X)
    np.savetxt(f'data/drifted_data/typographical/original_{levels[i]}.csv', X, delimiter=',')

    print(f"start vectorizing typographical drifted {levels[i]}")
    # Encode drifted data in batches
    drifted_data = typo_full_dfs[i].drifted_preprocessed_data
    X_drifted = model.encode(drifted_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_drifted_list.append(X_drifted)
    np.savetxt(f'data/drifted_data/typographical/drifted_{levels[i]}.csv', X_drifted, delimiter=',')

    print(f"start vectorizing phonetic original {levels[i]}")
    # Encode in batches
    original_data = pho_full_dfs[i].original_preprocessed_data
    X = model.encode(original_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_list.append(X)
    np.savetxt(f'data/drifted_data/phonetic/original_{levels[i]}.csv', X, delimiter=',')

    print(f"start vectorizing phonetic drifted {levels[i]}")
    # Encode drifted data in batches
    drifted_data = pho_full_dfs[i].drifted_preprocessed_data
    X_drifted = model.encode(drifted_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_drifted_list.append(X_drifted)
    np.savetxt(f'data/drifted_data/phonetic/drifted_{levels[i]}.csv', X_drifted, delimiter=',')

    print(f"start vectorizing mix original {levels[i]}")
    # Encode in batches
    original_data = mix_full_dfs[i].original_preprocessed_data
    X = model.encode(original_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_list.append(X)
    np.savetxt(f'data/drifted_data/mix/original_{levels[i]}.csv', X, delimiter=',')

    print(f"start vectorizing mix drifted {levels[i]}")
    # Encode drifted data in batches
    drifted_data = mix_full_dfs[i].drifted_preprocessed_data
    X_drifted = model.encode(drifted_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_drifted_list.append(X_drifted)
    np.savetxt(f'data/drifted_data/mix/drifted_{levels[i]}.csv', X_drifted, delimiter=',')

start vectorizing typographical original 0.1


Batches: 100%|██████████| 173/173 [01:46<00:00,  1.62it/s]


start vectorizing typographical drifted 0.1


Batches: 100%|██████████| 173/173 [01:47<00:00,  1.62it/s]


start vectorizing phonetic original 0.1


Batches: 100%|██████████| 173/173 [01:42<00:00,  1.68it/s]


start vectorizing phonetic drifted 0.1


Batches: 100%|██████████| 173/173 [01:44<00:00,  1.66it/s]


start vectorizing mix original 0.1


Batches: 100%|██████████| 173/173 [01:44<00:00,  1.65it/s]


start vectorizing mix drifted 0.1


Batches: 100%|██████████| 173/173 [01:46<00:00,  1.63it/s]


start vectorizing typographical original 0.5


Batches: 100%|██████████| 173/173 [01:45<00:00,  1.64it/s]


start vectorizing typographical drifted 0.5


Batches: 100%|██████████| 173/173 [02:04<00:00,  1.39it/s]


start vectorizing phonetic original 0.5


Batches: 100%|██████████| 173/173 [01:49<00:00,  1.58it/s]


start vectorizing phonetic drifted 0.5


Batches: 100%|██████████| 173/173 [01:46<00:00,  1.63it/s]


start vectorizing mix original 0.5


Batches: 100%|██████████| 173/173 [01:50<00:00,  1.56it/s]


start vectorizing mix drifted 0.5


Batches: 100%|██████████| 173/173 [01:54<00:00,  1.51it/s]


start vectorizing typographical original 0.9


Batches: 100%|██████████| 173/173 [01:56<00:00,  1.48it/s]


start vectorizing typographical drifted 0.9


Batches: 100%|██████████| 173/173 [02:16<00:00,  1.27it/s]


start vectorizing phonetic original 0.9


Batches: 100%|██████████| 173/173 [01:50<00:00,  1.56it/s]


start vectorizing phonetic drifted 0.9


Batches: 100%|██████████| 173/173 [01:44<00:00,  1.66it/s]


start vectorizing mix original 0.9


Batches: 100%|██████████| 173/173 [01:48<00:00,  1.59it/s]


start vectorizing mix drifted 0.9


Batches: 100%|██████████| 173/173 [01:53<00:00,  1.53it/s]


## Evaluate


In [43]:
# Import the numoy arrays containing original and drifted data

levels = [0.1, 0.5, 0.9]

X_typo_list = []
X_typo_drifted_list = []
X_pho_list = []
X_pho_drifted_list = []
X_mix_list = []
X_mix_drifted_list = []

for level in levels:
    X = np.genfromtxt(f'data/drifted_data/typographical/original_{level}.csv', delimiter=',')
    X_typo_list.append(X)
    X_typo_drifted = np.genfromtxt(f'data/drifted_data/typographical/drifted_{level}.csv', delimiter=',')
    X_typo_drifted_list.append(X_drifted)

    X = np.genfromtxt(f'data/drifted_data/phonetic/original_{level}.csv', delimiter=',')
    X_pho_list.append(X)
    X_pho_drifted = np.genfromtxt(f'data/drifted_data/phonetic/drifted_{level}.csv', delimiter=',')
    X_pho_drifted_list.append(X_drifted)

    X = np.genfromtxt(f'data/drifted_data/mix/original_{level}.csv', delimiter=',')
    X_mix_list.append(X)
    X_mix_drifted = np.genfromtxt(f'data/drifted_data/mix/drifted_{level}.csv', delimiter=',')
    X_mix_drifted_list.append(X_drifted)

In [None]:
plot_cosine_similarity(
    levels=levels,
    error_type=WordErrorType.TYPOGRAPHICAL,
    X_list=X_typo_list,
    X_drifted_list=X_typo_drifted_list,
)
plot_cosine_similarity(
    levels=levels,
    error_type=WordErrorType.PHONETIC,
    X_list=X_pho_list,
    X_drifted_list=X_pho_drifted_list,
)
plot_cosine_similarity(
    levels=levels,
    error_type=WordErrorType.MIX,
    X_list=X_mix_list,
    X_drifted_list=X_mix_drifted_list,
)

Average Cosine Similarity at level 0.1: 0.8500
Average Cosine Similarity at level 0.5: 0.8500
Average Cosine Similarity at level 0.9: 0.8500
Average Cosine Similarity at level 0.1: 0.8500
Average Cosine Similarity at level 0.5: 0.8500
Average Cosine Similarity at level 0.9: 0.8500
Average Cosine Similarity at level 0.1: 0.8500
Average Cosine Similarity at level 0.5: 0.8500
Average Cosine Similarity at level 0.9: 0.8500
