In [105]:
%load_ext autoreload
%autoreload 2
import os

while not os.getcwd().endswith("text-drift-generator"):
    os.chdir("..")
    print(os.getcwd())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [106]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

from scripts.enums import WordErrorType
from scripts.preprocess import preprocess_text
from scripts.drift_generator import simulate_drift
from scripts.constants import EXAMPLE_STRING

# Test simulate error


In [107]:
# Example Usage
text = EXAMPLE_STRING
error_types = [
    WordErrorType.TYPOGRAPHICAL,
    WordErrorType.PHONETIC,
    WordErrorType.MIX,
]

levels = [0.1, 0.5, 0.9]

for error_type in error_types:
    print(f"\nSimulating {error_type} errors:")
    for level in levels:
        drifted_text = simulate_drift(text, error_type, level)
        print(f"Stage {level}: {drifted_text}")


Simulating typographical errors:
Stage 0.1: Artificial intellignce is revolutionizing the way we live and work. It is transforming industries yb automating tasks, impzproving efficiency, and enhancing decision-making. As AI technology evolves, it opens up new possibilities for businesses and individuals avlike. However, challenges remain in ensuring ethical use, data privacy, and fairness. As we move forward, it's crucial to balance innovation with responsibility to maximize the benefits while minimizing ridks.
Stage 0.5: Ahrtifuial intelligence is gevuolutonizing the ay we liv eagnd work. It ist ransforming industqries by automating taaks, improving efficeinc,y ad enhancijg decjsion-makngi. As Au technology evopve,si t opens up new posziblites for vbusinesses and iyndividualss azlgike. However, challengew remain iln ensuring ethical uet, data privacy, gand fairnecss. A we move forward, it's crucial to balance innovaion wigh respjonsibilityv to mazomize the benehfits wile minimizing r

# News dataset


In [108]:
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.med",
    "talk.politics.misc",
]

print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['comp.graphics', 'rec.autos', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.med', 'talk.politics.misc']


In [109]:
df = fetch_20newsgroups(
    subset="all",
    categories=categories,
    shuffle=False,
    remove=("headers", "footers", "quotes"),
)

In [110]:
levels = [0.1, 0.5, 0.9]
full_dfs = []
error_type = WordErrorType.TYPOGRAPHICAL

## Apply drift


In [111]:
for level in levels:
    print(f"LEVEL {level}")

    # Preprocess text data and remove empty samples
    original_texts = []
    original_preprocessed_texts = []
    drifted_texts = []
    drifted_preprocessed_texts = []
    targets = []

    for text, target in zip(df.data, df.target):
        # Aplly drift to data
        drifted_text = simulate_drift(text, error_type, level=level)
        if drifted_text:  # Check if text is not empty after preprocessing
            original_preprocessed_text = preprocess_text(text)
            drifted_preprocessed_text = preprocess_text(drifted_text)

            if (
                original_preprocessed_text and drifted_preprocessed_text
            ):  # Check if text is not empty after preprocessing
                original_texts.append(text)
                original_preprocessed_texts.append(original_preprocessed_text)
                drifted_texts.append(drifted_text)
                drifted_preprocessed_texts.append(drifted_preprocessed_text)
                targets.append(target)

    # Create a new DataFrame with preprocessed data
    full_df = pd.DataFrame(
        {
            "original_data": original_texts,
            "original_preprocessed_data": original_preprocessed_texts,
            "drifted_data": drifted_texts,
            "drifted_preprocessed_data": drifted_preprocessed_texts,
            "target": targets,
        }
    )
    full_dfs.append(full_df)
    print("Done")

LEVEL 0.1
Done
LEVEL 0.5
Done
LEVEL 0.9
Done


In [112]:
print(f"ORIGINAL TEXT:\n{full_dfs[0].iat[0,0]} \n")
print(f"ORIGINAL PREPROCESSED TEXT:\n{full_dfs[0].iat[0,1]} \n")

for i in range(len(levels)):
    full_df = full_dfs[i]
    print(f"LEVEL {levels[i]}")
    print(f"DRIFTED TEXT:\n{full_df.iat[0,2]} \n")
    print(f"DRIFTED PREPROCESSED TEXT\n{full_df.iat[0,3]}")
    print("-----------------------------------------------")

ORIGINAL TEXT:

morgan and guzman will have era's 1 run higher than last year, and
 the cubs will be idiots and not pitch harkey as much as hibbard.
 castillo won't be good (i think he's a stud pitcher) 

ORIGINAL PREPROCESSED TEXT:
morgan guzman era's 1 run higher last year cub idiot pitch harkey much hibbard castillo good think stud pitcher 

LEVEL 0.1
DRIFTED TEXT:

morgan and guzman ill have era's 1 run higher than last year, and
 the cubs will be idiots and no tpitch harkey as much pzas hibbard.
 castillo won't be good (i think he's a stud pitcher) 

DRIFTED PREPROCESSED TEXT
morgan guzman ill era's 1 run higher last year cub idiot tpitch harkey much pzas hibbard castillo good think stud pitcher
-----------------------------------------------
LEVEL 0.5
DRIFTED TEXT:

hmorgq and guzman will havf eura's1 run higherthan lasty egar, and
 the cbus will be idiots aqnd not pitch harkey aa much as hibbard.
 acstlilowon't be yood (j think h'se a stus pitcher) 

DRIFTED PREPROCESSED TEXT
hm

## Vectorize


In [None]:
# Initialize the model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare your data in batches for more efficient encoding
batch_size = 32  # You can adjust this based on your system's capacity

X_drifted_list = []
X_list = []

for i in range(len(full_dfs)):
    print(f"start vectorizing original {levels[i]}")
    # Encode in batches
    original_data = full_dfs[i].original_preprocessed_data
    X = model.encode(original_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_list.append(X)
    np.savetxt(f'/data/drifted_data/original_{levels[i]}.csv', X, delimiter=',')

    print(f"start vectorizing drifted {levels[i]}")
    # Encode drifted data in batches
    drifted_data = full_dfs[i].drifted_preprocessed_data
    X_drifted = model.encode(drifted_data, batch_size=batch_size, show_progress_bar=True, device='cpu')  # Ensure CPU usage
    X_drifted_list.append(X_drifted)
    np.savetxt(f'/data/drifted_data/drifted_{levels[i]}.csv', X_drifted, delimiter=',')

start vectorizing original 0.1


Batches: 100%|██████████| 173/173 [01:56<00:00,  1.48it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/data/drifted_/data/original_0.1.csv'

## Evaluate


In [None]:
# Assuming X is your reference dataset and X_drifted is a list of drifted datasets
cosine_similarities_list = []  # List to store cosine similarities for each drifted dataset
labels = []  # Labels for the boxplots

for idx in len(range(full_dfs)):  # Loop through each drifted dataset
    cosine_similarities = [
        cosine_similarity([X_list[idx][i]], [X_drifted[idx][i]])[0][0] for i in range(len(X_list[idx]))
    ]
    cosine_similarities_list.append(cosine_similarities)
    labels.append(f"Drift level {idx}")  # Label each drifted dataset
    average_similarity = np.mean(cosine_similarities)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

# Plot the boxplots for each drifted dataset
plt.figure(figsize=(10, 6))
sns.boxplot(data=cosine_similarities_list)
plt.xticks(ticks=range(len(labels)), labels=labels)
plt.title("Cosine Similarity Distribution for Different Drifted Datasets")
plt.ylabel("Cosine Similarity")
plt.xlabel("Drifted Datasets")
plt.show()

Average Cosine Similarity: 1.0000


IndexError: index 5533 is out of bounds for axis 0 with size 5533