In [1]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [21]:
def encode_text(model, tokenizer, text):
    """Tokenize ``text``, run ``model`` on it and return the flattened hidden states.

    Args:
        model: callable invoked as ``model(**encoded)``; its result must expose
            ``last_hidden_state``.
        tokenizer: callable invoked with ``text`` plus the truncation/padding
            options below; must return a mapping of keyword arguments for ``model``.
        text: input handed to the tokenizer unchanged.

    Returns:
        ``last_hidden_state`` with every dimension after the first flattened
        into one, computed with gradient tracking disabled.
    """
    tokenizer_options = dict(
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    encoded = tokenizer(text, **tokenizer_options)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return torch.flatten(hidden_states, start_dim=1)

In [3]:
def find_cosine_similarity(embeddings):
    """Return the pairwise cosine-similarity matrix of the rows of ``embeddings``.

    Args:
        embeddings: torch tensor; it is detached from the autograd graph and
            converted to a NumPy array before being passed to
            ``cosine_similarity``.  # assumes a 2-D (rows, features) layout — TODO confirm

    Returns:
        The array produced by ``cosine_similarity(embeddings)``.
    """
    # .cpu() is a no-op for tensors already on the CPU, but .numpy() raises for
    # tensors living on another device, so move the data first.
    embeddings = embeddings.detach().cpu().numpy()
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

In [18]:
# Checkpoint identifier; the tokenizer and the model are both loaded from this
# same name so that they match each other.
model_name = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

In [6]:
from pathlib import Path

INPUT_DIR = Path("../data/")

def load_data(file_name: str) -> pd.DataFrame:
    """Read the comma-separated file ``file_name`` located under ``INPUT_DIR``."""
    csv_path = INPUT_DIR / file_name
    frame = pd.read_csv(csv_path, sep=',')
    return frame

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``generated`` column (if any) is named ``label``.

    The input frame itself is left untouched.
    """
    column_aliases = {'generated': 'label'}
    return df.rename(column_aliases, axis="columns")

# Load the base training essays plus three external essay collections.
train_df = load_data("train_essays_RDizzl3_seven_v1.csv")
# test_df = load_data("llm-detect-ai-generated-text/test_essays.csv")
external_df = load_data("daigt-external-dataset/daigt_external_dataset.csv")
external_df_gpt_4 = load_data("gpt-4/ai_generated_train_essays_gpt-4.csv")
external_df_gpt_3_5 = load_data("gpt-4/ai_generated_train_essays.csv")

# Harmonise the target column name ('generated' -> 'label') across all frames.
train_df = preprocess_data(train_df)
# test_df = preprocess_data(test_df)
external_df = preprocess_data(external_df)
external_df_gpt_4 = preprocess_data(external_df_gpt_4)
external_df_gpt_3_5 = preprocess_data(external_df_gpt_3_5)

# Build the external frame as one non-mutating chain. Assigning columns on a
# frame that was just sliced out of another frame is the pattern that triggers
# pandas' SettingWithCopyWarning; rename/assign return fresh frames instead.
external_df = (
    external_df[["id", "source_text"]]
    .rename(columns={"source_text": "text"})
    .assign(
        # Newlines are removed outright (not replaced by a space), as before.
        text=lambda frame: frame["text"].str.replace('\n', '', regex=False),
        # Every row of this collection gets label 1.
        label=1,
    )
)

external_df_gpt_3_5 = external_df_gpt_3_5[["id", "text", "label"]]
external_df_gpt_4 = external_df_gpt_4[["id", "text", "label"]]

# ignore_index=True gives the combined frame a fresh 0..n-1 index in one step,
# replacing the separate in-place reset_index call.
train_df = pd.concat(
    [train_df, external_df, external_df_gpt_3_5, external_df_gpt_4],
    ignore_index=True,
)

In [8]:
# Number of rows per label value in the combined training frame.
train_df["label"].value_counts()

label
0    14247
1     4741
Name: count, dtype: int64

In [22]:
from tqdm import tqdm

# Encode the essays one at a time, in row order; tqdm reports progress while
# iterating over the "text" column.
embeddings_cache = [
    encode_text(model, tokenizer, essay)
    for essay in tqdm(train_df['text'])
]


100%|██████████| 18988/18988 [15:41<00:00, 20.18it/s]


In [24]:
# Smallest second-dimension size among the cached tensors (infinity if the
# cache is empty). Each size is unpacked as a pair, so tensors must be 2-D.
widths = (width for _rows, width in (tensor.size() for tensor in embeddings_cache))
min_s = min(widths, default=float("inf"))

# Cut every tensor down to that shared width so they can be stacked together.
truncated = [tensor[:, :min_s] for tensor in embeddings_cache]
embeddings = torch.stack(truncated)

In [25]:
import pickle

# Persist the stacked embeddings tensor to disk as a pickle file.
with open('embeddings.pkl', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(embeddings)

In [26]:
# torch.stack adds a new leading dimension, so the stacked tensor carries the
# per-text dimension at position 1. Squeeze only that dimension: a bare
# .squeeze() would also drop the row dimension if exactly one embedding had
# been stacked, handing a 1-D tensor to the similarity routine.
# assumes every cached tensor has size 1 along its first dimension — TODO confirm
similarity_matrix = find_cosine_similarity(embeddings.squeeze(1))

In [None]:
# Convert similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix)

# Save DataFrame as a CSV file
# NOTE(review): the matrix has one row and one column per essay, so this text
# file grows quadratically with the dataset — confirm the size is acceptable
# before re-running.
similarity_df.to_csv('similarity_matrix.csv', index=False)

In [None]:
# Show the stacked embeddings with size-1 dimensions removed (torch abbreviates long tensors).
print(embeddings.squeeze())

tensor([[-0.0658, -0.0051,  0.2925,  ..., -0.1971,  0.0925, -0.0801],
        [-0.1036, -0.0420,  0.2502,  ..., -0.1841,  0.0200,  0.0142],
        [-0.2441, -0.0084,  0.2667,  ..., -0.2447, -0.0320,  0.0709],
        ...,
        [-0.4004,  0.0520,  0.1095,  ..., -0.1828, -0.0750,  0.2341],
        [-0.3682,  0.0582,  0.1055,  ..., -0.2123, -0.0902,  0.3081],
        [-0.3774,  0.0214,  0.1160,  ..., -0.1786, -0.0731,  0.3472]])


In [None]:
# Display the pairwise similarity table.
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18978,18979,18980,18981,18982,18983,18984,18985,18986,18987
0,1.000000,0.980377,0.953450,0.964251,0.955121,0.902250,0.898201,0.977665,0.924432,0.908571,...,0.883900,0.874066,0.869140,0.895648,0.885033,0.888180,0.875624,0.880364,0.881157,0.874225
1,0.980377,1.000000,0.965684,0.968754,0.949402,0.896032,0.898882,0.972291,0.916763,0.911673,...,0.904814,0.893297,0.888036,0.911304,0.905047,0.909399,0.899104,0.902572,0.902285,0.898123
2,0.953450,0.965684,1.000000,0.970721,0.932136,0.892376,0.893496,0.962282,0.912440,0.913081,...,0.912915,0.896973,0.889751,0.918671,0.914280,0.917474,0.904211,0.918049,0.906072,0.897587
3,0.964251,0.968754,0.970721,1.000000,0.950835,0.894797,0.898959,0.965639,0.924111,0.913108,...,0.889760,0.876750,0.870924,0.897916,0.890625,0.895828,0.881382,0.890384,0.886686,0.881727
4,0.955121,0.949402,0.932136,0.950835,1.000000,0.907739,0.882868,0.962430,0.923434,0.895579,...,0.868458,0.853987,0.844398,0.877426,0.860563,0.868176,0.855701,0.863463,0.857391,0.850059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18983,0.888180,0.909399,0.917474,0.895828,0.868176,0.916860,0.948456,0.890031,0.930657,0.956759,...,0.988086,0.984921,0.985763,0.985548,0.983420,1.000000,0.987349,0.989279,0.984894,0.985153
18984,0.875624,0.899104,0.904211,0.881382,0.855701,0.905982,0.943565,0.875099,0.914600,0.948409,...,0.985127,0.979742,0.987599,0.983106,0.974627,0.987349,1.000000,0.988516,0.987691,0.986637
18985,0.880364,0.902572,0.918049,0.890384,0.863463,0.906959,0.942916,0.883161,0.919602,0.952539,...,0.983577,0.977437,0.983359,0.985240,0.975453,0.989279,0.988516,1.000000,0.984768,0.983444
18986,0.881157,0.902285,0.906072,0.886686,0.857391,0.905537,0.955590,0.875980,0.920469,0.955475,...,0.981429,0.983604,0.987412,0.985727,0.981420,0.984894,0.987691,0.984768,1.000000,0.987459


In [17]:
# Peek at the first two rows of the combined training frame.
train_df.head(2)

Unnamed: 0,text,label,id
0,Cars. Cars have been around since they became ...,0,
1,Transportation is a large necessity in most co...,0,


In [27]:
# Display the raw similarity array returned by find_cosine_similarity.
similarity_matrix

array([[1.0000002 , 0.75881207, 0.59272695, ..., 0.32630736, 0.40573612,
        0.42545748],
       [0.75881207, 1.0000001 , 0.5744054 , ..., 0.32749844, 0.41742715,
        0.4284206 ],
       [0.59272695, 0.5744054 , 1.0000002 , ..., 0.4013688 , 0.36186668,
        0.44769996],
       ...,
       [0.32630736, 0.32749844, 0.4013688 , ..., 0.99999994, 0.6303203 ,
        0.7013615 ],
       [0.40573612, 0.41742715, 0.36186668, ..., 0.6303203 , 0.9999995 ,
        0.69305617],
       [0.42545748, 0.4284206 , 0.44769996, ..., 0.7013615 , 0.69305617,
        0.9999998 ]], dtype=float32)

In [36]:
# Cosine similarity at or above this value marks a pair as "similar".
SIMILARITY_THRESHOLD = 0.98

# Boolean mask with the same shape as similarity_matrix.
is_similar = similarity_matrix >= SIMILARITY_THRESHOLD
is_similar

array([[ True, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       ...,
       [False, False, False, ...,  True, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False,  True]])

In [37]:
from collections import defaultdict
R, C = len(is_similar), len(is_similar[0])

# cache[i] holds every column index j > i whose pair (i, j) is flagged similar.
cache = defaultdict(set)

# Let NumPy locate the True cells instead of probing every (i, j) cell from
# Python. nonzero() reports coordinates in row-major order, so keys are still
# inserted in ascending row order, exactly as the nested loops did.
rows, cols = is_similar.nonzero()
for i, j in zip(rows.tolist(), cols.tolist()):
    # Keep only the strict upper triangle so each pair is recorded once, under
    # its smaller index (this also skips the diagonal).
    if j > i:
        cache[i].add(j)

In [43]:
# Key whose set of similar later rows is the largest.
largest_cluster_key = max(cache, key=lambda x: len(cache[x]))
print(largest_cluster_key)
# Look the set up through the computed key rather than a number copied from an
# earlier output: with a defaultdict, a stale hard-coded key would silently
# insert (and print) an empty set instead of failing.
print(cache[largest_cluster_key])

1517
{18304, 1537, 18435, 18436, 18309, 18439, 1546, 18447, 1554, 18323, 18456, 18329, 1562, 18334, 18464, 18343, 1576, 1577, 18472, 1581, 1585, 18486, 18487, 1592, 18491, 18495, 1600, 18369, 18502, 18378, 1611, 18510, 18385, 1618, 1621, 18521, 18399, 18528, 18401, 18402, 18531, 18404, 1519, 1520, 18289, 18418, 1525, 1526, 18293, 18422, 1529, 18427, 18429, 18430}


In [44]:
# Essay text of row 1517 (the key printed by the largest-set look-up).
train_df.loc[1517, "text"]

'Advantages of Limiting Car Usage\n\nLimiting car usage can have several advantages for both individuals and the environment. As evidenced by the examples of Vauban in Germany, Paris, and Bogota, there are significant benefits to reducing car dependency and promoting alternative modes of transportation. These advantages include improved air quality, reduced greenhouse gas emissions, less traffic congestion, and healthier and more sustainable communities.\n\nOne of the most immediate benefits of limiting car usage is improved air quality. In many cities, such as Paris, smog levels have reached dangerous levels due to car emissions. By implementing driving bans like the one enforced in Paris, the number of cars on the road decreases, leading to a significant reduction in air pollution. This not only protects the health of residents but also helps combat climate change by reducing greenhouse gas emissions.\n\nSpeaking of greenhouse gas emissions, limiting car usage is essential in the fig

In [45]:
# Essay text of row 18304, one of the rows flagged as similar to row 1517.
train_df.loc[18304, "text"]

'Advantages of Limiting Car Usage\n\nLimiting car usage has numerous advantages for both individuals and the environment. This is evident in the examples of Vauban, Paris, and Bogota, where innovative approaches to transportation have had positive effects. By reducing car dependency, these communities have managed to improve air quality, reduce traffic congestion, and promote alternative modes of transportation.\n\nOne of the main advantages of limiting car usage is the positive impact on the environment. According to the passage, passenger cars are responsible for a significant portion of greenhouse gas emissions. In Europe, cars account for 12 percent of these emissions, and in car-intensive areas in the United States, they contribute up to 50 percent. By reducing the number of cars on the road, as seen in Vauban and during the car-free day in Bogota, these communities are effectively reducing their carbon footprint and helping to combat climate change.\n\nAdditionally, limiting car 

In [46]:
# Essay text of row 1537, another row flagged as similar to row 1517.
train_df.loc[1537, "text"]

'Advantages of Limiting Car Usage\n\nLimiting car usage can have numerous advantages for both individuals and society as a whole. By reducing car dependency, communities can experience improved air quality, decreased traffic congestion, and increased accessibility to public transportation. This can contribute to a healthier environment and enhanced quality of life for residents.\n\nOne of the main advantages of limiting car usage is the reduction in greenhouse gas emissions. Passenger cars are responsible for a significant percentage of these emissions, with Europe attributing 12% and some car-intensive areas in the United States attributing up to 50% of their emissions to cars (Rosenthal). By promoting alternatives to car travel, such as walking, biking, and public transportation, communities can help combat climate change and improve air quality.\n\nAnother benefit of limiting car usage is the reduction in traffic congestion. Cities that have implemented car-free zones or restricted 

In [35]:
# Spot-check the similarity score of one specific pair of rows (239 and 13958).
similarity_matrix[239, 13958]

0.9658046

In [47]:
# Merge every per-row set stored in cache into one set of row indices; each
# index in it is the later member of at least one similar pair.
all_set = set().union(*cache.values())

print(all_set)

{18791, 18792, 18793, 18794, 18795, 18796, 18797, 18798, 18799, 18801, 18802, 18803, 18804, 18806, 18808, 18809, 18812, 18813, 18817, 18818, 18820, 18821, 18823, 18825, 18827, 18831, 18834, 18835, 18836, 18837, 18838, 18839, 18840, 18842, 18843, 18844, 18845, 18846, 12909, 12910, 18848, 12916, 18850, 12923, 12924, 12932, 12936, 12937, 12941, 18855, 12947, 12950, 12952, 12953, 12955, 12956, 18857, 12960, 12961, 18858, 12963, 12965, 18859, 12967, 18860, 12971, 12973, 18861, 12979, 12983, 18864, 18865, 18866, 18867, 18868, 13014, 18869, 13019, 18870, 18871, 13026, 13028, 13030, 18872, 18873, 18874, 13042, 13045, 18875, 13048, 18876, 13052, 13053, 13054, 18877, 13056, 13059, 13060, 13063, 13065, 18879, 13068, 13071, 13074, 13076, 13078, 13081, 13085, 13086, 13087, 13090, 13094, 13095, 13096, 18885, 18886, 13106, 13118, 13123, 18891, 18892, 13136, 18893, 13146, 13151, 13152, 13156, 13162, 18899, 13167, 13175, 13178, 13179, 13181, 18902, 13184, 18903, 13188, 13193, 13194, 13198, 13206, 13207

In [48]:
# How many distinct rows were collected as later members of a similar pair.
len(all_set)

1820

In [49]:
# Display the full row -> similar-later-rows mapping.
# NOTE(review): this prints the whole dictionary; consider showing only a few entries.
cache

defaultdict(set,
            {0: {2086},
             1: {2989},
             2: {3320},
             3: {3188},
             4: {2948},
             5: {12937},
             6: {14636},
             7: {2640},
             9: {13085},
             10: {13638},
             11: {3489},
             12: {13891},
             13: {13490},
             14: {3805},
             15: {2636},
             16: {14260},
             17: {2131},
             18: {2550},
             19: {3130},
             20: {14805},
             22: {2460},
             24: {13063},
             25: {2091},
             27: {2956},
             28: {13230},
             29: {3317},
             31: {2238},
             32: {14655},
             35: {2437},
             36: {2184},
             37: {2634},
             39: {14710},
             40: {2842},
             41: {13167},
             42: {13773},
             43: {13240},
             44: {2291},
             45: {2828},
             46: {3975},
  

In [50]:
# Remove every row whose index label was collected in all_set; train_df itself
# is not modified because drop returns a new frame.
new_train_df = train_df.drop(index=list(all_set))

In [51]:
# Number of rows removed by the drop; expected to equal len(all_set).
len(train_df) - len(new_train_df)

1820

In [54]:
def cleaning(dataset):
    """Return a cleaned copy of ``dataset``; the input frame is not modified.

    The ``text`` column is processed in this order:

    1. Surrounding whitespace is stripped.
    2. Every literal backslash-n sequence (the two characters backslash and
       ``n``, not a real newline character) is replaced by a single space.
    3. For each of the markers ``'ubject: '``, ``'Zip'`` and ``'ZIP'`` only the
       part after the last occurrence is kept, and the result is stripped again.

    Finally a ``generated`` column, if present, is renamed to ``label``.

    Args:
        dataset: DataFrame with a string column named ``text``.

    Returns:
        A new DataFrame with the cleaned ``text`` column.
    """
    # Work on a copy so the caller's frame keeps its original text.
    cleaned = dataset.copy()

    text = cleaned["text"].str.strip()

    # Series.replace only swaps cells whose *entire* value equals the pattern,
    # so it never touched substrings; the .str accessor performs the intended
    # substring replacement. regex=False pins the pattern to the literal
    # two-character sequence regardless of the pandas version's default.
    text = text.str.replace("\\n", " ", regex=False)

    # Keep only what follows the last occurrence of each header-like marker.
    for marker in ("ubject: ", "Zip", "ZIP"):
        text = text.str.split(marker).str[-1].str.strip()

    cleaned["text"] = text
    return cleaned.rename(columns={'generated': 'label'})

In [55]:
# Run the text clean-up on the de-duplicated frame and keep the returned frame.
new_train_df = cleaning(new_train_df)

In [1]:
import random

def introduce_typo(text, error_rate=0.05, rng=None):
    """Return ``text`` with random character-level typos injected.

    ``int(len(text) * error_rate)`` edit attempts are made. Each attempt picks
    one edit type and one position at random:

    * substitution   - overwrite the character with a random lowercase letter;
    * insertion      - insert a random lowercase letter before the position;
    * deletion       - remove the character (skipped if only one is left);
    * transposition  - swap with the next character (skipped at the last position).

    A skipped attempt still counts, so fewer edits than attempts may be applied.

    Args:
        text: string to corrupt.
        error_rate: attempts per character of the original text.
        rng: optional object providing ``choice`` and ``randint`` (for example
            a ``random.Random`` instance) so that a call can be made
            reproducible on its own. Defaults to the module-level ``random``
            generator, which keeps ``random.seed(...)`` in control as before.

    Returns:
        The corrupted string; ``text`` unchanged when no attempts are made.
    """
    if rng is None:
        rng = random

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    error_types = ['substitution', 'insertion', 'deletion', 'transposition']

    new_text = list(text)
    errors = int(len(text) * error_rate)

    for _ in range(errors):
        error_type = rng.choice(error_types)
        # The position is drawn against the *current* length, which changes as
        # insertions and deletions are applied.
        position = rng.randint(0, len(new_text) - 1)

        if error_type == 'substitution':
            new_text[position] = rng.choice(alphabet)
        elif error_type == 'insertion':
            new_text.insert(position, rng.choice(alphabet))
        elif error_type == 'deletion' and len(new_text) > 1:
            # Never delete the last remaining character, so later position
            # draws always have a valid range.
            new_text.pop(position)
        elif error_type == 'transposition' and position < len(new_text) - 1:
            new_text[position], new_text[position + 1] = new_text[position + 1], new_text[position]

    return ''.join(new_text)

In [57]:
# Save the de-duplicated, cleaned essays; index=False leaves the row index out of the file.
new_train_df.to_csv("../data/clean_data.csv", index=False)

In [59]:
# Overwrite "text" with typo-injected versions (default error_rate of
# introduce_typo) and save them to a separate file from the clean data.
# NOTE(review): no random.seed call is visible before this cell, so the output
# presumably differs between runs — confirm whether that is intended.
new_train_df["text"] = new_train_df["text"].apply(introduce_typo)
new_train_df.to_csv("../data/typo_005_data.csv", index=False)

In [9]:
import pandas as pd
import random
random.seed(42)

def introduce_typo(text, error_rate=0.05, rng=None):
    """Return ``text`` with random character-level typos injected.

    ``int(len(text) * error_rate)`` edit attempts are made. Each attempt picks
    one edit type and one position at random:

    * substitution   - overwrite the character with a random lowercase letter;
    * insertion      - insert a random lowercase letter before the position;
    * deletion       - remove the character (skipped if only one is left);
    * transposition  - swap with the next character (skipped at the last position).

    A skipped attempt still counts, so fewer edits than attempts may be applied.

    Args:
        text: string to corrupt.
        error_rate: attempts per character of the original text.
        rng: optional object providing ``choice`` and ``randint`` (for example
            a ``random.Random`` instance) so that a call can be made
            reproducible on its own. Defaults to the module-level ``random``
            generator, which keeps ``random.seed(...)`` in control as before.

    Returns:
        The corrupted string; ``text`` unchanged when no attempts are made.
    """
    if rng is None:
        rng = random

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    error_types = ['substitution', 'insertion', 'deletion', 'transposition']

    new_text = list(text)
    errors = int(len(text) * error_rate)

    for _ in range(errors):
        error_type = rng.choice(error_types)
        # The position is drawn against the *current* length, which changes as
        # insertions and deletions are applied.
        position = rng.randint(0, len(new_text) - 1)

        if error_type == 'substitution':
            new_text[position] = rng.choice(alphabet)
        elif error_type == 'insertion':
            new_text.insert(position, rng.choice(alphabet))
        elif error_type == 'deletion' and len(new_text) > 1:
            # Never delete the last remaining character, so later position
            # draws always have a valid range.
            new_text.pop(position)
        elif error_type == 'transposition' and position < len(new_text) - 1:
            new_text[position], new_text[position + 1] = new_text[position + 1], new_text[position]

    return ''.join(new_text)

def create_typo_dataset(p=0.05, data_dir="../data"):
    """Write a typo-injected copy of ``clean_data.csv``.

    Reads ``<data_dir>/clean_data.csv``, passes every ``text`` value through
    ``introduce_typo`` with error rate ``p`` and writes the result to
    ``<data_dir>/typo_<PPP>_data.csv``, where ``PPP`` is ``p`` expressed as a
    whole percentage, zero-padded to three digits.

    Args:
        p: error rate forwarded to ``introduce_typo``.
        data_dir: directory holding the input file and receiving the output
            file. The default reproduces the previously hard-coded location.

    Returns:
        None.
    """
    # Local import so the function does not depend on imports made in other cells.
    from pathlib import Path

    data_dir = Path(data_dir)
    base = pd.read_csv(data_dir / "clean_data.csv", sep=",")
    base["text"] = base["text"].apply(lambda x: introduce_typo(x, p))

    # round(), not int(): products such as 0.29 * 100 evaluate to
    # 28.999999999999996, which int() would truncate to 28 and mislabel the file.
    percent = round(p * 100)
    base.to_csv(data_dir / f"typo_{str(percent).zfill(3)}_data.csv", index=False)
    
# Build the variant with a 0.25 error rate (written as typo_025_data.csv).
create_typo_dataset(0.25)