In [1]:
import re

import numpy as np
import torch
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
random.seed(42)
import num2words

In [2]:
def shuffle_string(x: str) -> str:
    s = x
    start, end = s[0], s[-1]
    s = list(s[1:-1])
    random.shuffle(s)
    s = ''.join(s)
    s = start + s + end
    return s


def get_typoglycemia_modified_data(df: pd.DataFrame) -> pd.DataFrame:
    # typo_easy = []
    # typo_hard = []
    typo = []

    for idx, row in df.iterrows():
        text = row['split'].replace('.', '').split(' ')
        # easy = row['Easy'].replace('.', '').split(' ')
        # hard = row['Hard'].replace('.', '').split(' ')

        # shuffle words
        # easy = [shuffle_string(i) if len(i) > 3 else i for i in easy]
        # hard = [shuffle_string(i) if len(i) > 3 else i for i in hard]
        text = [shuffle_string(i) if len(i) > 3 else i for i in text]

        # typo_easy.append(' '.join(easy))
        # typo_hard.append(' '.join(hard))
        typo.append(' '.join(text))

    # df['Easy_Typo'] = typo_easy
    # df['Hard_Typo'] = typo_hard
    df['typoglycemia'] = typo

    return df


def sentence_tokennizer(sentence: str) -> list:
    # Remove all non-alphabet chars
    regex = re.compile('[^a-zA-Z ]')
    sentence = regex.sub('', sentence)
    sentence = sentence.lower()
    sentence = sentence.split(' ')
    # Remove empty strings
    sentence = [i for i in sentence if len(i) != 0]
    return sentence

def sentence_preproces(x:str) -> list:
    #Remove all chars that is not a full stop, space or in the alphabet
    x = re.sub('[^a-zA-Z\s\.]', '', x)
    #Remove multiple dots
    x = re.sub('\.{2,}', ' ', x)
    #Remove . in acronymns
    x = re.sub(r'\b([a-zA-Z]\.){2,}[a-zA-Z]\b', lambda y: y.group().replace('.', ''), x)
    #Remove any lenght of spaces except 1
    x = x = re.sub('\s{2,}', ' ', x)
    #Remove dits in relation to other signs 
#    x = x.re.sub('\.-|-\.|."|".','',x) # not used right now
    return x.strip()


def char_to_index(char):
    if 'a' <= char <= 'z':
        return (ord(char) - ord('a') + 1)
    if char == " ":
        # return ord(char)
        return 26
    else:
        return 0
        
def convert_sentence_to_char_sequence(sentences: pd.Series, max_length: int, target: bool) -> torch.Tensor:

    sequences = np.zeros((len(sentences), max_length), dtype= np.float32) - 1
    
    #If target keep it as a categorical value (int)
    if target:
        sequences = np.zeros((len(sentences), max_length)) - 1

    for sentence_idx, sentence in enumerate(sentences):
        for char_idx, char in enumerate(sentence):
            if char_idx < max_length:
                sequences[sentence_idx, char_idx] = char_to_index(char.lower())
            else:
                break
    
    #If not target, make it a float
    if target == False:
        sequences = sequences/100

    return torch.Tensor(sequences)

def tokenize_dataframe(df: pd.DataFrame, complexity: str) -> pd.DataFrame:
    df.loc[:, complexity] = df[complexity].apply(lambda x: ' '.join(sentence_tokennizer(x)))
    df.loc[:, complexity + "_Typo"] = df[complexity + "_Typo"].apply(lambda x: ' '.join(sentence_tokennizer(x)))
    return df

def get_max_length(df: pd.DataFrame, complexity_level: str):
    # Combine the relevant sentence columns
    all_sentences = pd.concat([df[complexity_level], df[complexity_level + "_Typo"]])

    lengths = all_sentences.str.len()

    # Calculate statistics
    max_length = lengths.max()
    mean_length = lengths.mean()
    std_length = lengths.std()
    median_length = lengths.median()

    # Calculate the five-number summary
    min_length = lengths.min()
    q1_length = lengths.quantile(0.25)  # First quartile
    q3_length = lengths.quantile(0.75)  # Third quartile

    # Print the five-number summary
    print(
        f"Five-number summary: Min: {min_length}, Q1: {q1_length}, Median: {median_length}, Q3: {q3_length}, Max: {max_length}")
    print(f"Mean: {mean_length}, Std Dev: {std_length}")

    # Plot the distribution of lengths
    plt.figure(figsize=(10, 6))
    sns.histplot(lengths, bins=30, kde=True, color='blue', stat='density', alpha=0.6)
    plt.axvline(mean_length, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean_length:.2f}')
    plt.axvline(median_length, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median_length:.2f}')
    plt.axvline(q1_length, color='orange', linestyle='dashed', linewidth=1, label=f'Q1: {q1_length:.2f}')
    plt.axvline(q3_length, color='purple', linestyle='dashed', linewidth=1, label=f'Q3: {q3_length:.2f}')

    plt.title('Distribution of Sentence Lengths')
    plt.xlabel('Length of Sentences')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

    return max_length


# if __name__ == "__main__":
#     sentences = pd.Series(["Hello world", "test sentence"])
#     # sentences = pd.Series(["Hello world"])
#     tensor_output = convert_sentence_to_char_sequence(sentences, 30)
#     print(tensor_output)


  x = re.sub('[^a-zA-Z\s\.]', '', x)
  x = re.sub('\.{2,}', ' ', x)
  x = x = re.sub('\s{2,}', ' ', x)


In [7]:
# Function to replicate rows
def split_rows(row):
    preprocessed = sentence_preproces(row['original'])
    #split_rows = preprocessed.split(',').split('.')
    split_rows = re.split(r'[.,]', preprocessed)
    new_rows = pd.DataFrame({'original':row['original']*len(split_rows),'split': split_rows})
    new_rows['split'].replace('', np.nan, inplace=True)
    return new_rows



In [8]:
def convert_numbers_to_words(sentence):
    return re.sub(r'\b\d+\b', lambda x: num2words(int(x.group())), sentence)

In [3]:
sentences = pd.Series(["Hello world. Today is a good day", "test sentence"])
# sentences = pd.Series(["Hello world"])
tensor_output = convert_sentence_to_char_sequence(sentences, 50, True)
tensor_output

tensor([[ 8.,  5., 12., 12., 15., 26., 23., 15., 18., 12.,  4.,  0., 26., 20.,
         15.,  4.,  1., 25., 26.,  9., 19., 26.,  1., 26.,  7., 15., 15.,  4.,
         26.,  4.,  1., 25., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1.],
        [20.,  5., 19., 20., 26., 19.,  5., 14., 20.,  5., 14.,  3.,  5., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
         -1., -1., -1., -1., -1., -1., -1., -1.]])

In [4]:
sentence_preproces(sentences[0])

Unnamed: 0,0
0,Hello world
1,Today is a good day


In [85]:
df = pd.read_csv("../data/raw/sscorpus.gz", sep="\t", names=["Hard", "Easy", "Similarity"])

In [9]:
def generate_typoglycemia_data_file(similarity_threshold: float, file_path: str):
    df = pd.read_csv(file_path, sep="\t", names=["Hard", "Easy", "Similarity"])
    print(df.shape)
    df = df[df["Similarity"] <= similarity_threshold]
    print(df.shape)

    #shsf
    
    df_hard = pd.DataFrame(columns=['original'])#, 'preproces', 'split', 'typoglycemia'])
    df_easy = pd.DataFrame(columns=['original'])#, 'preproces', 'split', 'typoglycemia'])
    df_hard['original'] = df['Hard']
    df_easy['original'] = df['Easy']

    #Split sentence at full stop and clean the sentences
    df_hard = pd.concat([split_rows(row) for _, row in df_hard.iterrows()], ignore_index=True)
    df_hard = df_hard[df_hard['split'].notna()]  


    df_hard['typoglycemia'] = df_hard['split'].apply(convert_numbers_to_words)
    df_hard = get_typoglycemia_modified_data(df_hard)
    df_hard.reset_index(inplace=True, drop=True)
    df_hard.to_csv("../data/processed/sscorpus_hard2.csv", index=False)

    df_easy = pd.concat([split_rows(row) for _, row in df_easy.iterrows()], ignore_index=True)
    df_easy = df_easy[df_easy['split'].notna()]
    
    df_easy['typoglycemia'] = df_easy['split'].apply(convert_numbers_to_words)
    df_easy = get_typoglycemia_modified_data(df_easy)
    df_easy.reset_index(inplace=True, drop=True)
    df_easy.to_csv("../data/processed/sscorpus_easy2.csv", index=False)

    return df_hard, df_easy

In [10]:
df_hard, df_easy = generate_typoglycemia_data_file(0.7, "../data/raw/sscorpus.gz")

(492993, 3)
(307038, 3)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_rows['split'].replace('', np.nan, inplace=True)
  new_rows['split'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_rows['split'].replace('', np.nan, inplace=True)


In [12]:
df_hard.shape

(334954, 3)