In [15]:
import os
import re
# import nltk
import string
from nltk import word_tokenize, pos_tag
import contractions
import pandas as pd

In [None]:
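# One-time setup: uncomment the two lines below (and `import nltk` in the
# imports cell) to download the NLTK resources that word_tokenize and pos_tag need.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')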

In [16]:
# Function to capitalize the first letter of each sentence and proper nouns
def capitalize_first_and_proper_nouns(text):
    """Capitalize sentence-initial tokens and proper nouns in ``text``.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``.
    A token gets an upper-case first character when it
      * is the first token of the text,
      * directly follows a sentence terminator ('.', '!' or '?'), or
      * is tagged as a proper noun ('NNP' or 'NNPS').
    Every other token is kept exactly as the tokenizer produced it.

    Args:
        text: The string to process.

    Returns:
        The tokens joined with single spaces. Because of that re-join,
        whatever the tokenizer emits as a separate token ends up surrounded
        by spaces in the result.
    """
    sentence_enders = {'.', '!', '?'}
    proper_noun_tags = {'NNP', 'NNPS'}

    result = []
    previous_token = None  # None marks the very start of the text
    for token, tag in pos_tag(word_tokenize(text)):
        starts_sentence = previous_token is None or previous_token in sentence_enders
        if starts_sentence or tag in proper_noun_tags:
            # Upper-case only the first character: str.capitalize() would also
            # lower-case the remainder and mangle tokens such as "USA" or "iPhone".
            token = token[:1].upper() + token[1:]
        result.append(token)
        previous_token = token
    return ' '.join(result)

In [17]:
# Function to remove repeated punctuations
def remove_repeated_punctuations(text):
    """Collapse each run of one repeated punctuation character to a single one.

    Only consecutive repeats of the *same* character from
    ``string.punctuation`` are collapsed ("!!!" -> "!"); a mixed sequence
    such as "?!" is left untouched.
    """
    punctuation_class = re.escape(string.punctuation)
    repeated_run = re.compile(r'([' + punctuation_class + r'])\1+')
    return repeated_run.sub(r'\1', text)

In [18]:
# Function to expand contractions
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

In [19]:
# Define a tokenization function
def tokenize_sentences(sentences):
    """Apply ``word_tokenize`` to every sentence and return the results as a list.

    Args:
        sentences: An iterable of strings.

    Returns:
        A list with one ``word_tokenize`` result per input sentence, in the
        same order as the input.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(word_tokenize(sentence))
    return tokenized

In [32]:
def fix_general_spacing(sentence):
    """Tidy the spacing around punctuation and detached contraction suffixes.

    Three regex substitutions are applied one after another, each working on
    the output of the previous one.
    """
    rewrite_rules = (
        # Drop the whitespace character right before , . ? ! : ;  (' ,' -> ',')
        (r'\s([,.?!:;])', r'\1'),
        # Turn the whitespace character right after one of those marks into a plain space
        (r'([,.?!:;])\s', r'\1 '),
        # Glue a detached suffix back onto the preceding word ("don 't" -> "don't")
        (r"\b(\w+)\s('t|'s|'m|'ll|'ve|'re|'d|n't)\b", r"\1\2"),
    )
    for pattern, replacement in rewrite_rules:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence

In [33]:
def preprocess(text):
    """Normalize one raw sentence by running the cleaning steps in a safe order.

    Order of the steps and why it matters:
      1. Expand contractions while the text is still intact. The
         capitalization step tokenizes the text and re-joins the tokens with
         spaces, which splits contractions apart (NLTK turns "don't" into
         "do" + "n't"), so expanding afterwards would miss them.
      2. Capitalize sentence starts and proper nouns (tokenize + re-join).
      3. Repair the spacing around punctuation and suffixes that the
         re-join in step 2 introduced.
      4. Collapse repeated punctuation last. Marks that the tokenizer split
         into separate tokens ("! ! !") are only adjacent again after step 3;
         collapsing earlier would leave them to be glued back into "!!!".

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    # text = text.lower()
    text = expand_contractions(text)
    text = capitalize_first_and_proper_nouns(text)
    text = fix_general_spacing(text)
    text = remove_repeated_punctuations(text)
    return text

In [21]:
def read_and_pair_data (train_file_EM_informal, train_file_EM_formal):
    """Load two line-aligned text files into one preprocessed, paired dataframe.

    Args:
        train_file_EM_informal: Path of the UTF-8 file with informal sentences,
            one per line.
        train_file_EM_formal: Path of the UTF-8 file with formal sentences,
            one per line.

    Returns:
        A dataframe with the columns 'informal' and 'formal'. Row i holds
        line i of each file after ``preprocess`` and whitespace stripping.
        The columns are concatenated side by side, so the lines are assumed
        to correspond one-to-one between the two files.
    """
    # Read both files up front, before any preprocessing happens
    with open(train_file_EM_informal, 'r', encoding='utf-8') as informal_handle:
        informal_lines = informal_handle.readlines()
    with open(train_file_EM_formal, 'r', encoding='utf-8') as formal_handle:
        formal_lines = formal_handle.readlines()

    frames = []
    for column, raw_lines in (('informal', informal_lines), ('formal', formal_lines)):
        cleaned = [preprocess(line) for line in raw_lines]
        frame = pd.DataFrame({column: cleaned})
        # Remove leading and trailing whitespace left on each sentence
        frame[column] = frame[column].str.strip()
        frames.append(frame)

    # Place the two single-column frames next to each other, aligned on row index
    return pd.concat(frames, axis=1)

In [38]:
# Locations of the GYAFC training splits: one informal/formal file pair per domain
train_file_EM_informal = "./GYAFC_Corpus/Entertainment_Music/train/informal"
train_file_EM_formal = "./GYAFC_Corpus/Entertainment_Music/train/formal"
train_file_FR_informal = "./GYAFC_Corpus/Family_Relationships/train/informal"
train_file_FR_formal = "./GYAFC_Corpus/Family_Relationships/train/formal"

# Load and preprocess each domain into a paired dataframe
train_df_EM_paired = read_and_pair_data(train_file_EM_informal, train_file_EM_formal)
train_df_FR_paired = read_and_pair_data(train_file_FR_informal, train_file_FR_formal)

# Add a token-list column beside each sentence column:
# Entertainment Music first, then Family Relationships
for paired_frame in (train_df_EM_paired, train_df_FR_paired):
    for sentence_col, token_col in (('informal', 'informal_tokenized'),
                                    ('formal', 'formal_tokenized')):
        paired_frame[token_col] = tokenize_sentences(paired_frame[sentence_col])



In [None]:
# Preview the first rows; a bare last expression is rendered as a rich table by the notebook
train_df_EM_paired.head()

In [None]:
# Preview the first rows; a bare last expression is rendered as a rich table by the notebook
train_df_FR_paired.head()