In [1]:
import os
import re
import nltk
import string
from nltk import word_tokenize, pos_tag
import contractions
import pandas as pd
import spacy

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/sg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Load spaCy model outside of the function to avoid reloading it each time the function is called
nlp = spacy.load("en_core_web_sm")

In [5]:
# Function to capitalize the first letter of each sentence and proper nouns
def capitalize_first_and_proper_nouns(text):
    # Process the text using spaCy to create a Doc object
    doc = nlp(text)

    result = []

    # Iterate over the sentences in the Doc
    for sent in doc.sents:
        # Iterate over the tokens in the sentence
        for token in sent:
            # Capitalize the first letter of each sentence and proper nouns
            if token.is_sent_start or token.pos_ == 'PROPN':
                result.append(token.text.capitalize())
            else:
                result.append(token.text)

    # Rejoin the tokens into a single string
    return ' '.join(result)

In [6]:
# Example usage
input_text = "this is a sample sentence. john and mary went to the park. the park was beautiful."
result_text = capitalize_first_and_proper_nouns(input_text)
print(result_text)

This is a sample sentence . John and Mary went to the park . The park was beautiful .


In [7]:
# Function to remove repeated punctuations
def remove_repeated_punctuations(sentence):
    # Use regular expression to remove consecutive repeated punctuations
    cleaned_sentence = re.sub(r'(\W)\1+', r'\1', sentence)
    return cleaned_sentence

# Example usage:
sentence1 = "Hello!!! How are you?? I hope you''re doing well....."
cleaned_sentence1 = remove_repeated_punctuations(sentence1)
print(cleaned_sentence1)

Hello! How are you? I hope you're doing well.


In [8]:
# Function to expand contractions
def expand_contractions(text):
    return contractions.fix(text)

In [9]:
# text = "I'm a student and I've a test tomorrow."
text = "Sure,, I'm a student and it's ok, but I always have let the guy ask me...."
expanded_text = expand_contractions(text)
print(expanded_text) 

Sure,, I am a student and it is ok, but I always have let the guy ask me....


In [10]:
# Define a tokenization function
def tokenize_sentences(sentences):
    return [word_tokenize(sentence) for sentence in sentences]

In [11]:
def fix_general_spacing(sentence):
    # Fix space before punctuation (like ' ,' to ',')
    sentence = re.sub(r'\s([,.?!:;])', r'\1', sentence)
    # Fix space after punctuation (like ' . ' to '. ')
    sentence = re.sub(r'([,.?!:;])\s', r'\1 ', sentence)
    # Fix space in contractions (like "don 't" to "don't")
    sentence = re.sub(r"\b(\w+)\s('t|'s|'m|'ll|'ve|'re|'d|n't)\b", r"\1\2", sentence)
    # Reduce multiple spaces between words to a single space
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    return sentence

In [12]:
def preprocess(text):
    # text = text.lower()
    text = expand_contractions(text)
    text = remove_repeated_punctuations(text)
    text = capitalize_first_and_proper_nouns(text)
    text = fix_general_spacing(text)
    return text

In [13]:
# text = "I'm a student and I've a test tomorrow."
text = "Sure,, I'm a student and it's ok, but I always have let the guy ask me...."
expanded_text = preprocess(text)
print(expanded_text) 

Sure, I am a student and it is ok, but I always have let the guy ask me.


In [14]:
def read_and_pair_data (train_file_EM_informal, train_file_EM_formal):
    # Read the informal and formal sentences from the provided text files
    with open(train_file_EM_informal, 'r', encoding='utf-8') as file:
        informal_sentences = file.readlines()

    with open(train_file_EM_formal, 'r', encoding='utf-8') as file:
        formal_sentences = file.readlines()

    # Preprocess the data 
    informal_sentences = [preprocess(text) for text in informal_sentences]
    formal_sentences = [preprocess(text) for text in formal_sentences]

    # Create dataframes from the sentences lists
    df_informal = pd.DataFrame({'informal': informal_sentences})
    df_formal = pd.DataFrame({'formal': formal_sentences})

    # Strip whitespace from the beginning and end of sentences
    df_informal['informal'] = df_informal['informal'].str.strip()
    df_formal['formal'] = df_formal['formal'].str.strip()

    # Assuming that each line corresponds to a sentence pair, we can concatenate the dataframes
    df_paired = pd.concat([df_informal, df_formal], axis=1)

    return df_paired

In [15]:
# path to train data 
train_file_EM_informal = "./GYAFC_Corpus/Entertainment_Music/train/informal"
train_file_EM_formal = "./GYAFC_Corpus/Entertainment_Music/train/formal"
train_file_FR_informal = "./GYAFC_Corpus/Family_Relationships/train/informal"
train_file_FR_formal = "./GYAFC_Corpus/Family_Relationships/train/formal"

# Get preprocessed dataframes
train_df_EM_paired = read_and_pair_data(train_file_EM_informal, train_file_EM_formal)
train_df_FR_paired = read_and_pair_data(train_file_FR_informal, train_file_FR_formal)

# Tokenize both informal and formal sentences from Entertainment Music
train_df_EM_paired['informal_tokenized'] = tokenize_sentences(train_df_EM_paired['informal'])
train_df_EM_paired['formal_tokenized'] = tokenize_sentences(train_df_EM_paired['formal'])

# Tokenize both informal and formal sentences from Family Relationships
train_df_FR_paired['informal_tokenized'] = tokenize_sentences(train_df_FR_paired['informal'])
train_df_FR_paired['formal_tokenized'] = tokenize_sentences(train_df_FR_paired['formal'])


KeyboardInterrupt: 

In [None]:
print(train_df_EM_paired.head())

                                            informal  \
0  The movie The In - Laws not exactly a holiday ...   
1        That page did not give me viroses(i think )   
2  Of corse i be wachin It Evry day, my fav chara...   
3  Runescape.com ( my kids love it ) & funbrain.c...   
4  Is he Gay?he was on Late Night with Conan O'br...   

                                              formal  \
0  The In - Laws movie is not a holiday movie, bu...   
1          I do not think that page gave me viruses.   
2  I watch it everyday, my favorite charachter is...   
3  Funbrain.com and runescape.com are great for f...   
4  He was on the Late Night show with Conan O'bri...   

                                  informal_tokenized  \
0  [The, movie, The, In, -, Laws, not, exactly, a...   
1  [That, page, did, not, give, me, viroses, (, i...   
2  [Of, corse, i, be, wachin, It, Evry, day, ,, m...   
3  [Runescape.com, (, my, kids, love, it, ), &, f...   
4  [Is, he, Gay, ?, he, was, on, Late, Night, 

In [None]:
print(train_df_FR_paired.head())

                                            informal  \
0  Sure, it is ok, but I always have let the guy ...   
1  Hmmm, I am a guy suffering from verbal abuse f...   
2        You will have more friends that you want.;)   
3  It is nice, you get to see pictures of who you...   
4                           I NEED TO KNOW WHAT 2 DO   

                                              formal  \
0                    I prefer to let the guy ask me.   
1        I suffer through verbal abuse from my wife.   
2          You will have more friends than you want.   
3  It is nice that you get to see pictures of who...   
4                         I need to know what to do.   

                                  informal_tokenized  \
0  [Sure, ,, it, is, ok, ,, but, I, always, have,...   
1  [Hmmm, ,, I, am, a, guy, suffering, from, verb...   
2  [You, will, have, more, friends, that, you, wa...   
3  [It, is, nice, ,, you, get, to, see, pictures,...   
4                   [I, NEED, TO, KNOW, WHAT, 

In [None]:
print(train_df_EM_paired['formal_tokenized'][0])

['The', 'In', '-', 'Laws', 'movie', 'is', 'not', 'a', 'holiday', 'movie', ',', 'but', 'it', 'is', 'okay', '.']


# Creating JSON file for the training data

In [16]:
import json
train_ds = list()

In [17]:
f1 = open("./GYAFC_Corpus/Family_Relationships/train/informal", 'r')
f2 = open("./GYAFC_Corpus/Family_Relationships/train/formal", 'r')

id = 0

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  if not line1:
    break

  rule_based_preprocessed1 = preprocess(line1)

  train_ds.append(
      {
          'id':id,
          'topic':'Family_Relationships',
          'transformation':{
              'informal':rule_based_preprocessed1,
              'formal.ref0':line2,
              'formal.ref1':"",
              'formal.ref2':"",
              'formal.ref3':"",
          }
      }
  )  # adding a row

  id += 1

KeyboardInterrupt: 

In [None]:
f1 = open("./GYAFC_Corpus/Entertainment_Music/train/informal", 'r')
f2 = open("./GYAFC_Corpus/Entertainment_Music/train/formal", 'r')

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  if not line1:
    break

  rule_based_preprocessed1 = preprocess(line1)

  train_ds.append(
      {
          'id':id,
          'topic':'Entertainment_Music',
          'transformation':{
              'informal':rule_based_preprocessed1,
              'formal.ref0':line2,
              'formal.ref1':"",
              'formal.ref2':"",
              'formal.ref3':"",
          }
      }
  )

  id += 1

In [19]:
val_ds = list()


In [20]:
f1 = open("./GYAFC_Corpus/Family_Relationships/tune/informal", 'r')
f2 = open("./GYAFC_Corpus/Family_Relationships/tune/formal.ref0", 'r')
f3 = open("./GYAFC_Corpus/Family_Relationships/tune/formal.ref1", 'r')
f4 = open("./GYAFC_Corpus/Family_Relationships/tune/formal.ref2", 'r')
f5 = open("./GYAFC_Corpus/Family_Relationships/tune/formal.ref3", 'r')

id = 0

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  line3 = f3.readline().rstrip()
  line4 = f4.readline().rstrip()
  line5 = f5.readline().rstrip()
  if not line1:
    break
  
  rule_based_preprocessed_val = preprocess(line1)

  val_ds.append(
      {
          'id':id,
          'topic':'Family_Relationships',
          'transformation':{
              'informal':rule_based_preprocessed_val,
              'formal.ref0':line2,
              'formal.ref1':line3,
              'formal.ref2':line4,
              'formal.ref3':line5,
          }
      })  # adding a row
  id += 1

In [21]:
f1 = open("./GYAFC_Corpus/Entertainment_Music/tune/informal", 'r')
f2 = open("./GYAFC_Corpus/Entertainment_Music/tune/formal.ref0", 'r')
f3 = open("./GYAFC_Corpus/Entertainment_Music/tune/formal.ref1", 'r')
f4 = open("./GYAFC_Corpus/Entertainment_Music/tune/formal.ref2", 'r')
f5 = open("./GYAFC_Corpus/Entertainment_Music/tune/formal.ref3", 'r')

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  line3 = f3.readline().rstrip()
  line4 = f4.readline().rstrip()
  line5 = f5.readline().rstrip()
  if not line1:
    break
  
  rule_based_preprocessed_val = preprocess(line1)

  val_ds.append(
      {
          'id':id,
          'topic':'Entertainment_Music',
          'transformation':{
              'informal':rule_based_preprocessed_val,
              'formal.ref0':line2,
              'formal.ref1':line3,
              'formal.ref2':line4,
              'formal.ref3':line5,
          }
      }) # adding a row
  id += 1

In [22]:
with open("./data_val_preprocessed.json", 'w') as f:
  json.dump(val_ds, f, ensure_ascii=False)

In [23]:
test_ds = list()


In [24]:
f1 = open("./GYAFC_Corpus/Family_Relationships/test/informal", 'r')
f2 = open("./GYAFC_Corpus/Family_Relationships/test/formal.ref0", 'r')
f3 = open("./GYAFC_Corpus/Family_Relationships/test/formal.ref1", 'r')
f4 = open("./GYAFC_Corpus/Family_Relationships/test/formal.ref2", 'r')
f5 = open("./GYAFC_Corpus/Family_Relationships/test/formal.ref3", 'r')

id = 0

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  line3 = f3.readline().rstrip()
  line4 = f4.readline().rstrip()
  line5 = f5.readline().rstrip()
  if not line1:
    break
  rule_based_preprocessed_test = preprocess(line1)
  
  test_ds.append(
      {
          'id':id,
          'topic':'Family_Relationships',
          'transformation':{
              'informal':rule_based_preprocessed_test,
              'formal.ref0':line2,
              'formal.ref1':line3,
              'formal.ref2':line4,
              'formal.ref3':line5,
          }
      })  # adding a row
  id += 1

In [None]:
f1 = open("./GYAFC_Corpus/Entertainment_Music/test/informal", 'r')
f2 = open("./GYAFC_Corpus/Entertainment_Music/test/formal.ref0", 'r')
f3 = open("./GYAFC_Corpus/Entertainment_Music/test/formal.ref1", 'r')
f4 = open("./GYAFC_Corpus/Entertainment_Music/test/formal.ref2", 'r')
f5 = open("./GYAFC_Corpus/Entertainment_Music/test/formal.ref3", 'r')

while True:
  line1 = f1.readline().rstrip()
  line2 = f2.readline().rstrip()
  line3 = f3.readline().rstrip()
  line4 = f4.readline().rstrip()
  line5 = f5.readline().rstrip()
  if not line1:
    break
  
  rule_based_preprocessed_test = preprocess(line1)

  test_ds.append(
      {
          'id':id,
          'topic':'Entertainment_Music',
          'transformation':{
              'informal':line1,
              'formal.ref0':line2,
              'formal.ref1':line3,
              'formal.ref2':line4,
              'formal.ref3':line5,
          }
      })  # adding a row
  id += 1

In [None]:
with open("./data_test_preprocessed.json", 'w') as f:
  json.dump(test_ds, f, ensure_ascii=False)