## Import the necessary libraries

In [25]:
import pandas as pd
import re
import string
import numpy as np
from collections import Counter
import pickle

### Import the corpus     

In [26]:
with open('./Data/Pride and Prejudice - Jane Austen.txt', 'r') as f:
    pride_prejudice = f.read()

with open('./Data/Ulysses - James Joyce.txt', 'r') as f:
    ulysses = f.read()

# text= ' In Pride and Prejudice by Jane Austen, Elizabeth Bennett meets Mr Darcy at a ball hosted by her friend @charles_bingly. They dance, but Mr Darcy finds her behaviour "tolerable, but not handsome enough to tempt him" #rude. She later visits Pemberley, Mr Darcys estate, where she learns more about his character. Check out more information at https://janeausten.co.uk.'
# print(text)

### Corpus cleaner and tokenizer

In [27]:
def clean_corpus(text):

    hashtags = re.findall(r'#[a-zA-Z0-9_]+', text)
    mentions = re.findall(r'@[a-zA-Z0-9_]+', text)
    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    emails = re.findall(
        r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text)
    urls.extend(re.findall(r'www\.[a-z0-9]+.[a-z]+', text))

    print("> URLS : ", urls)
    print("> HASHTAGS : ", hashtags)
    print("> MENTIONS : ", mentions)
    print("> EMAILS : ", emails)

    # convert to lower case
    text = text.lower()

    # regex to find the URLs starting with "http" and replace them with "<URL>"
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '<URL>', text)

    # regex to find the URLs starting with "www" and replace them with "<URL>"
    text = re.sub(r'www\.[a-z0-9]+.[a-z]+', '<URL>', text)

    # convert email addresses to <EMAIL>
    text = re.sub(
        r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '<EMAIL>', text)

    # regex to find all the mentions with @ and replace them with "<MENTION>"
    text = re.sub(r'@[a-zA-Z0-9_]+', '<MENTION>', text)

    # regex to find all the hashtags with # and replace them with "<HASHTAG>"
    text = re.sub(r'#[a-zA-Z0-9_]+', '<HASHTAG>', text)

    # regex to convert "Chapter 1" to "<CHAPTER>"
    text = re.sub(r'Chapter \d+', '<CHAPTER>', text)
    # there are some words which have underscores in the starting and ending, replacing them with just the word
    # remove the underscores from the whole text
    text = re.sub(r'_', '', text)

    # regex to convert all the dates to "<DATE>" of the format dd/mm/yyyy
    text = re.sub(r'\d{1,2}\/\d{1,2}\/\d{2,4}', '<DATE>', text)

    # regex to convert all the dates in the format of "<Date> <Month> <Year>" to "<DATE>"
    text = re.sub(r'\d{1,2} [a-zA-Z]{3,9}  \d{2,4}', '<DATE>', text)

    # regex to convert all the dates in the format of "<Month> <Date>  <Year>" to "<DATE>"
    text = re.sub(r'[a-zA-Z]{3,9} \d{1,2}  \d{2,4}', '<DATE>', text)

    # regex to remove Mr. and Mrs.
    text = re.sub(r'Mr\.|Mrs\.', '', text)

    # replace \n with space
    text = re.sub(r'\n', ' ', text)

    punctuations_to_be_retained = ['#', '@', '<', '>', "'", '"']
    punctuations_to_be_removed = ''.join(
        [p for p in string.punctuation if p not in punctuations_to_be_retained])
    text = re.sub(r'['+punctuations_to_be_removed+']', r' \g<0> ', text)

    text = re.sub(r'\'|\"', '', text) # remove ' and "
    text = re.sub(r'\s+', ' ', text) # remove extra empty lines

    return text
    # print(text)


### Cleaning both the corupus using the function above

In [28]:
pride_prejudice_clean = clean_corpus(pride_prejudice)
ulysses_clean= clean_corpus(ulysses)

print("Number of characters in Pride and Prejudice: ", len(pride_prejudice_clean))

> URLS :  ['http://www.gutenberg.org/dirs/4/2/6/7/42671', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org']
> HASHTAGS :  []
> MENTIONS :  ['@pglaf']
> EMAILS :  ['gbnewby@pglaf.org']
> URLS :  ['https://www.gutenberg.org/4/3/0/4300/', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org', 'www.gutenberg.org']
> HASHTAGS :  []
> MENTIONS :  []
> EMAILS :  []
Number of characters in Pride and Prejudice:  719354


### Setting aside 1000 <u>random</u> sentences from both the corpora for testing

In [29]:
def convert_to_sentences(data):
    # split the text into sentences
    sentences = re.split(r' *[\.\?!][\'"\)\]]* *', data)
    sentences = [s.strip() for s in sentences if len(s) > 0]
    sentences = [s for s in sentences if len(s.split()) > 1]
    # sentences=['<START> <START> <START> '+s+' <END>' for s in sentences]
    
    return sentences

In [30]:
def random_split(sentences):
    test=[]
    test=np.random.choice(sentences, 1000 , replace=False).tolist()
    train=[s for s in sentences if s not in test]
    return(train,test)

In [31]:
def create_vocab(sentences):
    all_words = []
    for s in sentences:
        all_words.extend(s.split())
        
    counted = Counter(all_words)
    return counted

In [32]:
cleaned_sentences_pride_prejudice = convert_to_sentences(pride_prejudice_clean)
cleaned_sentences_ulysses = convert_to_sentences(ulysses_clean)

print("Number of sentences in Pride and Prejudice: ", len(cleaned_sentences_pride_prejudice))

# sentences_pride_prejudice=convert_to_sentences(pride_prejudice)
# sentences_ulysses=convert_to_sentences(ulysses)


train_pride_prejudice,test_pride_prejudice=random_split(cleaned_sentences_pride_prejudice)
train_ulysses,test_ulysses=random_split(cleaned_sentences_ulysses)


# vocab_pride_prejudice, train_pride_prejudice=create_vocab(train_pride_prejudice)
# vocab_ulysse, train_ulysses=create_vocab(train_ulysses)
vocab_pride_prejudice=create_vocab(train_pride_prejudice)
vocab_ulysses=create_vocab(train_ulysses)


print("PRIDE & PREJUDICE VOCAB : ", len(vocab_pride_prejudice))
print("ULYSSES VOCAB : ", len(vocab_ulysses))


Number of sentences in Pride and Prejudice:  6951
PRIDE & PREJUDICE VOCAB :  6267
ULYSSES VOCAB :  29764


In [33]:
# print(vocab_pride_prejudice)

In [34]:
# save the vocab, cleaned sentences and test sentences to a file
with open('./Data/cleaned_pride_prejudice.pkl', 'wb') as f:
    pickle.dump(cleaned_sentences_pride_prejudice, f)
    
with open('./Data/cleaned_ulysses.pkl', 'wb') as f:
    pickle.dump(cleaned_sentences_ulysses, f)

with open('./Data/vocab_pride_prejudice.pkl', 'wb') as f:
    pickle.dump(vocab_pride_prejudice, f)

with open('./Data/train_pride_prejudice.pkl', 'wb') as f:
    pickle.dump(train_pride_prejudice, f)
    
with open('./Data/test_pride_prejudice.pkl', 'wb') as f:
    pickle.dump(test_pride_prejudice, f)

with open('./Data/vocab_ulysses.pkl', 'wb') as f:
    pickle.dump(vocab_ulysses, f)    
    
with open('./Data/train_ulysses.pkl', 'wb') as f:
    pickle.dump(train_ulysses, f)
    
with open('./Data/test_ulysses.pkl', 'wb') as f:   
    pickle.dump(test_ulysses, f)
    