# NLP Project

Urdu to Roman Urdu Transliterator


Help Reference:
https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt

### This notebook contains the data cleaning and tokenization for our models

In [1]:
# imports
from sklearn.model_selection import train_test_split
import re
import string
import unicodedata
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
import os
import pickle

In [2]:
# colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# load datasets
with open('/content/drive/MyDrive/NLP Project/Urdu.txt', 'r') as f:
  urdu = f.read()
  urdu = urdu.split('\n')

with open('/content/drive/MyDrive/NLP Project/Roman-Urdu.txt', 'r') as f:
  roman_urdu = f.read()
  roman_urdu = roman_urdu.split('\n')

In [4]:
# only using 60% of the data due to computational constraints
urdu, discard_urdu, roman_urdu, discard_roman = train_test_split(urdu, roman_urdu, test_size=0.4, random_state=42)

In [5]:
# split dataset to train-val-test set with ratio of 90%, 10%
train_urdu, test_urdu, train_roman, test_roman = train_test_split(urdu, roman_urdu, test_size=0.1, random_state=42)

In [6]:
# get a look at the sizes of each dataset
print(len(urdu), len(roman_urdu), len(train_urdu), len(train_roman), len(test_urdu), len(test_roman))
# an example of urdu and its transliteration to roman urdu
print(train_urdu[10])
print(train_roman[10])

664293 664293 597863 597863 66430 66430
پھر وہ کلہاڑی ساتھ لے
phir woh kulhari sath le


In [7]:
# preprocess data

# convert unicode representation to ascii - function as its taken directly from website
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

train_roman = [unicode_to_ascii(sentence) for sentence in train_roman]
train_urdu = [unicode_to_ascii(sentence) for sentence in train_urdu]
test_roman = [unicode_to_ascii(sentence) for sentence in test_roman]
test_urdu = [unicode_to_ascii(sentence) for sentence in test_urdu]

# add white space between punctuation marks so that tokenization is better
train_roman = [re.sub(f'([{string.punctuation}])', r" \1 ", sentence) for sentence in train_roman]
train_roman = [re.sub(r'[" "]+', " ", sentence) for sentence in train_roman]

train_urdu = [re.sub(f'([{string.punctuation}])', r" \1 ", sentence) for sentence in train_urdu]
train_urdu = [re.sub(r'[" "]+', " ", sentence) for sentence in train_urdu]

test_roman = [re.sub(f'([{string.punctuation}])', r" \1 ", sentence) for sentence in test_roman]
test_roman = [re.sub(r'[" "]+', " ", sentence) for sentence in test_roman]

test_urdu = [re.sub(f'([{string.punctuation}])', r" \1 ", sentence) for sentence in test_urdu]
test_urdu = [re.sub(r'[" "]+', " ", sentence) for sentence in test_urdu]

# change everything that isnt alphanumeric (or alphanumeric in urdu) or isnt in string.punctuation as space
train_roman = [re.sub(r"[^a-zA-Z0-9{}]+".format(re.escape(string.punctuation)), " ", sentence) for sentence in train_roman] 
test_roman = [re.sub(r"[^a-zA-Z0-9{}]+".format(re.escape(string.punctuation)), " ", sentence) for sentence in test_roman]

urdu_alphabet = [
    '\u0627', '\u0622', '\u0628', '\u067E', '\u062A', '\u0679', '\u062B',
    '\u062C', '\u0686', '\u062D', '\u062E', '\u062F', '\u0688', '\u0630',
    '\u0631', '\u0691', '\u0632', '\u0698', '\u0633', '\u0634', '\u0635',
    '\u0636', '\u0637', '\u0638', '\u0639', '\u063A', '\u0641', '\u0642',
    '\u06A9', '\u06AF', '\u0644', '\u0645', '\u0646', '\u06BA', '\u0648',
    '\u06C1', '\u06BE', '\u0621', '\u06CC', '\u0626', '\u0624', '\u0649',
    '\u06D2', '\u0651', '\u0670'
]

urdu_digits = [
    '\u0660', '\u0661', '\u0662', '\u0663', '\u0664',
    '\u0665', '\u0666', '\u0667', '\u0668', '\u0669'
]

train_urdu = [re.sub(r"[^{}{}{}a-zA-Z0-9]+".format(''.join(urdu_alphabet), ''.join(urdu_digits), re.escape(string.punctuation)), " ", sentence) for sentence in train_urdu]
test_urdu = [re.sub(r"[^{}{}{}a-zA-Z0-9]+".format(''.join(urdu_alphabet), ''.join(urdu_digits), re.escape(string.punctuation)), " ", sentence) for sentence in test_urdu]

# remove trailing and leading whitespace
train_roman = [sentence.strip() for sentence in train_roman]
train_urdu = [sentence.strip() for sentence in train_urdu]
test_roman = [sentence.strip() for sentence in test_roman]
test_urdu = [sentence.strip() for sentence in test_urdu]

# add a start and eos token to each sentence
train_roman = ['<start> ' + sentence + ' <end>' for sentence in train_roman]
train_urdu = ['<start> ' + sentence + ' <end>' for sentence in train_urdu]
test_roman = ['<start> ' + sentence + ' <end>' for sentence in test_roman]
test_urdu = ['<start> ' + sentence + ' <end>' for sentence in test_urdu]

In [8]:
# the same example data after preprocessing
print(train_roman[10])
print(train_urdu[10])
print(type(train_roman[10]))

<start> phir woh kulhari sath le <end>
<start> پھر وہ کلہاڑی ساتھ لے <end>
<class 'str'>


In [9]:

# make buckets for the different length sentences as done in the paper: 0-10, 10-20, 20-30, 30-40 and > 40
bucket_ranges = [10, 20, 30, 40]
train_buckets = defaultdict(list)
# go over each example in train_roman, train_urdu. if the length of both is < the boundary range then add that example to that range's bucket, else move on
for i in range(len(train_roman)):
  for j, bucket_range in enumerate(bucket_ranges):
    if (len(train_roman[i].split(" ")) <= bucket_range) and (len(train_urdu[i].split(" ")) <= bucket_range):
      train_buckets[j].append((train_roman[i], train_urdu[i]))
      break
    # lastly, if u reach the end of the range and u still don't break from this inner loop then add a new bucket for length > 40 and add the example in this bucket
    if bucket_range == 40:
      train_buckets[j + 1].append((train_roman[i], train_urdu[i]))
    
# do the same for test data
test_buckets = defaultdict(list)
for i in range(len(test_roman)):
  for j, bucket_range in enumerate(bucket_ranges):
    if (len(test_roman[i].split(" ")) <= bucket_range) and (len(test_urdu[i].split(" ")) <= bucket_range):
      test_buckets[j].append((test_roman[i], test_urdu[i]))
      break
    if bucket_range == 40:
      test_buckets[j + 1].append((test_roman[i], test_urdu[i]))

In [10]:
# now each bucket is a 2d list, where each sublist contains tuples of the form (roman_urdu_example, corresponding_urdu_example)
# as seen by these examples
print(train_buckets[0][1])
print(train_buckets[0][0])

('<start> musannif Ibn abi rbika ) <end>', '<start> مصنف ابن ابی شیبہ ) <end>')
('<start> behtareen dost - 2 , 662 baar <end>', '<start> بہترین دوست - 2 , 662 بار <end>')


In [11]:
# now we make the integer sequences to pass to our models
# make tokenizer with no filter and an out of vocab token
# fit on entire train_data
# then make sequences and add padding for each bucket
# do the same for urdu
tokenizer_roman = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_roman.fit_on_texts(train_roman)
tokenizer_urdu = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_urdu.fit_on_texts(train_urdu)

buckets_train_roman = []
buckets_train_urdu = []
for i in range(len(train_buckets)):
  roman_sentences_in_bucket = [train_buckets[i][j][0] for j in range(len(train_buckets[i]))]
  urdu_sentences_in_bucket = [train_buckets[i][j][1] for j in range(len(train_buckets[i]))]
  # make integer sequences
  roman_sequences = tokenizer_roman.texts_to_sequences(roman_sentences_in_bucket) 
  # pad the sequences
  roman_sequences = pad_sequences(roman_sequences, padding='post')
  # save the sequences for this bucket
  buckets_train_roman.append(roman_sequences)
  # do the same for urdu
  urdu_sequences = tokenizer_urdu.texts_to_sequences(urdu_sentences_in_bucket) 
  urdu_sequences = pad_sequences(urdu_sequences, padding='post')
  buckets_train_urdu.append(urdu_sequences)


# do the same for test data
buckets_test_roman = []
buckets_test_urdu = []
for i in range(len(test_buckets)):
  roman_sentences_in_bucket = [test_buckets[i][j][0] for j in range(len(test_buckets[i]))]
  urdu_sentences_in_bucket = [test_buckets[i][j][1] for j in range(len(test_buckets[i]))]
  # make integer sequences
  roman_sequences = tokenizer_roman.texts_to_sequences(roman_sentences_in_bucket) 
  # pad the sequences
  roman_sequences = pad_sequences(roman_sequences, padding='post')
  # save the sequences for this bucket
  buckets_test_roman.append(roman_sequences)
  # do the same for urdu
  urdu_sequences = tokenizer_urdu.texts_to_sequences(urdu_sentences_in_bucket) 
  urdu_sequences = pad_sequences(urdu_sequences, padding='post')
  buckets_test_urdu.append(urdu_sequences)

The output of this notebook is now in four 2d lists: 
*   buckets_train_roman
*   buckets_train_urdu
*   buckets_test_roman
*   buckets_test_urdu

where each bucket is a 2d list of size 5 for each of the buckets as given in the paper: "sentence length of 0-10, 10-20, 20-30, 30-40 and > 40". Each sub-list inside the buckets contains the embeddings / sequences of all the sentences of that size. So, for example, buckets_train_roman[0] contains all padded sequences of Roman Urdu sentences that have size <= 10.

In [12]:
# example
print(buckets_train_roman[0][0])
print(buckets_train_urdu[0][0])
# as seen in the output, each sentence has been converted into a padded array of integers.
buckets_train_roman[0]

[    2   570   302    26   150    17 15545   109     3     0]
[    2   591   309    61   154    17 16362   112     3     0]


array([[    2,   570,   302, ...,   109,     3,     0],
       [    2,  1439,   339, ...,     0,     0,     0],
       [    2,  2438,   960, ...,     0,     0,     0],
       ...,
       [    2,    23,   118, ...,    14,     3,     0],
       [    2,    30,    45, ...,  1497,     3,     0],
       [    2, 12537,    79, ...,     0,     0,     0]], dtype=int32)

In [13]:
# store the tokenizers as they are needed for the models
tokenizer_roman_string = pickle.dumps(tokenizer_roman)

with open('/content/drive/MyDrive/NLP Project/tokenizer_roman.pkl', 'wb') as f:
    f.write(tokenizer_roman_string)

tokenizer_urdu_string = pickle.dumps(tokenizer_urdu)

with open('/content/drive/MyDrive/NLP Project/tokenizer_urdu.pkl', 'wb') as f:
    f.write(tokenizer_urdu_string)

In [14]:
# store the test data as they are needed later on for the predictions
with open('/content/drive/MyDrive/NLP Project/test_buckets.pickle', 'wb') as f:
    pickle.dump(test_buckets, f)

In [15]:
# store the buckets onto Gdrive to be processed by the models

# make a new directory to keep organized 
new_directory = "cleaned_data"
parent_directory = "/content/drive/MyDrive/NLP Project/"

path = os.path.join(parent_directory, new_directory)
os.makedirs(path, exist_ok=True)

# write lists to csv
with open('/content/drive/MyDrive/NLP Project/cleaned_data/buckets_train_roman.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for row in buckets_train_roman:
        writer.writerow([','.join(map(str, sublist)) for sublist in row])

with open('/content/drive/MyDrive/NLP Project/cleaned_data/buckets_train_urdu.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for row in buckets_train_urdu:
        writer.writerow([','.join(map(str, sublist)) for sublist in row])

with open('/content/drive/MyDrive/NLP Project/cleaned_data/buckets_test_roman.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for row in buckets_test_roman:
        writer.writerow([','.join(map(str, sublist)) for sublist in row])

with open('/content/drive/MyDrive/NLP Project/cleaned_data/buckets_test_urdu.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for row in buckets_test_urdu:
        writer.writerow([','.join(map(str, sublist)) for sublist in row])

print('file writing completed!')
# fin.

file writing completed!
