# Data Preprocessing

This notebook contains the code snippets we wrote for preprocessing our corpora before training our models.

### Fixing words without spaces (Urdu only)

In [None]:
def get_words(filename):
    """Return every line of filename as a list of strings.

    The file is decoded as UTF-8 and bytes that cannot be decoded are
    dropped. Each entry keeps its trailing line ending; callers strip it.
    """
    with open(filename, encoding="utf-8", errors='ignore') as handle:
        return handle.readlines()

def get_words_without_spaces():
    """Return the space-less entries from the three word-list files.

    The files are read in order (1, 2, 3) and each line is stripped of
    leading and trailing whitespace.
    """
    sources = (
        "urdu_words_without_spaces_1.txt",
        "urdu_words_without_spaces_2.txt",
        "urdu_words_without_spaces_3.txt",
    )

    entries = []
    for source in sources:
        entries.extend(raw.strip() for raw in get_words(source))

    return entries

def get_words_with_spaces():
    """Return the spaced entries from the three word-list files.

    The files are read in order (1, 2, 3) and each line is stripped of
    leading and trailing whitespace.
    """
    sources = (
        "urdu_words_with_spaces_1.txt",
        "urdu_words_with_spaces_2.txt",
        "urdu_words_with_spaces_3.txt",
    )

    return [raw.strip() for source in sources for raw in get_words(source)]

def fix_lines(input_file, output_file, words_without_spaces, words_with_spaces):
    """Copy input_file to output_file, replacing space-less words.

    Every occurrence of ``words_without_spaces[k]`` in a line is replaced by
    ``words_with_spaces[k]``. Pairs are applied in list order, so a later
    pair sees the result of the earlier ones. Lines without a match are
    copied unchanged. Both files are handled as UTF-8 and undecodable bytes
    are dropped.

    Empty search strings are skipped: an empty string is contained in every
    line, and replacing it would insert the replacement between all
    characters.

    Progress is printed every 100000 lines, and a summary of the number of
    replacements is printed at the end.

    Args:
        input_file: Path of the text file to read line by line.
        output_file: Path of the file to write; overwritten if it exists.
        words_without_spaces: Sequence of substrings to search for.
        words_with_spaces: Sequence of replacements, parallel to
            ``words_without_spaces``.

    Raises:
        ValueError: If there are more search strings than replacements.
    """
    # Check before opening anything so a bad word list cannot leave a
    # half-written output file behind.
    if len(words_without_spaces) > len(words_with_spaces):
        raise ValueError(
            "{} words without spaces but only {} replacements".format(
                len(words_without_spaces), len(words_with_spaces)))

    replacements = [
        (joined, spaced)
        for joined, spaced in zip(words_without_spaces, words_with_spaces)
        if joined
    ]

    words_modified = 0
    lines_modified = 0

    with open(input_file, encoding="utf-8", errors='ignore') as i_f, \
            open(output_file, "w", encoding="utf-8", errors='ignore') as o_f:
        for i, sentence in enumerate(i_f):
            line_changed = False
            for joined, spaced in replacements:
                occurrences = sentence.count(joined)
                if occurrences:
                    sentence = sentence.replace(joined, spaced)
                    words_modified += occurrences
                    line_changed = True
            o_f.write(sentence)

            if line_changed:
                lines_modified += 1

            if i % 100000 == 0:
                print("Processed {}00K lines...".format(i // 100000))

    print("Replaced {} words in {} lines.".format(words_modified, lines_modified))

In [None]:
# Parallel lists: the entry at position k of words_without_spaces is replaced
# by the entry at position k of words_with_spaces inside fix_lines.
words_without_spaces = get_words_without_spaces()
words_with_spaces = get_words_with_spaces()

In [None]:
# Rewrite urdu.txt into urdu_fixed.txt with the space-less words replaced.
fix_lines("urdu.txt", "urdu_fixed.txt", words_without_spaces, words_with_spaces)

### Removing punctuation marks

In [None]:
# Corpus to clean and the final file produced by the steps below.
input_file = 'roman.txt'
output_file = 'roman_preprocessed.txt'

# Intermediate files handed from one step to the next:
#   temp_file_a - punctuation removed
#   temp_file_b - digits removed as well
#   temp_file_c - whitespace collapsed (written only when 'roman' is in input_file)
temp_file_a = 'temp_a.txt'
temp_file_b = 'temp_b.txt'
temp_file_c = 'temp_c.txt'

In [None]:
import string

# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only). Built once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

# Copy input_file to temp_file_a with those characters removed.
with open(input_file, encoding='utf-8', errors='ignore') as i_f, \
        open(temp_file_a, mode='w', encoding='utf-8', errors='ignore') as o_f:
    for sentence in i_f:
        o_f.write(sentence.translate(punctuation_table))

### Removing digits

In [None]:
# Copy temp_file_a to temp_file_b, dropping every character for which
# str.isdigit() is true.
with open(temp_file_a, encoding='utf-8', errors='ignore') as i_f, \
        open(temp_file_b, 'w', encoding='utf-8', errors='ignore') as o_f:
    for sentence in i_f:
        o_f.write("".join(ch for ch in sentence if not ch.isdigit()))

### Replacing runs of whitespace with a single space

In [None]:
# Roman Urdu still has to be lowercased afterwards, so it goes to a temp
# file; any other corpus is finished after this step and goes straight to
# the output file.
destination = temp_file_c if 'roman' in input_file else output_file

with open(temp_file_b, encoding='utf-8', errors='ignore') as i_f, \
        open(destination, 'w', encoding='utf-8', errors='ignore') as o_f:
    for sentence in i_f:
        # split() with no argument discards leading/trailing whitespace and
        # treats any run of whitespace as one separator.
        words = sentence.split()
        # A line without any words produces no output line at all.
        if words:
            o_f.write(' '.join(words) + '\n')

### Converting all characters to lowercase (Roman Urdu only)

In [None]:
# temp_file_c is only written by the whitespace step when 'roman' is in
# input_file, so this step is limited to the same case; for other corpora
# output_file has already been written and must not be overwritten.
if 'roman' in input_file:
    # Explicit UTF-8, matching every other step, instead of the platform
    # default encoding.
    with open(temp_file_c, encoding='utf-8', errors='ignore') as i_f, \
            open(output_file, 'w', encoding='utf-8', errors='ignore') as o_f:
        for sentence in i_f:
            o_f.write(sentence.lower())