# **N-Gram Language Models Implementation**

For the 20N and BAC datasets, perform the processing required to build two N-Gram Language Models:



In [1]:
# I. Read the files and build two large consolidated files that are the union of all the documents in 20N and BAC.


import xml.etree.ElementTree as ET
import re

def get_sentences_BAC(path) -> list:
    """
    Read one BAC file and return its processed sentences.

    Lines that begin with a tag (``<...>``) are skipped. Every other line is
    passed through ``normalize``, ``replace_number`` and ``split_sentences``,
    and the resulting sentences are collected in file order.

    Args:
        path: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are dropped.

    Returns:
        The list of sentences extracted from the file. If reading or
        processing fails, the error is printed together with *path* and
        whatever was collected before the failure is returned (an empty
        list if nothing was).
    """
    sentences = []
    try:
        with open(path, encoding="utf8", errors='ignore') as f:
            lines = f.readlines()
        for line in lines:
            # Lines that start with a tag are treated as markup, not text.
            if not re.match(r'<.*>', line):
                # extend() grows the list in place; rebuilding it with "+"
                # for every line made the whole loop quadratic.
                sentences.extend(
                    split_sentences(replace_number(normalize(line)))
                )
    except Exception as e:
        # Best effort: a single bad file must not abort the corpus build.
        print(f"{path} {str(e)}")

    return sentences


In [2]:
def normalize(text) -> str:
    """
    Return *text* after running it through gensim's Porter stemmer.
    """
    from gensim.parsing.porter import PorterStemmer

    stemmer = PorterStemmer()
    return stemmer.stem_sentence(text)

In [3]:
# Replace every run of digits with the NUM placeholder

def replace_number(text) -> str:
    """
    Substitute each run of consecutive digits in *text* with ``NUM``.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('NUM', text)



In [4]:

def split_sentences(text) -> list:
    """
    Break *text* at ``.``, ``!`` and ``?`` and wrap each piece in <s>...</s>.

    Pieces are stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped.
    """
    pieces = (piece.strip() for piece in re.split(r'[.!?]', text))
    return [f"<s>{piece}</s>" for piece in pieces if piece]

In [5]:
def get_senteces_from_path(path)->list:
    """
    Collect the sentences of every ``.xml`` file found directly in *path*.

    Files are visited in sorted name order and each one is processed with
    ``get_sentences_BAC``.

    Args:
        path: Directory containing the ``.xml`` files, with or without a
            trailing separator.

    Returns:
        The sentences of all files, concatenated in file order.
    """
    import os

    sentences = []
    for filename in sorted(os.listdir(path)):
        if filename.endswith(".xml"):
            # os.path.join works whether or not *path* ends with a separator
            # (plain "+" built a wrong file name without one), and extend()
            # avoids re-copying the growing list for every file.
            sentences.extend(get_sentences_BAC(os.path.join(path, filename)))
    return sentences
    


In [None]:
# Directory that holds the BAC files to process.
path = 'Datasets/BAC/blogs/'

# Build the BAC sentence list and print how many sentences it contains.
s = get_senteces_from_path(path)
print (len(s))

In [None]:
# Save the BAC sentence list to a file, one sentence per line.
# The corpus was decoded as UTF-8, so it is written back as UTF-8 explicitly:
# the platform default encoding may be unable to represent some characters.
with open('salida/corpus_BAC.txt', 'w', encoding='utf8') as f:
    f.writelines(f"{item}\n" for item in s)

In [6]:
def get_sentences_N20(path) -> list:
    """
    Read one 20N file and return its processed sentences.

    Every line is passed through ``normalize``, ``replace_number`` and
    ``split_sentences``, and the resulting sentences are collected in file
    order.

    Args:
        path: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are dropped.

    Returns:
        The list of sentences extracted from the file. If reading or
        processing fails, the error is printed together with *path* and
        whatever was collected before the failure is returned (an empty
        list if nothing was).
    """
    sentences = []
    try:
        with open(path, encoding="utf8", errors='ignore') as f:
            lines = f.readlines()
        for line in lines:
            # extend() grows the list in place; rebuilding it with "+" for
            # every line made the whole loop quadratic.
            sentences.extend(
                split_sentences(replace_number(normalize(line)))
            )
    except Exception as e:
        # Best effort: a single bad file must not abort the corpus build.
        print(f"{path} {str(e)}")

    return sentences

In [7]:
# generate corpus for 20N
# Root directory of the 20N corpus; its sub-directories are walked for document files.
path = 'Datasets/20news-18828/'


def get_senteces_from_path_20N(path)->list:
    """
    Collect the sentences of every file in every sub-directory of *path*.

    Sub-directories and the files inside them are visited in sorted name
    order; each file is processed with ``get_sentences_N20``. Entries whose
    name starts with ``.`` and entries that are not directories are skipped.

    Args:
        path: Root directory of the corpus, with or without a trailing
            separator.

    Returns:
        The sentences of all files, concatenated in visiting order.
    """
    import os

    sentences = []
    for entry in sorted(os.listdir(path)):
        if entry.startswith('.'):
            continue
        group_dir = os.path.join(path, entry)
        # A stray regular file in the root would make os.listdir() raise.
        if not os.path.isdir(group_dir):
            continue
        for filename in sorted(os.listdir(group_dir)):
            # extend() avoids re-copying the growing list for every file.
            sentences.extend(
                get_sentences_N20(os.path.join(group_dir, filename))
            )
    return sentences
    


# Build the 20N sentence list; the bare expression below shows its length as the cell output.
s_20N = get_senteces_from_path_20N(path)   
len(s_20N)

1046621

In [8]:
# Save the 20N sentence list to a file, one sentence per line.
# The corpus was decoded as UTF-8, so it is written back as UTF-8 explicitly:
# the platform default encoding may be unable to represent some characters.
with open('salida/corpus_20N.txt', 'w', encoding='utf8') as f:
    f.writelines(f"{item}\n" for item in s_20N)