# **N-Gram Language Models Implementation**

For the 20N and BAC datasets, perform the processing required to build two N-Gram Language Models:



In [1]:
#I. Read the files and build two large consolidate files that are the union of all the documents in 20N and BAC.

#20Newsgroups

import xml.etree.ElementTree as ET
import re

def get_sentences_BAC(path) -> list:
    """
    Reads the BAC corpus and returns a list of sentences.
    """
    sentences = []
    lines = ""
    try:

        with open(path, encoding="utf8", errors='ignore') as f:
            lines = f.read()
        lines=lines.replace('&', '')
        lines=lines.replace('<', '')
        lines=lines.replace('>', '')
        root = ET.fromstring(lines)
        for node in root.iter('*'):
            if node.tag == 'post':
                sentences_proc = split_sentences(replace_number(normalize(node.text)))
                sentences = sentences+sentences_proc
    except Exception as e:
        print(f"{path} {str(e)}")
        
    return sentences


In [2]:
def normalize(text) -> str:
    """
    Normalizes a sentence by stem method.
    """
    from gensim.parsing.porter import PorterStemmer 

    p=PorterStemmer()
    sentences = p.stem_sentence(text)
    return sentences

In [3]:
# loop for all characters

def replace_number(text) -> str:
    """
    Replaces all numeric characters with a NUM.
    """
    sentences = re.sub(r'\d+', 'NUM', text)
    return sentences



In [4]:

def split_sentences(text) -> list:
    """
    Splits a text into sentences.
    """
    sentences = re.split(r'[.!?]', text)
    sentences = [f"<s>{s.strip()}</s>" for s in sentences if s.strip() != ""]
    return sentences

In [5]:
def get_senteces_from_path(path)->list:
    import os
    sentences = []
    for filename in sorted(os.listdir(path)):
        if filename.endswith(".xml"):
            sentences = sentences + get_sentences_BAC(path+filename)
    return sentences
    


In [6]:
path = 'Datasets/BAC/blogs/'

s = get_senteces_from_path(path)
print (len(s))

Datasets/BAC/blogs/1022086.female.17.Student.Cancer.xml not well-formed (invalid token): line 1060, column 130
Datasets/BAC/blogs/1032824.female.15.Student.Libra.xml not well-formed (invalid token): line 1150, column 833
Datasets/BAC/blogs/1046946.female.25.Arts.Virgo.xml not well-formed (invalid token): line 6348, column 267
Datasets/BAC/blogs/105748.female.26.Student.Scorpio.xml not well-formed (invalid token): line 2509, column 171
Datasets/BAC/blogs/1063313.female.16.Student.Libra.xml not well-formed (invalid token): line 96, column 1
Datasets/BAC/blogs/1070540.female.36.Technology.Scorpio.xml not well-formed (invalid token): line 381, column 1193
Datasets/BAC/blogs/108212.female.27.Student.Taurus.xml not well-formed (invalid token): line 2495, column 171
Datasets/BAC/blogs/1084668.female.15.Student.Capricorn.xml not well-formed (invalid token): line 1357, column 1015
Datasets/BAC/blogs/1089670.female.17.Student.Sagittarius.xml not well-formed (invalid token): line 88, column 572
D

In [None]:
import tarfile

tar = tarfile.open("datos/20news-19997.tar.gz", "r:gz")
for member in tar.getmembers():
     f = tar.extractfile(member)
     if f is not None:
         content = f.read()

In [None]:
#read tar gz file
def read_tar_gz(tar_gz_file):
    tar = tarfile.open(tar_gz_file, "r:gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        if f is not None:
            content = f.read()
            yield content
            f.close()
            tar.close()