In [1]:
import re
import string
import operator
import functools
import glob
import os
import matplotlib.colors as mc
import nltk
import numpy as np
import textstat
import unicodedata
import pandas as pd

import textstat
from lexicalrichness import LexicalRichness
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag, map_tag
from nltk.tokenize import sent_tokenize

 # Feature Selection
from sklearn.feature_selection import VarianceThreshold

In [2]:
# The screen is cleaned
class ClearScreen():
    
    def __init__(self):
    
        self.clear = os.system('cls' if os.name=='nt' else 'clear')

In [3]:
def countFiles(path, extension="*.txt"): 
    
    # Construir el patrón de búsqueda
    patron = os.path.join(path, extension)
    
    # Obtener la lista de archivos que coinciden con el patrón
    archivos = glob.glob(patron)
    
    return len(archivos) 

In [4]:
class Segmentation:
    
    def __init__(self, text):
        
        self.text = text
        
    def sentSegmentation(self):
        
        return sent_tokenize(self.text)
    
    def paraSegmentation(self):
        
        texto = self.text.split("\n\n")
        
        return list(filter(bool, texto))      

In [5]:
file = open(os.path.join(os.getcwd(), "corpus", "suspicious-document00013.txt"), encoding= "utf-8-sig")

text = file.read()
        
file.close()

In [6]:
Seg = Segmentation(text)

sentSeg = Seg.sentSegmentation()

paraSeg = Seg.paraSegmentation()

In [7]:
print(len(sentSeg))
print(len(paraSeg))

65
11


In [8]:
for i, sentence in enumerate(sentSeg):
    print(f"Oracion: {i} --> Texto: {sentence}")

Oracion: 0 --> Texto: It took the man who could shoot Phantom Falls to find out, "Its bite is not severe, nor is
it ordinarily poisonous.
Oracion: 1 --> Texto: There may be an occasional exception to this rule; but beside the
bite of the mosquito, it is comparatively mild and harmless."
Oracion: 2 --> Texto: And again: "Gnats...in my way
of thinking, are much worse than the black fly or mosquito."
Oracion: 3 --> Texto: So says Murray.
Oracion: 4 --> Texto: Our observations
differ.
Oracion: 5 --> Texto: A thousand mosquitoes and as many gnats can bite me without leaving a mark, or having
any effect save the pain of the bite while they are at work.
Oracion: 6 --> Texto: But each bite of the black fly
makes a separate and distinct boil, that will not heal and be well in two months.
Oracion: 7 --> Texto: While fishing for brook trout in July last, I ran into a swarm of them on Moose River and got
badly bitten.
Oracion: 8 --> Texto: I had carelessly left my medicine behind.
Oracion: 9 --> T

In [9]:
for j, paragraph in enumerate(paraSeg):
    print(f"Parrafo: {j} --> Texto: {paragraph}")

Parrafo: 0 --> Texto: It took the man who could shoot Phantom Falls to find out, "Its bite is not severe, nor is
it ordinarily poisonous. There may be an occasional exception to this rule; but beside the
bite of the mosquito, it is comparatively mild and harmless." And again: "Gnats...in my way
of thinking, are much worse than the black fly or mosquito." So says Murray. Our observations
differ. A thousand mosquitoes and as many gnats can bite me without leaving a mark, or having
any effect save the pain of the bite while they are at work. But each bite of the black fly
makes a separate and distinct boil, that will not heal and be well in two months.
Parrafo: 1 --> Texto: While fishing for brook trout in July last, I ran into a swarm of them on Moose River and got
badly bitten. I had carelessly left my medicine behind. On the first of October the bites had
not ceased to be painful, and it was three months before they disappeared entirely. Frank Forester
says, in his Fish and Fishing, pa

In [10]:
def getfleshReadingEase(text):
    
    fleshReadingEase = 0.0
    
    fleshReadingEase = textstat.flesch_reading_ease(text)
    
    return fleshReadingEase


def gettypeToken(text):    
    tam = len(text)
    if tam <= 0:        
        text = 0    
    else:
        try:
            text = LexicalRichness(text)    
            text = text.ttr
        except ZeroDivisionError:
            text = 0    
    return text

In [11]:
datos = []

for i, value in enumerate(sentSeg):
    
    datos.append([i, value, getfleshReadingEase(value), gettypeToken(value)])

In [12]:
df = pd.DataFrame(datos, columns=["index", "text", "fleshReadingEase", "typeToken"])

In [13]:
df

Unnamed: 0,index,text,fleshReadingEase,typeToken
0,0,It took the man who could shoot Phantom Falls ...,74.53,0.909091
1,1,There may be an occasional exception to this r...,49.15,0.954545
2,2,"And again: ""Gnats...in my way\nof thinking, ar...",97.20,1.000000
3,3,So says Murray.,93.81,1.000000
4,4,Our observations\ndiffer.,9.21,1.000000
...,...,...,...,...
60,60,I suppose I have camped fifty times with peopl...,76.90,0.892857
61,61,Boots are the\nmost common resort.,90.77,1.000000
62,62,"But, when you place a boot-leg--or two of them...",81.97,0.956522
63,63,Just why it never occurs to people that a\nstu...,55.92,0.968750


In [14]:
data = []

for j, value in enumerate(paraSeg):
    
    data.append([i, value, getfleshReadingEase(value), gettypeToken(value)])

In [15]:
df2 = pd.DataFrame(data, columns=["index", "text", "fleshReadingEase", "typeToken"])

df2

Unnamed: 0,index,text,fleshReadingEase,typeToken
0,64,It took the man who could shoot Phantom Falls ...,81.83,0.714286
1,64,"While fishing for brook trout in July last, I ...",73.0,0.795699
2,64,"""Adirondack Murray"" gives extended directions ...",64.38,0.835052
3,64,It was published in Forest and Stream in the s...,76.35,0.594059
4,64,I have given some space to the insect question...,73.21,0.785714
5,64,And just here I will briefly tell how a young ...,87.05,0.888889
6,64,"For months, whenever we met, he would introduc...",86.03,0.715789
7,64,We went in a party of five--two old hunters an...,68.7,0.705882
8,64,I have often had occasion to observe that stub...,83.96,0.846154
9,64,"On the whole, Jean and the other youngsters be...",75.64,0.680952


In [None]:
#