In [3]:
#!pip install nltk
import pandas as pd
from langdetect import detect
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [5]:
class DatasetController:
    
    def clean_dataset(self, data):

        data_poemas = data.copy()
        # Elimina los poemas que no posean Tags (sean nulos)
        data_poemas = data_poemas.dropna(subset=['Tags'])

        # Genera la columna 'titulo'
        data_poemas['titulo'] = data_poemas['Title'].apply(lambda x: x.replace('\r', '').replace('\n', '').strip())

        # Genera la columna 'poema'
        data_poemas['poema'] = data_poemas['Poem'].apply(lambda x: x.replace('\r', '').replace('\n', '').strip())

        # Filtra los poemas que sean demasiado cortos (threshold > 15 caracteres)
        data_poemas = data_poemas[data_poemas['poema'].apply(lambda x: len(x) > 15)]

        # Reemplaza títulos no válidos por 'Desconocido'
        data_poemas['titulo'] = data_poemas['titulo'].apply(lambda x: 'Desconocido' if len(x) < 2 else x)

        # Genera la columna 'poeta'
        data_poemas['poeta'] = data_poemas['Poet']

        # Genera la columna 'tags'
        data_poemas['tags'] = data_poemas['Tags']

        # Genera la columna 'language'
        data_poemas['language'] = data_poemas['poema'].apply(lambda x: detect(x))

        # Selecciona las columnas deseadas
        dataset = data_poemas[['titulo', 'Poem','poema', 'poeta', 'tags', 'language']]

        return dataset
        
    def tokenize_and_clean(self, data):

        dataset = data.copy()
        # Tokenización y eliminación de stopwords
        lemmatizer = WordNetLemmatizer()
        stopwords_list = stopwords.words('english')
        stopwords_list = [i.replace("'", '') for i in stopwords_list]
        stopwords_list.extend(['yeah', 'oh', 'ah', 'uh',"'",'.','-','?','!'])

        dataset['tokens'] = dataset['poema'].apply(word_tokenize)
        dataset['tokens_sin_stopwords'] = dataset['tokens'].apply(lambda x: [item for item in x if item.lower() not in stopwords_list])
        dataset['tokens_finales'] = dataset['tokens_sin_stopwords'].apply(lambda x: [lemmatizer.lemmatize(i, pos='v') for i in x])
        dataset['texto_final'] = dataset['tokens_finales'].apply(lambda x: ' '.join([i for i in x]))
                
        return dataset

In [6]:
# Ejemplo de uso
data_poemas = pd.read_csv('PoetryFoundationData.csv')
controller = DatasetController()

In [7]:
cleaned_data = controller.clean_dataset(data_poemas)
cleaned_data.to_csv('dataPoemasCleaned.csv')

In [8]:
final_data = controller.tokenize_and_clean(cleaned_data)
final_data.to_csv('dataPoemasTokenized.csv')

In [9]:
final_data.head()

Unnamed: 0,titulo,Poem,poema,poeta,tags,language,tokens,tokens_sin_stopwords,tokens_finales,texto_final
6,Invisible Fish,\r\r\nInvisible fish swim this ghost ocean now...,Invisible fish swim this ghost ocean now descr...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A...",en,"[Invisible, fish, swim, this, ghost, ocean, no...","[Invisible, fish, swim, ghost, ocean, describe...","[Invisible, fish, swim, ghost, ocean, describe...",Invisible fish swim ghost ocean describe wave ...
7,Don’t Bother the Earth Spirit,\r\r\nDon’t bother the earth spirit who lives ...,Don’t bother the earth spirit who lives here. ...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa...",en,"[Don, ’, t, bother, the, earth, spirit, who, l...","[’, bother, earth, spirit, lives, working, sto...","[’, bother, earth, spirit, live, work, story, ...",’ bother earth spirit live work story oldest s...
9,"[""Hour in which I consider hydrangea""]","\r\r\nHour in which I consider hydrangea, a sa...","Hour in which I consider hydrangea, a salt or ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre...",en,"[Hour, in, which, I, consider, hydrangea, ,, a...","[Hour, consider, hydrangea, ,, salt, sand, pla...","[Hour, consider, hydrangea, ,, salt, sand, pla...","Hour consider hydrangea , salt sand plant , va..."
16,scars,\r\r\nmy father’s body is a map\r\r\na record ...,my father’s body is a mapa record of his journ...,Truong Tran,"The Body,Family & Ancestors",en,"[my, father, ’, s, body, is, a, mapa, record, ...","[father, ’, body, mapa, record, journey, carri...","[father, ’, body, mapa, record, journey, carry...",father ’ body mapa record journey carry bullet...
17,what remains two,\r\r\nit has long been forgotten this practice...,it has long been forgotten this practice of th...,Truong Tran,"Infancy,Parenthood,The Body",en,"[it, has, long, been, forgotten, this, practic...","[long, forgotten, practice, motherweaning, chi...","[long, forget, practice, motherweaning, child,...",long forget practice motherweaning child crush...


In [1]:
import pandas as pd

data = pd.read_csv('../data/dataPoemasTokenized.csv')

In [5]:
data.Poem[0]

'\r\r\nInvisible fish swim this ghost ocean now described by waves of sand, by water-worn rock. Soon the fish will learn to walk. Then humans will come ashore and paint dreams on the dying stone. Then later, much later, the ocean floor will be punctuated by Chevy trucks, carrying the dreamers’ decendants, who are going to the store.\r\r\n'

In [8]:
len(data.poeta.unique())

2890