In [1]:
import nltk
# Fetch the 'rslp' NLTK data package needed by RSLPStemmer (imported below).
# When the package is already installed this only reports that it is up to date.
nltk.download('rslp')

import pandas as pd
from pandas import DataFrame
import numpy as np

from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package rslp to
[nltk_data]     /Users/luisricardoferraz/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# Character length of every entry in a DataFrame column
def countCharactersInEachString(df, column):
    """Return a list with the character count of each value in ``df[column]``.

    Every value is passed through ``str()`` before being measured, so
    non-string entries are counted by the length of their text form.

    Parameters:
        df: object indexable by ``column`` that yields an iterable of values
            (a pandas DataFrame in this notebook).
        column: key of the column to measure.

    Returns:
        list of int, one length per entry, in iteration order.
    """
    return [len(str(entry)) for entry in df[column]]

# Token count of every entry in a DataFrame column
def countWordsInEachString(df, column):
    """Return a list with the number of tokens in each value of ``df[column]``.

    Every value is converted with ``str()`` and split with NLTK's
    ``word_tokenize``; the length of the resulting token list is recorded.

    Parameters:
        df: object indexable by ``column`` that yields an iterable of values
            (a pandas DataFrame in this notebook).
        column: key of the column to measure.

    Returns:
        list of int, one token count per entry, in iteration order.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

#Extract some basic statistics about synopsis
def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Build a tab-indented, multi-line text summary of a sequence of sizes.

    The summary lists, one per line: arithmetic mean, median, highest value,
    lowest value, mid-range, sample variance and sample standard deviation
    (both computed with ``ddof=1``).

    Parameters:
        arrayOfSizes: non-empty sequence of numbers (anything ``np.array``
            accepts).

    Returns:
        str starting with a tab, where every entry is followed by ``"\\n\\t"``.

    Raises:
        ValueError: if ``arrayOfSizes`` is empty (raised by ``np.max``).
    """
    sizes = np.array(arrayOfSizes)
    highest = np.max(sizes)
    lowest = np.min(sizes)
    # Mid-range is the midpoint between the extremes, (max + min) / 2.
    # It is written as lowest + half the range so the intermediate value
    # never exceeds the larger of the two extremes.
    midRange = lowest + (highest - lowest) / 2
    entries = [
        ("Arithmetic Mean", np.mean(sizes)),
        ("Median", np.median(sizes)),
        ("Highest Value", highest),
        ("Lowest Value", lowest),
        ("Mid-range", midRange),
        ("Variance", np.var(sizes, ddof=1)),
        ("Standard Deviation", np.std(sizes, ddof=1)),
    ]
    return "\t" + "".join(label + ": " + str(value) + "\n\t" for label, value in entries)

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """List the percentiles of a sequence of sizes in half-point steps.

    Produces one line per percentile rank from 0.5 up to and including 100.0
    (200 lines), each computed with ``np.percentile``.

    Parameters:
        arrayOfSizes: non-empty sequence of numbers (anything ``np.array``
            accepts).

    Returns:
        str beginning with ``"Percentiles: \\n\\t"``, where every entry is
        followed by ``"\\n\\t"``.
    """
    sizes = np.array(arrayOfSizes)
    pieces = ["Percentiles: " + "\n\t"]
    # 200 half-point steps: 0.5, 1.0, ..., 100.0 (all exactly representable).
    for halfSteps in range(1, 201):
        rank = halfSteps * 0.5
        pieces.append("Percentile " + str(rank) + ": " + str(np.percentile(sizes, rank)) + "\n\t")
    return "".join(pieces)

# Assemble the full text report for one text column of the dataset
def extractStatisticsFromSynopsis(dataframe, title, column):
    """Build the statistics report for ``dataframe[column]`` as a single string.

    The report contains the title, the dataset shape, and then two sections
    (lengths in characters, lengths in words), each with the basic statistics
    followed by the percentile listing.

    Parameters:
        dataframe: DataFrame holding the text column.
        title: heading placed on the first line of the report.
        column: name of the text column to analyse.

    Returns:
        str with the complete report.
    """
    rowCount = dataframe.shape[0]
    columnCount = dataframe.shape[1]
    sections = [
        title + "\n\n",
        "Shape of Dataset: " + str(rowCount) + " rows and " + str(columnCount) + " columns" + "\n\n",
    ]
    # Each section pairs its heading with the function that measures the column.
    measurements = (
        ("Statistics of Synopsis Length (Characters):", countCharactersInEachString),
        ("Statistics of Synopsis Length (Words):", countWordsInEachString),
    )
    for heading, measure in measurements:
        sections.append(heading + "\n")
        sizes = measure(dataframe, column)
        sections.append(extractBasicStatisticsAboutSynopsis(sizes) + "\n")
        sections.append(extractPercentilesAboutSynopsis(sizes) + "\n")
    return "".join(sections)

In [3]:
# Single stemmer instance shared by every call below
st = RSLPStemmer()
def applyStemmingOnSynopsis(text):
    """Return ``text`` with every token replaced by its RSLP stem.

    The text is split with ``word_tokenize``, each token is stemmed with the
    module-level stemmer ``st``, and the stems are joined with single spaces.

    Parameters:
        text: string to stem.

    Returns:
        str of space-separated stems.
    """
    return " ".join(st.stem(token) for token in word_tokenize(text))

In [4]:
# Directory holding the input CSV and receiving the output CSV/log.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running elsewhere. File names are appended with plain string concatenation,
# so the trailing slash must be kept.
filePath = '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/'

In [5]:
#Import CSV file to a DataFrame
df = pd.read_csv(filePath+'dataSetComPosTaggingDepoisDaRemocaoDeStopwords.csv')
# Remove the 'Unnamed: 0' column carried by the CSV. The column is selected
# with the `columns=` keyword because passing the axis positionally
# (`drop(labels, 1)`) is rejected by pandas 2.0 and later.
df = df.drop(columns=['Unnamed: 0'])

In [6]:
# Create the 'stemmedSynopsis' column as a copy of the stopword-free synopsis;
# a later cell replaces its contents with the stemmed text.
df['stemmedSynopsis'] = df['synopsisWithoutStopwords']

In [7]:
# Stem every stopword-free synopsis. The input is read from the untouched
# 'synopsisWithoutStopwords' column rather than from 'stemmedSynopsis', so
# re-running this cell never stems text that has already been stemmed.
df['stemmedSynopsis'] = [applyStemmingOnSynopsis(text) for text in df['synopsisWithoutStopwords']]

In [8]:
# Persist the stemmed dataset, then write its statistics report next to it.
df.to_csv(filePath+'dataSetDepoisDoStemming.csv')
with open(filePath+'logDataSetDepoisDoStemming.txt','w') as log:
    log.write(
        extractStatisticsFromSynopsis(
            df,
            "Statistics of Dataset - Stemming Applied to Synopsis",
            'stemmedSynopsis',
        )
    )

In [9]:
# Show the resulting DataFrame, including the new 'stemmedSynopsis' column.
df

Unnamed: 0,titulo,sinopse,synopsisWithoutStopwords,adjective_after,conjunction_after,determiner_after,noun_after,pronoun_after,adverb_after,adposition_after,verb_after,number_after,date_after,interjection_after,punctuation_after,unknown_after,total_after,stemmedSynopsis
0,Tapete Vermelho,"[{'token': 'uma', 'poss': 'DI0', 'lemma': 'um'...",jovem garota brasileira resolve aventurar curs...,6,0,0,21,0,0,0,15,0,0,0,0,0,42,jov garot brasil resolv aventur curs ingl exte...
1,Hunter,"[{'token': 'resgate', 'poss': 'NCMS', 'lemma':...",resgate primeira palavra ouvi tiros cessaram f...,6,0,0,23,0,0,0,24,0,0,0,0,0,53,resgat prim palavr ouv tir cess fer tir ouv le...
2,Cage,"[{'token': 'cage', 'poss': 'VMI', 'lemma': '<u...",substantivo cela prisão verbo prender enjaular...,5,0,0,21,0,0,0,21,0,0,0,0,0,47,substan cel pris verb prend enjaul ment fácel ...
3,Espere Por Mim,"[{'token': 'quebrado', 'poss': 'VMP', 'lemma':...",quebrado irritado abandonado sentia dias lutav...,5,0,0,19,0,0,0,27,0,0,0,0,0,51,quebr irrit abandon sent dia lut monstr viv re...
4,Vox,"[{'token': 'uma', 'poss': 'DI0', 'lemma': 'um'...",atual próxima dias luta feminina silêncio pode...,4,0,0,25,0,0,0,20,0,0,0,0,0,49,atual próx dia lut feminin silênci pod ser ens...
5,Ayra,"[{'token': 'as', 'poss': 'DA0', 'lemma': 'o'},...",histórias contos fadas crianças ouvem fizeram ...,2,0,0,17,0,0,0,23,0,0,0,0,0,42,histór cont fad crianç ouv fiz sent form desco...
6,Amor Plus Size,"[{'token': 'maitê', 'poss': 'NCFP', 'lemma': '...",passos garota linda dezessete anos quilos pass...,11,0,0,28,0,0,0,14,0,0,0,0,0,53,pass garot lind dezesset ano quil pass infânc ...
7,O Sal da Vida,"[{'token': 'existe', 'poss': 'VMI', 'lemma': '...",existe forma leveza graça simples fato existir...,12,0,0,26,0,0,0,14,0,0,0,0,0,52,exist form lev graç simpl fat exist vai ocup s...
8,Aliança,"[{'token': 'um', 'poss': 'DI0', 'lemma': 'um'}...",thriller luta secreta bem mal pessoas habilida...,9,0,0,23,0,0,0,14,0,0,0,0,0,46,thrill lut secret bem mal pesso habil espec ex...
9,A Dança dos Dragões,"[{'token': 'as', 'poss': 'DA0', 'lemma': 'o'},...",crônicas gelo fogo personagens presentes quint...,18,0,0,59,0,0,0,25,0,0,0,0,0,102,crôn gel fog person pres quint volum séri danç...
