In [1]:
import nltk
nltk.download('rslp')

import pandas as pd
from pandas import DataFrame
import numpy as np

from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package rslp to
[nltk_data]     /Users/luisricardoferraz/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
#Count number of characters in each string
def countCharactersInEachString(df, column):
    """Return the character length of every entry found in ``df[column]``.

    Each entry is passed through ``str()`` before it is measured, so
    non-string values are measured by the length of their string form.

    Args:
        df: Object supporting ``df[column]`` and yielding an iterable
            (a pandas DataFrame in this notebook).
        column: Key of the column that holds the texts.

    Returns:
        list: One integer length per entry, in iteration order.
    """
    return [len(str(entry)) for entry in df[column]]

#Count number of words in each string
def countWordsInEachString(df, column):
    """Return the number of tokens of every entry found in ``df[column]``.

    Each entry is converted with ``str()`` and split with NLTK's
    ``word_tokenize``; the size of the resulting token list is recorded.

    Args:
        df: Object supporting ``df[column]`` and yielding an iterable
            (a pandas DataFrame in this notebook).
        column: Key of the column that holds the texts.

    Returns:
        list: One integer token count per entry, in iteration order.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

#Extract some basic statistics about synopsis
def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Format descriptive statistics of a collection of sizes as text.

    The result starts with a tab and contains one ``"<label>: <value>"``
    entry per statistic, each terminated by ``"\\n\\t"``.

    Reported statistics: arithmetic mean, median, highest value, lowest
    value, mid-range, variance and standard deviation. Variance and
    standard deviation are computed with ``ddof=1``.

    Args:
        arrayOfSizes: Sequence of numbers (anything ``np.array`` accepts).

    Returns:
        str: The formatted statistics block.
    """
    sizes = np.array(arrayOfSizes)
    highest = np.max(sizes)
    lowest = np.min(sizes)
    entries = [
        ("Arithmetic Mean", np.mean(sizes)),
        ("Median", np.median(sizes)),
        ("Highest Value", highest),
        ("Lowest Value", lowest),
        # Mid-range is the midpoint between the extremes: (max + min) / 2.
        # Subtracting instead of adding would report half of the range.
        ("Mid-range", (highest + lowest) / 2.0),
        ("Variance", np.var(sizes, ddof=1)),
        ("Standard Deviation", np.std(sizes, ddof=1)),
    ]
    basicStatistics = "\t"
    for label, value in entries:
        basicStatistics += label + ": " + str(value) + "\n\t"
    return basicStatistics

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """Format the percentiles 0.5, 1.0, ..., 100.0 of a collection of sizes.

    The result starts with the header ``"Percentiles: \\n\\t"`` followed by
    200 entries of the form ``"Percentile <q>: <value>\\n\\t"``.

    Args:
        arrayOfSizes: Sequence of numbers (anything ``np.array`` accepts).

    Returns:
        str: The formatted percentile listing.
    """
    # Convert once and reuse the array for every percentile lookup instead of
    # handing the raw input to numpy on each of the 200 iterations.
    sizes = np.array(arrayOfSizes)
    percentiles = "Percentiles: " + "\n\t"
    # Half-percent steps are derived from an integer counter (1..200 halves).
    for halfSteps in range(1, 201):
        rank = halfSteps / 2.0
        percentiles += "Percentile " + str(rank) + ": " + str(np.percentile(sizes, rank)) + "\n\t"
    return percentiles

#Extract some statistics about this Dataset
def extractStatisticsFromSynopsis(dataframe, title, column):
    """Build the full text report for one text column of a dataset.

    The report consists of the title, the dataset shape, and then, first for
    character counts and afterwards for word counts, a heading followed by
    the basic statistics block and the percentile listing.

    Args:
        dataframe: Dataset exposing ``shape`` and ``dataframe[column]``.
        title: Heading placed on the first line of the report.
        column: Name of the column whose texts are measured.

    Returns:
        str: The complete report.
    """
    report = [
        title, "\n\n",
        "Shape of Dataset: ", str(dataframe.shape[0]), " rows and ",
        str(dataframe.shape[1]), " columns", "\n\n",
    ]
    # Each section pairs its heading with the function that measures the texts.
    measurements = (
        ("Statistics of Synopsis Length (Characters):", countCharactersInEachString),
        ("Statistics of Synopsis Length (Words):", countWordsInEachString),
    )
    for heading, measure in measurements:
        report.append(heading + "\n")
        sizes = measure(dataframe, column)
        report.append(extractBasicStatisticsAboutSynopsis(sizes) + "\n")
        report.append(extractPercentilesAboutSynopsis(sizes) + "\n")
    return "".join(report)

In [3]:
# Single stemmer instance shared by every call below.
st = RSLPStemmer()
def applyStemmingOnSynopsis(text):
    """Return ``text`` with every token replaced by its RSLP stem.

    The text is split with ``word_tokenize``, each token is stemmed with the
    module-level ``st`` stemmer, and the stems are joined by single spaces.

    Args:
        text: The string to stem.

    Returns:
        str: Space-separated stems, in token order.
    """
    return " ".join(st.stem(token) for token in word_tokenize(text))

In [2]:
# Directory that holds the input CSV and receives the output CSV and log file.
# File names are appended to it by plain string concatenation in the cells
# below, so the value must end with a path separator.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filePath = '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/'

In [5]:
#Import CSV file to a DataFrame
df = pd.read_csv(filePath+'dataSetComPosTaggingDepoisDaRemocaoDeStopwords.csv')
# Drop the 'Unnamed: 0' column by keyword: passing the axis positionally
# (``df.drop(labels, 1)``) is rejected by pandas >= 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [6]:
# Create the 'stemmedSynopsis' column from 'synopsisWithoutStopwords'; the
# source column itself is left in place.
df['stemmedSynopsis'] = df['synopsisWithoutStopwords']

In [7]:
# Replace every text in 'stemmedSynopsis' by its stemmed form, row by row.
df['stemmedSynopsis'] = list(map(applyStemmingOnSynopsis, df['stemmedSynopsis']))

In [8]:
# Persist the stemmed dataset. No ``index`` argument is passed, so the row
# index is written as well — presumably the origin of the 'Unnamed: 0' column
# that is dropped when this file is read back.
df.to_csv(filePath+'dataSetDepoisDoStemming.csv')
# Write the character/word length report for the stemmed column next to the CSV.
with open(filePath+'logDataSetDepoisDoStemming.txt','w') as log:
    log.write(extractStatisticsFromSynopsis(df, "Statistics of Dataset - Stemming Applied to Synopsis", 'stemmedSynopsis'))

In [3]:
#Import CSV file to a DataFrame
df = pd.read_csv(filePath+'dataSetDepoisDoStemming.csv')
# Drop the 'Unnamed: 0' column by keyword: passing the axis positionally
# (``df.drop(labels, 1)``) is rejected by pandas >= 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# Show the stemmed synopsis of the row labelled 0 as a sanity check.
df.loc[0, 'stemmedSynopsis']

'jov garot brasil curs ingl exteri poder mei itiner passei estud torvel emoç astr ascens cinem passei intens poder mund celebr glamour ideal import capaz romanc holofot'