In [1]:
import os
import re
from unicodedata import normalize

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory holding the input/output CSV and log files. It can be overridden
# through the BOOK_SIMILARITY_DATA_DIR environment variable so the notebook
# also runs on machines other than the original author's; the default is the
# original hard-coded location. os.path.join(..., '') guarantees the trailing
# separator that the `filePath + 'name.csv'` concatenations in later cells
# rely on.
filePath = os.path.join(
    os.environ.get(
        'BOOK_SIMILARITY_DATA_DIR',
        '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/',
    ),
    '',
)

In [3]:
# Load the stemmed-synopsis dataset into a DataFrame.
df = pd.read_csv(filePath+'dataSetDepoisDoStemming.csv')
# Remove the 'Unnamed: 0' column (presumably an index written by an earlier
# to_csv() call - confirm against the producer of this file). `columns=`
# replaces the positional axis argument (`, 1`), which was deprecated in
# pandas 1.3 and is rejected by pandas 2.x.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def remover_acentos(txt):
    """Strip accents from ``txt`` and return a pure-ASCII string.

    The text is NFKD-normalized, then encoded to ASCII with the ``'ignore'``
    error handler, which silently drops every code point that has no ASCII
    encoding, and finally decoded back to ``str``.
    """
    decomposed = normalize('NFKD', txt)
    asciiBytes = decomposed.encode('ASCII', 'ignore')
    return asciiBytes.decode('ASCII')

In [5]:
# Replace every stemmed synopsis with its accent-free ASCII form.
df['stemmedSynopsis'] = list(map(remover_acentos, df['stemmedSynopsis']))

In [6]:
def countCharactersInEachString(df, column):
    """Return a list with the character count of every entry in ``df[column]``.

    Each entry is converted with ``str()`` before being measured, so a
    non-string value contributes the length of its string representation.
    """
    return [len(str(entry)) for entry in df[column]]

def countWordsInEachString(df, column):
    """Return a list with the token count of every entry in ``df[column]``.

    Each entry is converted with ``str()`` and split into tokens by NLTK's
    ``word_tokenize``; the number of resulting tokens is recorded.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Build a tab-indented, multi-line text summary of a sequence of sizes.

    Parameters
    ----------
    arrayOfSizes : sequence of numbers
        Sizes to summarise (e.g. characters or words per synopsis).

    Returns
    -------
    str
        A string that starts with a tab and holds one ``"Label: value"`` entry
        per statistic, each followed by ``"\\n\\t"``: arithmetic mean, median,
        highest value, lowest value, mid-range, sample variance and sample
        standard deviation (both with ``ddof=1``).

    Raises
    ------
    ValueError
        If ``arrayOfSizes`` is empty (raised by ``np.min``/``np.max``).
    """
    sizes = np.array(arrayOfSizes)
    lowest = np.min(sizes)
    highest = np.max(sizes)
    entries = [
        ("Arithmetic Mean", np.mean(sizes)),
        ("Median", np.median(sizes)),
        ("Highest Value", highest),
        ("Lowest Value", lowest),
        # Mid-range is the midpoint between the extremes, (max + min) / 2.
        # The previous (max - min) / 2 reported half of the range instead.
        ("Mid-range", (highest + lowest) / 2),
        # ddof=1 gives the sample (not population) variance / deviation;
        # with a single observation both are NaN.
        ("Variance", np.var(sizes, ddof=1)),
        ("Standard Deviation", np.std(sizes, ddof=1)),
    ]
    return "\t" + "".join(label + ": " + str(value) + "\n\t" for label, value in entries)

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """Return a text listing of the percentiles 0.5, 1.0, ..., 100.0.

    The result starts with the header ``"Percentiles: \\n\\t"`` followed by
    200 entries of the form ``"Percentile <q>: <value>\\n\\t"``, where
    ``<value>`` is ``np.percentile`` of the sizes at ``q``.
    """
    sizes = np.array(arrayOfSizes)
    entries = []
    # Half-steps 1..200 map to q = 0.5 .. 100.0; every value is exactly
    # representable as a float, so the labels match a running 0.5 increment.
    for halfStep in range(1, 201):
        q = halfStep / 2
        entries.append("Percentile " + str(q) + ": " + str(np.percentile(sizes, q)) + "\n\t")
    return "Percentiles: " + "\n\t" + "".join(entries)

def extractStatisticsFromSynopsis(dataframe, title, column):
    """Assemble a plain-text statistics report for one text column.

    The report contains, in order: ``title``, the dataset shape (rows and
    columns), the basic statistics and percentiles of the per-entry character
    counts of ``dataframe[column]``, and the same two sections for the
    per-entry word counts.
    """
    rowCount, columnCount = dataframe.shape[0], dataframe.shape[1]
    charactersCount = countCharactersInEachString(dataframe, column)
    wordsCount = countWordsInEachString(dataframe, column)
    sections = [
        title + "\n\n",
        "Shape of Dataset: " + str(rowCount) + " rows and " + str(columnCount) + " columns" + "\n\n",
        "Statistics of Synopsis Length (Characters):" + "\n",
        extractBasicStatisticsAboutSynopsis(charactersCount) + "\n",
        extractPercentilesAboutSynopsis(charactersCount) + "\n",
        "Statistics of Synopsis Length (Words):" + "\n",
        extractBasicStatisticsAboutSynopsis(wordsCount) + "\n",
        extractPercentilesAboutSynopsis(wordsCount) + "\n",
    ]
    return "".join(sections)

In [7]:
def removeNumbersFromSynopsis(text):
    """Return ``text`` with every run of the digits 0-9 deleted.

    Only the digits are removed; surrounding whitespace is left untouched.
    """
    digitRuns = re.compile('[0-9]+')
    return digitRuns.sub('', text)

# Strip the digits out of every stemmed synopsis.
df['stemmedSynopsis'] = list(map(removeNumbersFromSynopsis, df['stemmedSynopsis']))

In [8]:
# Persist the accent-free, digit-free dataset. The DataFrame index is written
# as an unnamed first column.
df.to_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# Build the report before opening the log file, so that a failure while
# computing the statistics does not leave behind an empty, truncated log.
statisticsReport = extractStatisticsFromSynopsis(df, "Statistics of Dataset - Stemmed Synopsis without accents", 'stemmedSynopsis')
with open(filePath+'logDataSetDepoisDoStemmingERemocaoDeAcentos.txt','w') as log:
    log.write(statisticsReport)


# Reload the dataset from disk so the following cells work on what is stored.
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# The index written by to_csv() comes back as 'Unnamed: 0'; remove it.
# `columns=` replaces the positional axis argument (`, 1`), which was
# deprecated in pandas 1.3 and is rejected by pandas 2.x.
df = df.drop(columns=['Unnamed: 0'])

In [9]:
# Load the accent-free dataset from disk (same file and clean-up as the
# reload at the end of the previous cell).
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# The index written by to_csv() comes back as 'Unnamed: 0'; remove it.
# `columns=` replaces the positional axis argument (`, 1`), which was
# deprecated in pandas 1.3 and is rejected by pandas 2.x.
df = df.drop(columns=['Unnamed: 0'])

In [10]:
# Documents fed to the TF-IDF vectorizer: one stemmed synopsis per row.
# A synopsis that is empty in the CSV is read back as NaN, which
# TfidfVectorizer rejects as an invalid document, so it is replaced by an
# empty string. The former `corpus = []` was immediately overwritten and has
# been dropped.
corpus = df['stemmedSynopsis'].fillna('')

In [11]:
# Display the synopsis stored under index label 0 of the corpus.
corpus[0]

'jov garot brasil curs ingl exteri poder mei itiner passei estud torvel emoc astr ascens cinem passei intens poder mund celebr glamour ideal import capaz romanc holofot'

In [12]:
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the corpus and transform the corpus in the same call.
vector = vectorizer.fit_transform(corpus)

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()
# Dense copy of the TF-IDF matrix, with the vocabulary terms as column names.
vector_df = pd.DataFrame(vector.toarray())
vector_df.columns = featureNames
# Attach the 'titulo' column from df as the last column (aligned on index).
# NOTE(review): if the vocabulary itself contains the term 'titulo', this
# assignment overwrites that feature column - confirm it cannot occur.
vector_df['titulo'] = df['titulo']

In [14]:
def fixColumnsOrder(dataframe):
    """Return ``dataframe`` with its last column moved to the first position.

    All other columns keep their relative order. The result comes from
    column selection, so the caller's ``dataframe`` object is not reordered.
    """
    names = dataframe.columns.tolist()
    rotated = [*names[-1:], *names[:-1]]
    return dataframe[rotated]

In [15]:
# Move 'titulo' (appended as the last column) to the front. The guard keeps
# this cell idempotent: re-running it would otherwise rotate the columns again
# and push a vocabulary term ahead of 'titulo'.
if vector_df.columns[0] != 'titulo':
    vector_df = fixColumnsOrder(vector_df)

In [16]:
# Display the TF-IDF table, with 'titulo' as the first column.
vector_df

Unnamed: 0,titulo,aa,aal,aba,abacax,abad,abajur,abal,abandon,abastec,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,tapete vermelho,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hunter,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,cage,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,espere por mim,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,vox,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,ayra,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,amor plus size,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,o sal da vida,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,aliança,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,a dança dos dragões,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Save the TF-IDF table to CSV; the index is written as an unnamed first column.
vector_df.to_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [18]:
# Reload the TF-IDF table from the CSV written in the previous cell.
vector_df = pd.read_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [19]:
# Discard the saved index, which read_csv() brings back as 'Unnamed: 0'.
# `columns=` replaces the positional axis argument (`, 1`), which was
# deprecated in pandas 1.3 and is rejected by pandas 2.x.
vector_df = vector_df.drop(columns=['Unnamed: 0'])

In [20]:
# Remove the 'titulo' text column so only the TF-IDF weights remain for the
# similarity computation. `columns=` replaces the positional axis argument
# (`, 1`), which was deprecated in pandas 1.3 and is rejected by pandas 2.x.
vector_df = vector_df.drop(columns=['titulo'])

In [21]:
# Display the table after removing the 'Unnamed: 0' and 'titulo' columns.
vector_df

Unnamed: 0,aa,aal,aba,abacax,abad,abajur,abal,abandon,abastec,abat,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Pairwise cosine similarity between all rows of vector_df.
# NOTE: despite the variable name, the values are similarities, not distances.
matrixOfDistances = pd.DataFrame(cosine_similarity(vector_df))

In [23]:
# Save the pairwise similarity matrix to CSV (index included).
matrixOfDistances.to_csv(filePath+'MatrixOfDistances.csv')

In [24]:
# Display the pairwise cosine-similarity matrix.
matrixOfDistances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9326,9327,9328,9329,9330,9331,9332,9333,9334,9335
0,1.000000,0.016166,0.000000,0.029927,0.000000,0.000000,0.057043,0.054272,0.000000,0.020005,...,0.006823,0.035032,0.024194,0.011626,0.007328,0.000000,0.000000,0.000000,0.102140,0.027999
1,0.016166,1.000000,0.041245,0.097536,0.191505,0.057871,0.004360,0.025773,0.008949,0.009384,...,0.044535,0.016377,0.050074,0.005046,0.061558,0.025594,0.037516,0.024572,0.017525,0.058462
2,0.000000,0.041245,1.000000,0.044834,0.013037,0.018006,0.000000,0.000000,0.035670,0.008074,...,0.013857,0.000000,0.000000,0.000000,0.000000,0.014923,0.000000,0.000000,0.030673,0.007282
3,0.029927,0.097536,0.044834,1.000000,0.114226,0.094417,0.029042,0.052260,0.089068,0.082913,...,0.044248,0.000000,0.034923,0.009652,0.143218,0.023426,0.029278,0.019016,0.017895,0.117815
4,0.000000,0.191505,0.013037,0.114226,1.000000,0.019075,0.014923,0.000000,0.056078,0.000000,...,0.014679,0.000000,0.030182,0.098689,0.044010,0.010145,0.042673,0.000000,0.000000,0.092772
5,0.000000,0.057871,0.018006,0.094417,0.019075,1.000000,0.007760,0.035522,0.000000,0.065751,...,0.078851,0.000000,0.032725,0.000000,0.038136,0.000000,0.155736,0.000000,0.000000,0.057789
6,0.057043,0.004360,0.000000,0.029042,0.014923,0.007760,1.000000,0.029006,0.025869,0.053880,...,0.006540,0.000000,0.000000,0.021921,0.045698,0.019316,0.028905,0.040230,0.057215,0.076425
7,0.054272,0.025773,0.000000,0.052260,0.000000,0.035522,0.029006,1.000000,0.024087,0.069881,...,0.000000,0.102037,0.056106,0.038147,0.036247,0.034049,0.042904,0.022044,0.056681,0.082188
8,0.000000,0.008949,0.035670,0.089068,0.056078,0.000000,0.025869,0.024087,1.000000,0.077586,...,0.000000,0.024224,0.000000,0.000000,0.010734,0.054560,0.009857,0.000000,0.023675,0.056351
9,0.020005,0.009384,0.008074,0.082913,0.000000,0.065751,0.053880,0.069881,0.077586,1.000000,...,0.000000,0.011070,0.010953,0.042199,0.054441,0.093769,0.022035,0.043585,0.009886,0.078897
