In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

from nltk.tokenize import word_tokenize
import re

from unicodedata import normalize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Base directory for every CSV/log file this notebook reads or writes.
# File names are appended with plain string concatenation (filePath + name),
# so the trailing slash is required.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
filePath = '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/'

In [3]:
#Import CSV file to a DataFrame
# Load the stemmed-synopsis dataset.
df = pd.read_csv(filePath+'dataSetDepoisDoStemming.csv')
# Discard the 'Unnamed: 0' column (presumably an index saved by an earlier
# to_csv call - verify). `columns=` replaces the positional axis argument
# (drop([...], 1)), which pandas deprecated in 1.3 and removed in 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def remover_acentos(txt):
    """Strip accents from ``txt`` and return a pure-ASCII string.

    The text is NFKD-normalised, which splits accented letters into a base
    letter plus combining marks. Encoding to ASCII with ``'ignore'`` then
    discards every code point that is not ASCII (the combining marks, and
    any character with no ASCII decomposition).
    """
    decomposed = normalize('NFKD', txt)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

In [5]:
# Replace every stemmed synopsis with its accent-free version.
df['stemmedSynopsis'] = list(map(remover_acentos, df['stemmedSynopsis']))

In [3]:
#Count number of characters in each string
def countCharactersInEachString(df, column):
    """Return a list with the character count of each entry of ``df[column]``.

    Every entry is passed through ``str()`` before being measured, so a
    non-string value is counted by the length of its string representation.
    """
    return [len(str(entry)) for entry in df[column]]

#Count number of words in each string
def countWordsInEachString(df, column):
    """Return a list with the token count of each entry of ``df[column]``.

    Every entry is converted with ``str()`` and split with NLTK's
    ``word_tokenize``; the count is the number of tokens it returns.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

#Extract some basic statistics about synopsis
def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Format summary statistics of ``arrayOfSizes`` as a tab-indented text block.

    Parameters
    ----------
    arrayOfSizes : sequence of numbers
        Sizes to summarise (for example character or word counts).

    Returns
    -------
    str
        One "<label>: <value>" entry per statistic, each followed by "\\n\\t".
        Covers mean, median, maximum, minimum, mid-range, sample variance and
        sample standard deviation (both with ``ddof=1``).

    Raises
    ------
    ValueError
        From ``np.max`` / ``np.min`` when ``arrayOfSizes`` is empty.
    """
    synopsisLength = np.array(arrayOfSizes)
    highest = np.max(synopsisLength)
    lowest = np.min(synopsisLength)
    basicStatistics = "\t"
    basicStatistics += "Arithmetic Mean: " + str(np.mean(synopsisLength)) + "\n\t"
    basicStatistics += "Median: " + str(np.median(synopsisLength)) + "\n\t"
    basicStatistics += "Highest Value: " + str(highest) + "\n\t"
    basicStatistics += "Lowest Value: " + str(lowest) + "\n\t"
    # The mid-range is the midpoint between the extremes, (max + min) / 2.
    # The previous (max - min) / 2 reported half the range under this label.
    basicStatistics += "Mid-range: " + str((highest + lowest) / 2) + "\n\t"
    basicStatistics += "Variance: " + str(np.var(synopsisLength,ddof=1)) + "\n\t"
    basicStatistics += "Standard Deviation: " + str(np.std(synopsisLength,ddof=1)) + "\n\t"
    return basicStatistics

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """Format the percentiles 0.5, 1.0, ..., 100.0 of ``arrayOfSizes`` as text.

    Parameters
    ----------
    arrayOfSizes : sequence of numbers
        Sizes to summarise.

    Returns
    -------
    str
        A "Percentiles: " header followed by 200 entries of the form
        "Percentile <q>: <value>", each terminated by "\\n\\t".
    """
    synopsisLength = np.array(arrayOfSizes)
    # Build the levels from integers so the printed labels are exact
    # (0.5, 1.0, ..., 100.0) and no float accumulation is involved.
    levels = [step / 2 for step in range(1, 201)]
    # One vectorised call instead of 200 separate np.percentile calls.
    values = np.percentile(synopsisLength, levels)
    percentiles = "Percentiles: " + "\n\t"
    percentiles += "".join(
        "Percentile " + str(level) + ": " + str(value) + "\n\t"
        for level, value in zip(levels, values)
    )
    return percentiles

#Extract some statistics about this Dataset
def extractStatisticsFromSynopsis(dataframe, title, column):
    """Build a plain-text statistics report for one text column of ``dataframe``.

    The report starts with ``title`` and the dataset shape, followed by two
    sections for ``dataframe[column]``: one on lengths in characters and one
    on lengths in words. Each section holds the basic statistics and then
    the percentiles.
    """
    sections = [title + "\n\n"]
    rowCount, columnCount = dataframe.shape[0], dataframe.shape[1]
    sections.append(
        "Shape of Dataset: " + str(rowCount) + " rows and " + str(columnCount) + " columns" + "\n\n"
    )

    # Same layout for both units; only the heading and the counter differ.
    layout = (
        ("Statistics of Synopsis Length (Characters):", countCharactersInEachString),
        ("Statistics of Synopsis Length (Words):", countWordsInEachString),
    )
    for heading, counter in layout:
        sections.append(heading + "\n")
        sizes = counter(dataframe, column)
        sections.append(extractBasicStatisticsAboutSynopsis(sizes) + "\n")
        sections.append(extractPercentilesAboutSynopsis(sizes) + "\n")

    return "".join(sections)

In [7]:
def removeNumbersFromSynopsis(text):
    """Return ``text`` with every run of ASCII digits (0-9) removed.

    Surrounding characters, including whitespace, are left as they are.
    """
    digitRuns = re.compile('[0-9]+')
    return digitRuns.sub('', text)

# Replace every stemmed synopsis with its digit-free version.
df['stemmedSynopsis'] = list(map(removeNumbersFromSynopsis, df['stemmedSynopsis']))

In [8]:
# Persist the cleaned dataset (the default index is written as well) and a
# text log with length statistics of the cleaned synopses.
df.to_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
with open(filePath+'logDataSetDepoisDoStemmingERemocaoDeAcentos.txt','w') as log:
    log.write(extractStatisticsFromSynopsis(df, "Statistics of Dataset - Stemmed Synopsis without accents", 'stemmedSynopsis'))


# Reload the file just written and discard the index column that to_csv
# stored, which read_csv names 'Unnamed: 0'. `columns=` replaces the
# positional axis argument (drop([...], 1)), which pandas deprecated in 1.3
# and removed in 2.0.
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# Load the cleaned dataset and discard the saved index column
# ('Unnamed: 0'). `columns=` replaces the positional axis argument
# (drop([...], 1)), which pandas deprecated in 1.3 and removed in 2.0.
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# The corpus is the column of stemmed synopses, one document per row.
# (The former `corpus = []` initialisation was dead code: it was overwritten
# immediately by this assignment.)
corpus = df['stemmedSynopsis']

In [6]:
# Show how many entries the corpus holds.
len(corpus)

10170

In [7]:
# Fit a TF-IDF vectorizer (default settings) on the corpus and transform the
# same corpus into its document-term matrix in one step.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus)

In [8]:
# Densify the TF-IDF matrix into a DataFrame, one row per document.
vector_df = pd.DataFrame(vector.toarray())
# Label each column with its vocabulary term. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vector_df.columns = vectorizer.get_feature_names_out()
else:
    vector_df.columns = vectorizer.get_feature_names()
# Attach each row's title as the last column.
vector_df['titulo'] = df['titulo']

In [9]:
def fixColumnsOrder(dataframe):
    """Return a view/copy of ``dataframe`` with its last column moved to the front.

    The remaining columns keep their relative order. The frame passed in is
    not reordered; the selection is returned as a new object.
    """
    ordered = dataframe.columns.tolist()
    if ordered:
        # Rotate right by one: the final label becomes the first.
        ordered.insert(0, ordered.pop())
    return dataframe[ordered]

In [10]:
# Move the last column ('titulo', appended when vector_df was built) to the front.
# NOTE: not idempotent - re-running this cell rotates the columns again.
vector_df = fixColumnsOrder(vector_df)

In [11]:
# Display the TF-IDF frame (title column first, then one column per term).
vector_df

Unnamed: 0,titulo,aa,aal,aba,abacax,abad,abajur,abal,abandon,abastec,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,Tapete Vermelho,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hunter,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cage,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Espere Por Mim,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Vox,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Ayra,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Amor Plus Size,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,O Sal da Vida,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Aliança,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,A Dança dos Dragões,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the dense TF-IDF frame. The default index is written as well; the
# reload step later drops it as the 'Unnamed: 0' column.
vector_df.to_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [3]:
# Reload the saved TF-IDF frame from disk.
vector_df = pd.read_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [4]:
# Discard the saved index column. `columns=` replaces the positional axis
# argument (drop([...], 1)), which pandas deprecated in 1.3 and removed in 2.0.
vector_df = vector_df.drop(columns=['Unnamed: 0'])

In [5]:
# Remove the title column so that only the numeric TF-IDF columns remain for
# the similarity computation. `columns=` replaces the positional axis
# argument (drop([...], 1)), which pandas deprecated in 1.3 and removed in 2.0.
vector_df = vector_df.drop(columns=['titulo'])

In [6]:
# Display the feature-only TF-IDF frame (index and title columns removed).
vector_df

Unnamed: 0,aa,aal,aba,abacax,abad,abajur,abal,abandon,abastec,abat,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Pairwise cosine similarity between every pair of rows of vector_df.
# NOTE: despite the variable name these are similarities, not distances -
# the diagonal of the displayed result is 1.0.
matrixOfDistances = pd.DataFrame(cosine_similarity(vector_df))

In [8]:
# Persist the pairwise cosine-similarity matrix (default index is written too).
matrixOfDistances.to_csv(filePath+'MatrixOfDistances.csv')

In [9]:
# Display the pairwise cosine-similarity matrix.
matrixOfDistances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10160,10161,10162,10163,10164,10165,10166,10167,10168,10169
0,1.000000,0.016080,0.000000,0.029771,0.000000,0.000000,0.056553,0.053708,0.000000,0.020096,...,0.006715,0.035321,0.024102,0.011474,0.007308,0.000000,0.000000,0.000000,0.101723,0.027722
1,0.016080,1.000000,0.041151,0.096832,0.191309,0.059147,0.004309,0.025690,0.008841,0.009389,...,0.045196,0.016172,0.050084,0.004983,0.060733,0.025526,0.037742,0.024725,0.017435,0.058370
2,0.000000,0.041151,1.000000,0.044763,0.013057,0.018150,0.000000,0.000000,0.035764,0.008106,...,0.013869,0.000000,0.000000,0.000000,0.000000,0.015065,0.000000,0.000000,0.030687,0.007269
3,0.029771,0.096832,0.044763,1.000000,0.112914,0.091472,0.028815,0.051943,0.087026,0.084231,...,0.044098,0.000000,0.034884,0.009516,0.143178,0.023204,0.029163,0.019026,0.017723,0.117147
4,0.000000,0.191309,0.013057,0.112914,1.000000,0.019143,0.014739,0.000000,0.054515,0.000000,...,0.014628,0.000000,0.029954,0.099158,0.043623,0.010128,0.042327,0.000000,0.000000,0.092549
5,0.000000,0.059147,0.018150,0.091472,0.019143,1.000000,0.007728,0.035914,0.000000,0.063054,...,0.079999,0.000000,0.033120,0.000000,0.038282,0.000000,0.155916,0.000000,0.000000,0.058215
6,0.056553,0.004309,0.000000,0.028815,0.014739,0.007728,1.000000,0.028824,0.025631,0.054295,...,0.006472,0.000000,0.000000,0.021577,0.045562,0.019063,0.029062,0.040390,0.056791,0.075880
7,0.053708,0.025690,0.000000,0.051943,0.000000,0.035914,0.028824,1.000000,0.023785,0.070813,...,0.000000,0.101889,0.056170,0.038062,0.035918,0.033741,0.042982,0.022173,0.056790,0.081589
8,0.000000,0.008841,0.035764,0.087026,0.054515,0.000000,0.025631,0.023785,1.000000,0.078015,...,0.000000,0.024228,0.000000,0.000000,0.010586,0.054054,0.009754,0.000000,0.023575,0.055658
9,0.020096,0.009389,0.008106,0.084231,0.000000,0.063054,0.054295,0.070813,0.078015,1.000000,...,0.000000,0.011158,0.010935,0.043424,0.055335,0.094315,0.022018,0.044323,0.010003,0.079423
