In [1]:
import os
import re
from unicodedata import normalize

import numpy as np
import pandas as pd
from pandas import DataFrame

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory that holds the input/output CSV and log files used by this notebook.
# Set the BOOK_SIMILARITY_DATA_DIR environment variable to point the notebook at
# another location; when unset, the original local path is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, because the rest of the
# notebook builds file names by plain string concatenation (filePath + 'name.csv').
filePath = os.path.join(
    os.environ.get(
        'BOOK_SIMILARITY_DATA_DIR',
        '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/',
    ),
    '',
)

In [3]:
# Load the stemmed-synopsis dataset from CSV into a DataFrame.
df = pd.read_csv(filePath+'dataSetDepoisDoStemming.csv')
# 'Unnamed: 0' is the name pandas assigns to an unlabeled first CSV column.
# Drop it by keyword: passing the axis positionally (drop([...], 1)) was
# deprecated and then removed in pandas 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def remover_acentos(txt):
    """Return `txt` with accents stripped, keeping only ASCII characters.

    The text is NFKD-normalized (which separates base letters from their
    combining marks) and then encoded to ASCII with errors ignored, so any
    character without an ASCII form is silently dropped.
    """
    decomposed = normalize('NFKD', txt)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

In [5]:
# Strip accents from every stemmed synopsis, element by element.
df['stemmedSynopsis'] = df['stemmedSynopsis'].map(remover_acentos)

In [6]:
#Count number of characters in each string
def countCharactersInEachString(df, column):
    """Return a list with the character count of each value in `df[column]`.

    Every value is converted with str() before measuring, so non-string
    entries are counted by the length of their string form.
    """
    return [len(str(entry)) for entry in df[column]]

#Count number of words in each string
def countWordsInEachString(df, column):
    """Return a list with the token count of each value in `df[column]`.

    Every value is converted with str() and split with nltk's word_tokenize;
    the count is the number of tokens that tokenizer returns.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

#Extract some basic statistics about synopsis
def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Format summary statistics of a sequence of sizes as tab-indented text.

    Parameters:
        arrayOfSizes: sequence of numbers (e.g. character or word counts).

    Returns:
        A string that starts with a tab and contains one "Label: value" entry
        per statistic (mean, median, max, min, mid-range, sample variance and
        sample standard deviation), each entry terminated by "\\n\\t".

    Raises:
        ValueError: if `arrayOfSizes` is empty (numpy cannot take max/min of
        an empty array).
    """
    sizes = np.array(arrayOfSizes)
    highest = np.max(sizes)
    lowest = np.min(sizes)
    basicStatistics = "\t"
    basicStatistics += "Arithmetic Mean: " + str(np.mean(sizes)) + "\n\t"
    basicStatistics += "Median: " + str(np.median(sizes)) + "\n\t"
    basicStatistics += "Highest Value: " + str(highest) + "\n\t"
    basicStatistics += "Lowest Value: " + str(lowest) + "\n\t"
    # Mid-range is the average of the two extremes. The earlier formula,
    # (max - min) / 2, is half of the range and was reported under the wrong label.
    basicStatistics += "Mid-range: " + str((highest + lowest) / 2) + "\n\t"
    # ddof=1 selects the sample (n - 1) estimators for variance and deviation.
    basicStatistics += "Variance: " + str(np.var(sizes, ddof=1)) + "\n\t"
    basicStatistics += "Standard Deviation: " + str(np.std(sizes, ddof=1)) + "\n\t"
    return basicStatistics

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """Format the percentiles 0.5, 1.0, ..., 100.0 of a sequence of sizes.

    Parameters:
        arrayOfSizes: non-empty sequence of numbers.

    Returns:
        A string starting with "Percentiles: \\n\\t" followed by 200 entries of
        the form "Percentile <q>: <value>\\n\\t", in increasing order of q.
    """
    # Convert once and reuse the array; previously the array was built but the
    # raw input was handed to np.percentile on every iteration.
    sizes = np.array(arrayOfSizes)
    # Derive the levels from integers instead of repeatedly adding 0.5 to a float.
    levels = [halfSteps / 2 for halfSteps in range(1, 201)]
    entries = [
        "Percentile " + str(level) + ": " + str(np.percentile(sizes, level)) + "\n\t"
        for level in levels
    ]
    return "Percentiles: " + "\n\t" + "".join(entries)

#Extract some statistics about this Dataset
def extractStatisticsFromSynopsis(dataframe, title, column):
    """Build the full text report for one text column of `dataframe`.

    The report consists of the title, the dataset shape, and then, first for
    character counts and second for word counts of `column`, a heading followed
    by the basic statistics and the percentile listing.
    """
    rowCount, columnCount = dataframe.shape[0], dataframe.shape[1]
    sections = [
        title + "\n\n",
        "Shape of Dataset: " + str(rowCount) + " rows and " + str(columnCount) + " columns" + "\n\n",
    ]
    # Same report layout for both measurements; only the heading and counter differ.
    measurements = (
        ("Statistics of Synopsis Length (Characters):", countCharactersInEachString),
        ("Statistics of Synopsis Length (Words):", countWordsInEachString),
    )
    for heading, counter in measurements:
        sizes = counter(dataframe, column)
        sections.append(heading + "\n")
        sections.append(extractBasicStatisticsAboutSynopsis(sizes) + "\n")
        sections.append(extractPercentilesAboutSynopsis(sizes) + "\n")
    return "".join(sections)

In [7]:
def removeNumbersFromSynopsis(text):
    """Return `text` with every run of the digits 0-9 deleted."""
    return re.sub('[0-9]+', '', text)

# Delete digit runs from every stemmed synopsis, element by element.
df['stemmedSynopsis'] = df['stemmedSynopsis'].map(removeNumbersFromSynopsis)

In [8]:
# Persist the cleaned dataset; to_csv also writes the index as an unnamed first column.
df.to_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# Write the synopsis-length statistics report next to the CSV.
with open(filePath+'logDataSetDepoisDoStemmingERemocaoDeAcentos.txt','w') as log:
    log.write(extractStatisticsFromSynopsis(df, "Statistics of Dataset - Stemmed Synopsis without accents", 'stemmedSynopsis'))

# Reload from disk so later cells work on exactly what was saved.
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# The saved index comes back as a column named 'Unnamed: 0'. Drop it by keyword:
# the positional axis argument of drop() was removed in pandas 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [9]:
# Reload the cleaned dataset from disk (lets the notebook resume from this cell).
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# The saved index comes back as a column named 'Unnamed: 0'. Drop it by keyword:
# the positional axis argument of drop() was removed in pandas 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [10]:
# The corpus is the stemmed-synopsis column, one document per row.
# A synopsis that is an empty string when written to CSV is read back as NaN,
# and TfidfVectorizer rejects NaN documents, so map missing values back to ''.
# (The previous `corpus = []` initialisation was a dead store and is removed.)
corpus = df['stemmedSynopsis'].fillna('')

In [11]:
# Show the entry labelled 0 of the corpus as a quick sanity check of the preprocessing.
corpus[0]

'jov garot brasil resolv aventur curs ingl exteri poder mei itiner passei mescl estud ve torvel emoc conhec astr ascens cinem passei acab send intens pod sonh ve poder mund celebr glamour pod deslumbr ofusc ideal import capaz mant romanc holofot apag'

In [12]:
# Fit a TF-IDF model with scikit-learn's default settings on the corpus.
# `vector` is the document-term matrix returned by fit_transform (one row per document).
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus)

In [13]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists since 1.0. Prefer the new method and fall
# back to the old one so the cell also runs on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = list(vectorizer.get_feature_names())

# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.
vector_df = pd.DataFrame(vector.toarray())
vector_df.columns = featureNames
# Attach the book title of each row; it is added as the last column.
vector_df['titulo'] = df['titulo']

In [14]:
def fixColumnsOrder(dataframe):
    """Return `dataframe` with its last column moved to the first position.

    All other columns keep their relative order. The result is produced by
    column selection, so the input frame itself is not reordered.
    """
    names = dataframe.columns.tolist()
    rotated = [*names[-1:], *names[:-1]]
    return dataframe[rotated]

In [15]:
# Move the last column ('titulo', appended in the previous cell) to the front.
# NOTE: this cell is not idempotent - each re-run rotates the columns once more.
vector_df = fixColumnsOrder(vector_df)

In [16]:
# Display the TF-IDF frame (title column first, then one column per term).
vector_df

Unnamed: 0,titulo,aa,aal,ab,aba,abacax,abad,abadess,abaf,abaix,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,Tapete Vermelho,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Espere Por Mim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Vox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Ayra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Amor Plus Size,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,O Sal da Vida,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Aliança,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,A Dança dos Dragões,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Save the dense TF-IDF frame to CSV; the index is written as an unnamed first column.
vector_df.to_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [18]:
# Reload the TF-IDF frame from the CSV written above (lets the notebook resume from here).
vector_df = pd.read_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [19]:
# Remove the saved index column. Use the `columns` keyword: the positional
# axis argument of drop() was removed in pandas 2.0.
vector_df = vector_df.drop(columns=['Unnamed: 0'])

In [20]:
# Remove the title column so only the numeric term weights remain. Use the
# `columns` keyword: the positional axis argument of drop() was removed in pandas 2.0.
vector_df = vector_df.drop(columns=['titulo'])

In [21]:
# Display the frame after the index and title columns were dropped.
vector_df

Unnamed: 0,aa,aal,ab,aba,abacax,abad,abadess,abaf,abaix,abajur,...,zombet,zombi,zon,zoo,zoolog,zoom,zulu,zumb,zumvel,zuret
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Pairwise cosine similarity between all rows of vector_df, as a square DataFrame.
# NOTE: despite the variable name, the values are similarities, not distances
# (the displayed result has 1.0 on the diagonal).
matrixOfDistances = pd.DataFrame(cosine_similarity(vector_df))

In [23]:
# Save the pairwise cosine-similarity matrix to CSV (index included).
matrixOfDistances.to_csv(filePath+'MatrixOfDistances.csv')

In [24]:
# Display the pairwise cosine-similarity matrix.
matrixOfDistances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10160,10161,10162,10163,10164,10165,10166,10167,10168,10169
0,1.000000,0.018067,0.008360,0.015446,0.013183,0.000000,0.068224,0.048008,0.030287,0.019349,...,0.004967,0.027053,0.021707,0.018627,0.011530,0.041787,0.000000,0.000000,0.080154,0.033435
1,0.018067,1.000000,0.032383,0.051240,0.195957,0.125094,0.008770,0.049871,0.020897,0.019712,...,0.027924,0.017037,0.047190,0.024922,0.035595,0.013153,0.035296,0.012811,0.029355,0.081287
2,0.008360,0.032383,1.000000,0.022615,0.008126,0.018107,0.000000,0.000000,0.031577,0.014329,...,0.009193,0.007010,0.000000,0.004894,0.009546,0.058355,0.000000,0.000000,0.018854,0.031124
3,0.015446,0.051240,0.022615,1.000000,0.105555,0.115457,0.055015,0.040403,0.077003,0.080270,...,0.026249,0.000000,0.019350,0.010191,0.119002,0.095214,0.035699,0.037546,0.026618,0.094572
4,0.013183,0.195957,0.008126,0.105555,1.000000,0.031331,0.046818,0.061215,0.062125,0.016224,...,0.014777,0.015029,0.021719,0.107475,0.057321,0.028030,0.035300,0.018378,0.011451,0.088004
5,0.000000,0.125094,0.018107,0.115457,0.031331,1.000000,0.053434,0.025015,0.041970,0.052668,...,0.050850,0.009855,0.020175,0.043055,0.023068,0.069397,0.102945,0.000000,0.012864,0.057833
6,0.068224,0.008770,0.000000,0.055015,0.046818,0.053434,1.000000,0.052968,0.047481,0.065945,...,0.014164,0.000000,0.000000,0.024295,0.046160,0.044684,0.028846,0.042134,0.053438,0.075410
7,0.048008,0.049871,0.000000,0.040403,0.061215,0.025015,0.052968,1.000000,0.031788,0.065156,...,0.004236,0.103722,0.044845,0.049463,0.051287,0.034649,0.040568,0.038952,0.066756,0.073528
8,0.030287,0.020897,0.031577,0.077003,0.062125,0.041970,0.047481,0.031788,1.000000,0.065754,...,0.000000,0.021197,0.017363,0.033416,0.008053,0.063611,0.006741,0.000000,0.028890,0.071468
9,0.019349,0.019712,0.014329,0.080270,0.016224,0.052668,0.065945,0.065156,0.065754,1.000000,...,0.000000,0.033142,0.008562,0.034735,0.060598,0.082663,0.026128,0.056457,0.037884,0.088900
