In [1]:
import os
import re
from unicodedata import normalize

import numpy as np
import pandas as pd
from pandas import DataFrame

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Directory that holds every CSV/log file read or written by this notebook.
# Set the SYNOPSIS_DATA_DIR environment variable to run on another machine;
# when it is unset the original author's location is used unchanged.
# os.path.join(..., '') guarantees the trailing separator that the
# `filePath + 'name.csv'` concatenations in later cells depend on.
filePath = os.path.join(
    os.environ.get(
        'SYNOPSIS_DATA_DIR',
        '/Users/luisricardoferraz/book-similarity/test/tutorial/tutorial/spiders/synopsis-preprocessing/',
    ),
    '',
)

In [3]:
# Load the stemmed-synopsis dataset from CSV into a DataFrame.
df = pd.read_csv(filePath+'dataSetDepoisDoStemming.csv')
# Discard the 'Unnamed: 0' column (presumably the index saved by an earlier
# DataFrame.to_csv call - verify against the step that produced this file).
# The axis is named through columns= because the positional form
# drop(labels, 1) was deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def remover_acentos(txt):
    """Strip accents from `txt`, returning a pure-ASCII string.

    The text is decomposed with NFKD normalization so that accented letters
    become a base letter followed by combining marks; encoding to ASCII with
    errors ignored then drops every character that has no ASCII form.
    """
    decomposed = normalize('NFKD', txt)
    ascii_only = decomposed.encode('ASCII', 'ignore')
    return ascii_only.decode('ASCII')

In [5]:
# Replace every stemmed synopsis with its accent-free ASCII form.
df['stemmedSynopsis'] = df['stemmedSynopsis'].map(remover_acentos)

In [6]:
#Count number of characters in each string
def countCharactersInEachString(df, column):
    """Return a list with the character length of every entry in df[column].

    Each entry is converted with str() before it is measured, so non-string
    values contribute the length of their string representation.
    """
    return [len(str(entry)) for entry in df[column]]

#Count number of words in each string
def countWordsInEachString(df, column):
    """Return a list with the token count of every entry in df[column].

    Each entry is converted with str() and split with nltk's word_tokenize;
    the count is the number of tokens that call returns.
    """
    return [len(word_tokenize(str(entry))) for entry in df[column]]

#Extract some basic statistics about synopsis
def extractBasicStatisticsAboutSynopsis(arrayOfSizes):
    """Format summary statistics for a collection of synopsis sizes.

    Parameters
    ----------
    arrayOfSizes : sequence of numbers
        One size per synopsis (the callers in this notebook pass character
        counts or word counts).

    Returns
    -------
    str
        A report that starts with a tab and contains one "<label>: <value>"
        entry per statistic, each entry followed by a newline and a tab.
        Variance and standard deviation are the sample versions (ddof=1).

    Raises
    ------
    ValueError
        If `arrayOfSizes` is empty (numpy cannot take the max/min of an
        empty array).
    """
    sizes = np.array(arrayOfSizes)
    highest = np.max(sizes)
    lowest = np.min(sizes)
    entries = [
        ("Arithmetic Mean", np.mean(sizes)),
        ("Median", np.median(sizes)),
        ("Highest Value", highest),
        ("Lowest Value", lowest),
        # Mid-range is the midpoint between the extremes, (max + min) / 2.
        # The previous (max - min) / 2 reported half of the range instead.
        ("Mid-range", (highest + lowest) / 2),
        ("Variance", np.var(sizes, ddof=1)),
        ("Standard Deviation", np.std(sizes, ddof=1)),
    ]
    return "\t" + "".join(label + ": " + str(value) + "\n\t" for label, value in entries)

def extractPercentilesAboutSynopsis(arrayOfSizes):
    """Format the percentiles 0.5, 1.0, ..., 100.0 of `arrayOfSizes`.

    Returns a string that opens with a "Percentiles: " header and then lists
    one "Percentile <q>: <value>" entry for each of the 200 half-point steps,
    every entry followed by a newline and a tab.
    """
    # 1/2, 2/2, ..., 200/2 -> 0.5, 1.0, ..., 100.0 (all exact in binary floating point).
    steps = [half / 2 for half in range(1, 201)]
    entries = [
        "Percentile " + str(q) + ": " + str(np.percentile(arrayOfSizes, q)) + "\n\t"
        for q in steps
    ]
    return "Percentiles: " + "\n\t" + "".join(entries)

#Extract some statistics about this Dataset
def extractStatisticsFromSynopsis(dataframe, title, column):
    """Build the full text report for one text column of `dataframe`.

    The report consists of `title`, the DataFrame shape, and then the basic
    statistics and percentiles of the column's lengths measured first in
    characters and then in words.
    """
    rowCount, columnCount = dataframe.shape[0], dataframe.shape[1]
    sections = [
        title + "\n\n",
        "Shape of Dataset: " + str(rowCount) + " rows and " + str(columnCount) + " columns" + "\n\n",
    ]

    # Lengths in characters.
    charSizes = countCharactersInEachString(dataframe, column)
    sections.append("Statistics of Synopsis Length (Characters):" + "\n")
    sections.append(extractBasicStatisticsAboutSynopsis(charSizes) + "\n")
    sections.append(extractPercentilesAboutSynopsis(charSizes) + "\n")

    # Lengths in words.
    wordSizes = countWordsInEachString(dataframe, column)
    sections.append("Statistics of Synopsis Length (Words):" + "\n")
    sections.append(extractBasicStatisticsAboutSynopsis(wordSizes) + "\n")
    sections.append(extractPercentilesAboutSynopsis(wordSizes) + "\n")

    return "".join(sections)

In [7]:
def removeNumbersFromSynopsis(text):
    """Return `text` with every run of ASCII digits (0-9) deleted."""
    return re.sub('[0-9]+', '', text)

# Strip digit runs from every stemmed synopsis.
df['stemmedSynopsis'] = df['stemmedSynopsis'].map(removeNumbersFromSynopsis)

In [8]:
# Save the cleaned dataset and write a text report with its length statistics.
df.to_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
with open(filePath+'logDataSetDepoisDoStemmingERemocaoDeAcentos.txt','w') as log:
    log.write(extractStatisticsFromSynopsis(df, "Statistics of Dataset - Stemmed Synopsis without accents", 'stemmedSynopsis'))


# Reload the file that was just written.
# NOTE(review): with read_csv defaults, a synopsis that became an empty string
# is read back as NaN - confirm downstream cells tolerate that.
df = pd.read_csv(filePath+'dataSetDepoisDoStemmingERemocaoDeAcentos.csv')
# to_csv above also stored the index, which read_csv returns as 'Unnamed: 0'.
# The axis is named through columns= because the positional form
# drop(labels, 1) was deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns=['Unnamed: 0'])

In [9]:
# Documents for TF-IDF: one stemmed synopsis per row of df.
# The former `corpus = []` was dead code (overwritten on the next line).
# Synopses that are empty are read back from CSV as NaN, and TfidfVectorizer
# raises on NaN documents, so they are mapped to empty strings here.
corpus = df['stemmedSynopsis'].fillna('')

In [10]:
# Build a TF-IDF vectorizer with scikit-learn's default settings, then learn
# the vocabulary from `corpus` and transform it in one step. `vector` holds
# one row per synopsis; the next cell converts it with .toarray().
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus)

In [11]:
# Materialize the TF-IDF matrix as a dense DataFrame: one row per synopsis,
# one column per vocabulary term.
vector_df = pd.DataFrame(vector.toarray())
# Label the columns with the vocabulary terms. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2; get_feature_names_out()
# is its replacement, with a fallback kept for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vector_df.columns = vectorizer.get_feature_names_out()
else:
    vector_df.columns = vectorizer.get_feature_names()
# Attach the book title so each row can be identified.
# NOTE(review): if 'titulo' is itself a vocabulary term, this assignment
# overwrites that term's column instead of appending a new one - verify.
vector_df['titulo'] = df['titulo']

In [12]:
def fixColumnsOrder(dataframe):
    """Return `dataframe` with its last column moved to the first position.

    All other columns keep their relative order. The input frame is not
    modified; the reordered selection is returned.
    """
    names = list(dataframe.columns)
    rotated = [*names[-1:], *names[:-1]]
    return dataframe[rotated]

In [13]:
# Move the last column ('titulo', appended in the previous cell) to the front.
# Not idempotent: fixColumnsOrder always rotates the last column forward, so
# running this cell a second time would move a term column ahead of 'titulo'.
vector_df = fixColumnsOrder(vector_df)

In [14]:
# Display the TF-IDF frame: 'titulo' first, followed by one column per term.
vector_df

Unnamed: 0,titulo,aa,aaa,aaaargh,aal,aalil,aar,aarif,aaron,aash,...,zumvel,zurd,zuret,zusak,zusammenarbeit,zverev,zweig,zygmunt,zym,zyrk
0,Tapete Vermelho,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Espere Por Mim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Vox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Ayra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Amor Plus Size,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,O Sal da Vida,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Aliança,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,A Dança dos Dragões,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Save the TF-IDF matrix. The index is written too, which is why the reload
# further down has to drop an 'Unnamed: 0' column.
vector_df.to_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [5]:
# Reload the TF-IDF matrix from the CSV written above (presumably so the
# similarity step can start from disk without re-running the vectorizer).
vector_df = pd.read_csv(filePath+'MatrixOfVectorizedSynopsis.csv')

In [7]:
# Remove the saved index column. The axis is named through columns= because
# the positional form drop(labels, 1) was removed in pandas 2.0.
vector_df = vector_df.drop(columns=['Unnamed: 0'])

In [9]:
# Remove the title column so only numeric TF-IDF weights remain for the
# similarity computation. The axis is named through columns= because the
# positional form drop(labels, 1) was removed in pandas 2.0.
vector_df = vector_df.drop(columns=['titulo'])

In [10]:
# Display the frame after dropping the index and title columns; only the
# per-term weight columns should be left.
vector_df

Unnamed: 0,aa,aaa,aaaargh,aal,aalil,aar,aarif,aaron,aash,ab,...,zumvel,zurd,zuret,zusak,zusammenarbeit,zverev,zweig,zygmunt,zym,zyrk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Pairwise cosine similarity between all rows of vector_df.
# Despite the variable name, these values are similarities, not distances:
# larger means more alike, and the diagonal is 1.0 in the output shown below.
# Row/column i of the result corresponds to row i of vector_df.
matrixOfDistances = pd.DataFrame(cosine_similarity(vector_df))

In [13]:
# Save the pairwise similarity matrix (the index is written as the first CSV column).
matrixOfDistances.to_csv(filePath+'MatrixOfDistances.csv')

In [12]:
# Display the similarity matrix; entry (i, j) compares synopsis i with synopsis j.
matrixOfDistances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10212,10213,10214,10215,10216,10217,10218,10219,10220,10221
0,1.000000,0.016450,0.031877,0.011594,0.010607,0.029197,0.047463,0.045683,0.026790,0.017968,...,0.021805,0.023105,0.016549,0.021680,0.014848,0.042992,0.004016,0.036547,0.071151,0.029870
1,0.016450,1.000000,0.028819,0.060990,0.183115,0.146655,0.006117,0.040227,0.018588,0.017932,...,0.026519,0.014490,0.036263,0.022740,0.022499,0.011901,0.022730,0.032334,0.026193,0.063477
2,0.031877,0.028819,1.000000,0.022757,0.010844,0.014555,0.003085,0.000000,0.032530,0.016199,...,0.022468,0.005892,0.000000,0.014022,0.017495,0.054137,0.000000,0.014145,0.023977,0.034036
3,0.011594,0.060990,0.022757,1.000000,0.083055,0.078685,0.034327,0.029704,0.056767,0.066059,...,0.024049,0.002907,0.012277,0.009511,0.065348,0.090793,0.016163,0.021359,0.028155,0.066783
4,0.010607,0.183115,0.010844,0.083055,1.000000,0.026692,0.067573,0.052600,0.062248,0.025689,...,0.015692,0.014205,0.019022,0.090912,0.031988,0.026334,0.017114,0.011200,0.013805,0.065574
5,0.029197,0.146655,0.014555,0.078685,0.026692,1.000000,0.033631,0.046232,0.042730,0.042599,...,0.043468,0.007676,0.021291,0.049175,0.016292,0.062681,0.050288,0.089767,0.010329,0.050545
6,0.047463,0.006117,0.003085,0.034327,0.067573,0.033631,1.000000,0.035071,0.032278,0.045354,...,0.013369,0.002576,0.000000,0.018610,0.023886,0.037064,0.012095,0.022175,0.039762,0.043093
7,0.045683,0.040227,0.000000,0.029704,0.052600,0.046232,0.035071,1.000000,0.040422,0.062146,...,0.006968,0.080308,0.030299,0.050233,0.031649,0.049046,0.038452,0.057443,0.052051,0.062001
8,0.026790,0.018588,0.032530,0.056767,0.062248,0.042730,0.032278,0.040422,1.000000,0.064187,...,0.007887,0.012630,0.017917,0.050685,0.004942,0.056321,0.003591,0.000000,0.030732,0.081394
9,0.017968,0.017932,0.016199,0.066059,0.025689,0.042599,0.045354,0.062146,0.064187,1.000000,...,0.004178,0.027093,0.013463,0.036331,0.033819,0.073955,0.020033,0.032123,0.042435,0.083517
