# Análise de sentimento dos livros da Bíblia

## Luís Guilherme Ribeiro

## Importando libs

In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luisguilhermeribeiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/luisguilhermeribeiro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/luisguilhermeribeiro/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Coletando dados

In [2]:
# Load the dataset: read the CSV at `df_path` into `df`, then display its
# (rows, columns) shape as the cell output.
df_path = "./biblia_almeida_completa.csv"
df = pd.read_csv(df_path)
df.shape

(31482, 11)

## Aplicando lematização

In [3]:
def lem(a):
    """Lemmatize every word of every text line, treating each word as a verb.

    Lemmatization reduces inflected words to a root form that is itself a
    dictionary word.

    Each line is split on single spaces (so consecutive spaces are preserved
    as empty tokens), every token is passed through
    ``nltk.WordNetLemmatizer.lemmatize`` with ``pos="v"``, and the tokens are
    joined back with single spaces.

    NOTE(review): WordNetLemmatizer is backed by the English WordNet, while
    the notebook's text column appears to be Portuguese -- confirm that this
    lemmatizer actually changes the tokens as intended.

    Parameters
    ----------
    a : iterable of str
        The text lines to process (e.g. a pandas Series of strings).

    Returns
    -------
    list of str
        One lemmatized line per input line, in the same order.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Build each line with a single join instead of growing a string token by
    # token, which is quadratic in the line length.
    return [
        ' '.join(lemmatizer.lemmatize(word, pos="v") for word in line.split(' '))
        for line in a
    ]

In [4]:
# Store the lemmatized version of every text line next to the original text.
df['t_lem'] = lem(df['texto'])

## Agrupamento e SVD

Uso um vetorizador TF-IDF e a decomposição em valores singulares (SVD) para reduzir os textos a duas componentes e combiná-las com os metadados do conjunto de dados:

In [5]:
# TF-IDF vectorizer: keeps at most 1000 terms and ignores terms that occur in
# more than half of the documents.
vectorizer = TfidfVectorizer(max_features=1000, max_df=0.5)
# Truncated Singular Value Decomposition (SVD): reduces the TF-IDF matrix to
# two components.
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
X = svd.fit_transform(vectorizer.fit_transform(df['t_lem']))

# Pair the two components with the descriptive columns of each row.
cluster_data = pd.DataFrame({
    'Comp1': X[:, 0],
    'Comp2': X[:, 1],
    'Testamento': df['testamento'],
    'Livro': df['livro'],
    'Periodo': df['periodo'],
    'Localizacao': df['localizacao'],
    'Tempo': df['tempo'],
    'Autor': df['autor'],
})
cluster_data.head()

Unnamed: 0,Comp1,Comp2,Testamento,Livro,Periodo,Localizacao,Tempo,Autor
0,0.165387,0.047691,antigo,Genesis,Persian,Israel,-500,Moises
1,0.061626,-0.010588,antigo,Genesis,Persian,Israel,-500,Moises
2,0.079983,-0.071173,antigo,Genesis,Persian,Israel,-500,Moises
3,0.108721,-0.065556,antigo,Genesis,Persian,Israel,-500,Moises
4,0.070745,-0.028995,antigo,Genesis,Persian,Israel,-500,Moises


In [None]:
# Two side-by-side scatter plots of the SVD components: the left panel is
# coloured/marked by testament, the right panel by period.
f, axes = plt.subplots(1, 2, figsize=(18, 6))

# x and y must be passed by keyword: recent seaborn releases accept only
# `data` positionally, so the positional form raises a TypeError there.
sns.scatterplot(
    data=cluster_data, x='Comp1', y='Comp2',
    hue='Testamento', style='Testamento', ax=axes[0],
).set_title('Por Testamento')
sns.scatterplot(
    data=cluster_data, x='Comp1', y='Comp2',
    hue='Periodo', style='Periodo', ax=axes[1],
).set_title('Por Periodo');

## Padrões de agrupamento
Agora vamos aplicar o algoritmo k-means a fim de identificar padrões nos agrupamentos de alguns livros.

Com isso, será possível entender certos padrões de escrita de alguns autores, tais como Paulo, João e Lucas.

In [None]:
# Elbow method: fit k-means for every k from 1 to 29 and plot the
# within-cluster sum of squares (inertia) to pick the number of clusters.
sns.set(rc={'figure.figsize': (10, 10)})

ks = range(1, 30)
wcss = []
for i in ks:
    # fit() returns the fitted estimator itself, so it can be chained.
    clustering = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X)
    wcss.append(clustering.inertia_)

sns.lineplot(x=ks, y=wcss);

In [None]:
# Fit the final k-means model with 5 clusters on the SVD components.
# random_state is fixed (matching the SVD and elbow cells) so that the cluster
# labels, and every percentage derived from them below, are reproducible
# across kernel restarts.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)
# Number of rows assigned to each cluster label.
Counter(kmeans.labels_)

In [None]:
# Build a new frame with the cluster label of every row. `assign` returns a
# copy, so `df` itself is left untouched (a plain `df_cluster = df` would only
# alias the same object and silently add the column to `df` as well).
df_cluster = df.assign(cluster=kmeans.labels_)

### Identificando o autor do livro de Hebreus
**TODO:** explicar a abordagem usada nesta seção.

In [None]:
# Percentage of the rows authored by 'Paulo' that fall in each cluster,
# printed from the most to the least frequent cluster.
cts_paulo = df_cluster.loc[df_cluster['autor'] == 'Paulo', 'cluster'].value_counts()
# Total is loop-invariant, so compute it once; int() keeps `perc` a plain int.
total_paulo = int(cts_paulo.sum())
print('Paulo')
for cluster_id, count in cts_paulo.items():
    perc = round(count * 100 / total_paulo)
    print(cluster_id, ' -> ', perc, '%')

In [None]:
# Percentage of the rows authored by 'Lucas' that fall in each cluster,
# printed from the most to the least frequent cluster.
cts_lucas = df_cluster.loc[df_cluster['autor'] == 'Lucas', 'cluster'].value_counts()
# Total is loop-invariant, so compute it once; int() keeps `perc` a plain int.
total_lucas = int(cts_lucas.sum())
print('Lucas')
for cluster_id, count in cts_lucas.items():
    perc = round(count * 100 / total_lucas)
    print(cluster_id, ' -> ', perc, '%')

In [None]:
# Percentage of the rows of the book 'Hebreus' that fall in each cluster,
# printed from the most to the least frequent cluster.
cts_hebreus = df_cluster.loc[df_cluster['livro'] == 'Hebreus', 'cluster'].value_counts()
# Total is loop-invariant, so compute it once; int() keeps `perc` a plain int.
total_hebreus = int(cts_hebreus.sum())
print('Hebreus')
for cluster_id, count in cts_hebreus.items():
    perc = round(count * 100 / total_hebreus)
    print(cluster_id, ' -> ', perc, '%')

In [None]:
# TODO: analisar os resultados e concluir