In [None]:
import pandas as pd
import re

# Import NLTK and fetch the resources the cleaning functions rely on
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# nltk.WordNetLemmatizer (used by lemmatize) needs the WordNet corpus;
# without this download a fresh kernel fails on the first lemmatize call.
nltk.download('wordnet')

In [None]:
# Load the Bible verses from the CSV file (path is relative to the notebook)
bible_df = pd.read_csv('bible.csv')

In [None]:
# Replace the single-letter CSV column names with descriptive ones
column_names = {'b': 'book_id', 'c': 'chapter_id', 'v': 'verse_id', 't': 'text'}
bible_df.rename(columns=column_names, inplace=True)

bible_df

In [None]:
# Now, clean !

# Use lambda to apply the function to each row of the DataFrame
def remove_all_punctuation(text):
    """Replace every character that is neither a word character nor whitespace with a space."""
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, ' ', text)

def remove_partial_punctuation(text):
    """Replace punctuation with a space, keeping the sentence terminators '.', '!' and '?'."""
    unwanted = r'[^\w\s.!?]'
    return re.sub(unwanted, ' ', text)

def carriage_return(text):
    """Replace each newline character in `text` with a single space."""
    newline = re.compile(r'\n')
    return newline.sub(' ', text)

def remove_double_space(text):
    """Collapse every run of two or more space characters into one space."""
    space_run = re.compile(r'[ ]{2,}')
    return space_run.sub(' ', text)

def remove_empty_strings(text):
    """Return a list with the falsy items (such as empty strings) of `text` removed."""
    return [item for item in text if item]

def remove_stopwords(text):
    """Tokenize `text` and drop English stopwords.

    Args:
        text: the string to filter; it is split with `tokenize`.

    Returns:
        The list of tokens (original casing kept) whose lower-cased form is
        not in NLTK's English stopword list.
    """
    # Build the lookup once per call as a set. The original evaluated
    # stopwords.words('english') again for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokenize(text) if word.lower() not in english_stopwords]

def tokenize(text):
    """Split `text` on single space characters (consecutive spaces yield empty tokens)."""
    separator = ' '
    tokens = text.split(separator)
    return tokens

def join(text):
    """Concatenate the strings in `text`, separated by single spaces."""
    separator = ' '
    return separator.join(text)

def lemmatize(text):
    """Return the list of lemmas of the tokens in `text`, using NLTK's WordNetLemmatizer."""
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]

def remove_numbers(text):
    """Delete the ASCII digits 0-9 from `text`."""
    digit_table = str.maketrans('', '', '0123456789')
    return text.translate(digit_table)

def remove_short_words(word_list):
    """Keep only the words of `word_list` that are longer than one character."""
    return list(filter(lambda word: len(word) > 1, word_list))

In [None]:
# Build the cleaned token list of every verse.
# Normalisation (newlines, case, punctuation, digits, repeated spaces) now runs
# BEFORE stopword removal. The original removed stopwords first, so a stopword
# still attached to punctuation (e.g. "him," or "them.") did not match the
# stopword list and survived into the cleaned column.
bible_df['cleaned'] = (
    bible_df['text']
    .apply(carriage_return)
    .apply(lambda verse: verse.lower())
    .apply(remove_all_punctuation)
    .apply(remove_numbers)
    .apply(remove_double_space)
    .apply(remove_stopwords)      # tokenizes, returns a list of words
    .apply(remove_empty_strings)
    .apply(lemmatize)
    .apply(remove_short_words)
)

bible_df

# Statistiques du texte (avec visualisations)

In [None]:
# Per-verse descriptive statistics: token statistics come from the cleaned
# token lists, the sentence count from the terminators in the raw text.
cleaned_tokens = bible_df['cleaned']
bible_df['word_count'] = cleaned_tokens.apply(lambda tokens: len(' '.join(tokens).split()))
bible_df['unique_words'] = cleaned_tokens.apply(lambda tokens: len(set(tokens)))
bible_df['avg_word_length'] = cleaned_tokens.apply(
    lambda tokens: sum(map(len, tokens)) / len(tokens) if tokens else 0
)
bible_df['sentence_count'] = bible_df['text'].apply(
    lambda verse: sum(verse.count(mark) for mark in '.!?')
)

bible_df





In [None]:
import matplotlib.pyplot as plt

# Total word count per book, largest first. nlargest(66) keeps up to 66 books,
# so this ranks the books rather than showing a short "top N" selection.
words_per_book = bible_df.groupby('book_id')['word_count'].sum()
top_books = words_per_book.nlargest(66)
average_word_count = top_books.mean()

ax = top_books.plot(kind='bar', figsize=(10, 6))
ax.axhline(average_word_count, color='red', linestyle='--', label='Average')
ax.set_xlabel('Book ID')
ax.set_ylabel('Word Count')
ax.set_title('Top Books by Word Count')
ax.legend()
plt.show()



In [None]:
import numpy as np

# Average word length per book, longest first.
# The original passed one bar per row of bible_df (one per verse) to plt.bar,
# all stacked on the same book_id x positions, so the figure did not show a
# per-book average. Here the per-verse values are aggregated per book first,
# weighting each verse by its number of cleaned tokens.
token_counts = bible_df['cleaned'].str.len()
chars_per_book = (bible_df['avg_word_length'] * token_counts).groupby(bible_df['book_id']).sum()
tokens_per_book = token_counts.groupby(bible_df['book_id']).sum()
avg_length_by_book = (chars_per_book / tokens_per_book).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
# String labels keep the bars in sorted order instead of numeric book order
plt.bar(avg_length_by_book.index.astype(str), avg_length_by_book.values)
plt.xlabel('Book ID')
plt.ylabel('Average Word Length')
plt.title('Average Word Length by Book')
plt.xticks(rotation=90)
plt.show()



## 2. Nettoyage des données

Définition de plusieurs fonctions de nettoyage des données, et application de ces dernières sur différentes colonnes du dataframe, correspondant à un niveau de nettoyage différent qui sera utile pour les prochaines étapes. 

In [None]:
import re

# Use lambda to apply the function to each row of the DataFrame
def remove_all_punctuation(text):
    """Substitute a space for each character that is not a word character or whitespace."""
    return re.compile(r'[^\w\s]').sub(' ', text)

def remove_partial_punctuation(text):
    """Substitute a space for punctuation, except for '.', '!' and '?' which are kept."""
    return re.compile(r'[^\w\s.!?]').sub(' ', text)

def carriage_return(text):
    """Turn every newline in `text` into a space."""
    pattern, replacement = r'\n', ' '
    return re.sub(pattern, replacement, text)

def remove_double_space(text):
    """Squeeze each run of two or more spaces down to a single space."""
    pattern, replacement = r'[ ]{2,}', ' '
    return re.sub(pattern, replacement, text)

def remove_empty_strings(text):
    """Return the truthy items of `text` as a list (drops empty strings)."""
    kept = []
    for item in text:
        if item:
            kept.append(item)
    return kept

def remove_stopwords(text):
    """Tokenize `text` and drop English stopwords.

    Args:
        text: the string to filter; it is split with `tokenize`.

    Returns:
        The list of tokens (original casing kept) whose lower-cased form is
        not in NLTK's English stopword list.
    """
    # A set built once per call replaces the original's per-token call to
    # stopwords.words('english').
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokenize(text) if word.lower() not in english_stopwords]

def tokenize(text):
    """Return the pieces of `text` obtained by splitting on each single space."""
    pieces = text.split(' ')
    return pieces

def join(text):
    """Return the items of `text` glued together with one space between them."""
    glue = ' '
    return glue.join(text)

def lemmatize(text):
    """Map every token of `text` to its WordNet lemma and return them as a list."""
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, text))

def remove_numbers(text):
    """Strip the ASCII digits 0-9 out of `text`."""
    return text.translate(str.maketrans('', '', '0123456789'))

def remove_single_letter_words(text):
    """Return the items of `text` whose length is greater than one."""
    return list(filter(lambda word: len(word) > 1, text))

# Work on a copy so the later analysis columns do not modify bible_df
bible_df_cleaned = bible_df.copy()

# Text without stopwords
# bible_df_cleaned['text_without_stopwords'] = bible_df_cleaned['text'] \
#     .apply(lambda x: remove_stopwords(x)) \
#     .apply(lambda x: join(x)) \
#     .apply(lambda x: carriage_return(x)) \
#     .apply(lambda x: remove_double_space(x))

# # Clean text partially for later
# bible_df_cleaned['text_partially_cleaned'] = bible_df_cleaned['text_without_stopwords'] \
#     .apply(lambda x: remove_partial_punctuation(x)) \
#     .apply(lambda x: remove_double_space(x)) \
#     .apply(lambda x: x.lower()) \

# # Advanced cleaning
# bible_df_cleaned['text'] = bible_df_cleaned['text_partially_cleaned'] \
#     .apply(lambda x: remove_all_punctuation(x)) \
#     .apply(lambda x: remove_numbers(x)) \
#     .apply(lambda x: remove_double_space(x)) \
#     .apply(lambda x: tokenize(x)) \
#     .apply(lambda x: remove_single_letter_words(x)) \
#     .apply(lambda x: remove_empty_strings(x)) \
#     .apply(lambda x: lemmatize(x))

# Display the working copy to inspect the result of the cleaning
bible_df_cleaned


In [None]:
# Compare the raw text of the first verse with its cleaned tokens.
# The raw text was previously printed under the label "Cleaned text".
# .iloc[0] selects the first row by position, whatever the index labels are.
first_verse = bible_df_cleaned.iloc[0]
print('Original text: \n' + str(first_verse['text']))
print('Cleaned text: \n' + str(first_verse['cleaned']))
# print('Partially cleaned text (without puncutation except points):\n' + str(bible_df_cleaned['text_partially_cleaned'][0]))
# print('Text without stopwords and carriage return: \n' + str(bible_df_cleaned['text_without_stopwords'][0]))


## 3.Analyse des données

### Analyse des mots les plus fréquents

In [None]:
from collections import Counter

# Concatenate all cleaned tokens into a single string.
# A generator is used instead of Series.sum(): summing a column of lists
# concatenates them pairwise, which is quadratic in the number of verses.
all_text = ' '.join(
    token for tokens in bible_df_cleaned['cleaned'] for token in tokens
)

# Count the frequency of each word in the text
word_counts = Counter(all_text.split())

# Get the top 10 most common words
top_words = word_counts.most_common(10)

# Print the top words
for word, count in top_words:
    print(f'{word}: {count}')


In [None]:
# One row per verse, so the row count is the number of verses
verse_total = len(bible_df_cleaned.index)
print("Number of verse in the bible : ", verse_total)

In [None]:
# Count the distinct tokens over all verses.
# 'cleaned' holds the token lists; 'text' holds the raw verse string, so
# exploding it (as before) counted distinct verses rather than distinct words.
# nunique() also ignores the NaN produced by verses with an empty token list.
unique_words = bible_df_cleaned['cleaned'].explode().nunique()
print("Number of unique words in the Bible:", unique_words)

In [None]:
# Count the tokens that are exactly "jesus".
# str.count() matched substrings, so tokens merely containing "jesus" were
# counted as well; comparing whole tokens avoids that.
jesus_count = bible_df_cleaned['cleaned'].explode().eq('jesus').sum()

# Print the result
print("Number of times 'Jesus' appears in the Bible:", int(jesus_count))

In [None]:
# Count the tokens that are exactly "god". str.count() matched substrings,
# so tokens such as "godly" or "ungodly" were counted too.
god_count = bible_df_cleaned['cleaned'].explode().eq('god').sum()
print("Number of times 'God' appears in the Bible:", int(god_count))


In [None]:
import matplotlib.pyplot as plt

# One row per (book_id, token), built once and reused for both words
tokens_by_book = bible_df_cleaned[['book_id', 'cleaned']].explode('cleaned')

# Per-book number of tokens that are exactly the word. Whole-token comparison
# replaces str.count(), which also counted substrings such as "godly".
god_counts = tokens_by_book['cleaned'].eq('god').groupby(tokens_by_book['book_id']).sum()
jesus_counts = tokens_by_book['cleaned'].eq('jesus').groupby(tokens_by_book['book_id']).sum()

# Create a line plot
plt.figure(figsize=(10, 6))
plt.plot(god_counts.index, god_counts.values, label='God')
plt.plot(jesus_counts.index, jesus_counts.values, label='Jesus')
plt.xlabel('Book ID')
plt.ylabel('Number of Occurrences')
plt.title('Evolution of Word Occurrences (God and Jesus)')
plt.legend()
plt.show()

In [None]:
# Filter the dataframe to include only books 1 to 37
# NOTE(review): if book_id follows the usual 66-book ordering, the Old
# Testament ends at book 39, not 37 - confirm the intended boundary.
old_testament_df = bible_df_cleaned[bible_df_cleaned['book_id'].between(1, 37)]

# Count the tokens equal to "jesus" in the filtered dataframe.
# The lower-cased 'cleaned' tokens are used: the original searched the
# lowercase word in the raw 'text' column, whose casing is untouched.
jesus_count = old_testament_df['cleaned'].explode().eq('jesus').sum()

# Print the result
print("Number of times 'Jesus' appears between books 1 and 37:", jesus_count)

In [None]:
import matplotlib.pyplot as plt

# Names of the apostles
apostles = ['Simon', 'Andrew', 'James', 'John', 'Philip', 'Bartholomew', 'Matthew', 'Thomas', 'James (son of Alphaeus)', 'Thaddaeus', 'Simon the Zealot', 'Judas Iscariot']

# Restrict to books 38 and later once, and rebuild each verse as a
# space-separated string of its cleaned tokens so phrases can be searched.
later_books = bible_df_cleaned[bible_df_cleaned['book_id'] >= 38]
verse_strings = later_books['cleaned'].str.join(' ')

# Occurrences of each name per book.
# The name is escaped (it was used as a raw regex, so the parentheses in
# 'James (son of Alphaeus)' formed a group) and anchored on word boundaries
# (plain substring search counted "philippi" as "philip").
# NOTE(review): multi-word labels only match if the exact phrase survives the
# cleaning (stopwords such as "the" are removed), so they may stay at zero.
apostle_counts = {}
for apostle in apostles:
    pattern = r'\b' + re.escape(apostle.lower()) + r'\b'
    counts = verse_strings.str.count(pattern).groupby(later_books['book_id']).sum()
    apostle_counts[apostle] = counts.values
book_ids = counts.index

# One line per apostle
plt.figure(figsize=(15, 8))
for apostle in apostles:
    plt.plot(book_ids, apostle_counts[apostle], label=apostle)

plt.xlabel('ID du Livre')
plt.ylabel('Nombre d\'Occurrences')
plt.title('Évolution des Occurrences des Noms des Apôtres à partir du Livre 38 de la Bible')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
## 4.Transformation des données
### POS Tagging

In [None]:
import nltk

# The tagger model must be present before nltk.pos_tag is called. It was only
# downloaded in a later cell, so this cell failed on a fresh kernel.
nltk.download('averaged_perceptron_tagger')

# Perform part-of-speech tagging on the "cleaned" column
bible_df_cleaned['pos_tags'] = bible_df_cleaned['cleaned'].apply(nltk.pos_tag)

# Print only the "pos_tags" column
print(bible_df_cleaned['pos_tags'])


In [None]:
import nltk
from collections import Counter
import matplotlib.pyplot as plt

# Download the NLTK data for part-of-speech tagging
nltk.download('averaged_perceptron_tagger')

# Tag the "cleaned" column only if it has not been tagged yet: tagging every
# verse is expensive and the result would be identical.
if 'pos_tags' not in bible_df_cleaned.columns:
    bible_df_cleaned['pos_tags'] = bible_df_cleaned['cleaned'].apply(nltk.pos_tag)

# Collect every word whose tag starts with 'NN' (all noun tags)
all_nouns = [
    word
    for tags in bible_df_cleaned['pos_tags']
    for word, pos in tags
    if pos.startswith('NN')
]

# Count the frequency of each noun and keep the 10 most frequent
noun_counts = Counter(all_nouns)
top_nouns = noun_counts.most_common(10)

# Split the (noun, frequency) pairs into two parallel tuples
nouns, frequencies = zip(*top_nouns)

# Plot the frequency of the top 10 nouns
plt.figure(figsize=(10, 6))
plt.bar(nouns, frequencies)
plt.xlabel('Nouns')
plt.ylabel('Frequency')
plt.title('Top 10 Most Common Nouns (NN) in the Bible')
plt.xticks(rotation=45)
plt.show()


### NER 

In [None]:
# Fetch the NLTK resources downloaded for the named-entity chunking step
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Token lists and their POS tags, as computed in the previous steps
tokens = bible_df_cleaned['cleaned']
pos_tags = bible_df_cleaned['pos_tags']

# Show both for the verse labelled 0 as a sanity check
for series in (tokens, pos_tags):
    print(series[0])


In [None]:
import nltk

# Perform Named Entity Recognition (NER) on the POS tags of every verse
# NOTE(review): the tokens were lower-cased during cleaning; check that the
# chunker still recognises entities on such input.
ner_tags = [nltk.ne_chunk(tags) for tags in pos_tags]

# Print a small sample only: printing one tree per verse floods the output
for tags in ner_tags[:5]:
    print(tags)


### Bag of Words 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One space-separated string per verse, rebuilt from the cleaned tokens
cleaned_corpus = bible_df_cleaned['cleaned'].apply(' '.join)

# Bag-of-words counts: fit_transform returns a sparse matrix
cv = CountVectorizer()
cv_matrix = cv.fit_transform(cleaned_corpus)

# Dense copy of the counts (one row per verse, one column per vocabulary word)
cv_matrix1 = cv_matrix.toarray()

# Vocabulary learned by the vectorizer, used as column labels
vocab = cv.get_feature_names_out()
bible_df_feature_vectors = pd.DataFrame(data=cv_matrix1, columns=vocab)

bible_df_feature_vectors

### N-Gram Model

Récupération des bi-grammes les plus intéressants, avec une occurrence tous textes confondus supérieure à 30

In [None]:
# TODO: Not working, need to sample bigrams randomly

# bv = CountVectorizer(ngram_range=(2,2))
# bv_matrix = bv.fit_transform(cleaned_corpus)
# bv_matrix = bv_matrix.toarray()
# vocab = bv.get_feature_names_out()
# bible_df_bigram_trigram = pd.DataFrame(bv_matrix, columns=vocab)
# bible_df_bigram_trigram.sum()
# # Delete columns with less than 25 occurences everywhere 
# df_inaugural_corpus_bigram_trigram = bible_df_bigram_trigram.loc[:, (bible_df_bigram_trigram.sum(axis=0) >= 2)]

# Add year column to the dataframe as index
# bible_bigram_trigram.insert(0, 'year', bible_df_bigram_trigram['year'].values)
# bible_bigram_trigram

# # Plot the most common bigrams and trigrams by year
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10,6))
# plt.ylabel('Inaugural words count')
# plt.xlabel('Year') 
# plt.title('Most common bigrams and trigrams by year of the inaugural discourses')
# plt.subplots_adjust(bottom=0.15)
# plt.xticks(rotation=90)
# # make one plot for each bigram and trigram
# for column in bible_bigram_trigram.drop(columns=['year']).columns:
#     plt.plot(bible_bigram_trigram['year'], bible_df_bigram_trigram[column])

# # add legend    
# plt.legend(bible_bigram_trigram.drop(columns=['year']).columns, loc='upper left')


# bible_bigram_trigram



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Concatenate all cleaned tokens into a single string.
# A generator replaces Series.sum(), which concatenates the lists pairwise
# and is quadratic in the number of verses.
all_text = ' '.join(
    token for tokens in bible_df_cleaned['cleaned'] for token in tokens
)

# Bigram counts over the single concatenated document, learned and counted
# in one pass with fit_transform. Because the verses are concatenated, the
# last word of a verse forms a bigram with the first word of the next one.
cv = CountVectorizer(ngram_range=(2, 2))
counts = cv.fit_transform([all_text]).toarray().flatten()
vocab = cv.get_feature_names_out()

# Keep the bigrams that occur more than 30 times
filtered_bigrams = [bigram for bigram, count in zip(vocab, counts) if count > 30]

# Print the filtered bigrams
print(filtered_bigrams)


### TF-IDF

In [None]:
# Library imports
import scipy.sparse as sp
import numpy as np

# Reuse the bag-of-words frame built earlier
feature_names = list(bible_df_feature_vectors.columns)

# Column-compressed copy: consecutive indptr differences give the number of
# stored entries per column
bow_csc = sp.csc_matrix(bible_df_feature_vectors, copy=True)
stored_per_column = np.diff(bow_csc.indptr)

# Add 1 to smooth the idf computed later
df = stored_per_column + 1

# Show the smoothed document frequencies
pd.DataFrame([df], columns=feature_names)

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer on the verse strings (cleaned tokens joined by spaces).
# Iterating a DataFrame yields its column labels, so fitting on
# bible_df_feature_vectors treated every vocabulary word as a document
# instead of using the verses.
vect = TfidfVectorizer().fit(bible_df_cleaned['cleaned'].str.join(' '))
len(vect.get_feature_names_out())

In [None]:
# Transform the verse strings (one row per verse). Passing the bag-of-words
# DataFrame made the vectorizer iterate over its column labels instead.
X_vectorized = vect.transform(bible_df_cleaned['cleaned'].str.join(' '))

In [None]:
feature_names = np.array(vect.get_feature_names_out())

# Highest TF-IDF value reached by each term, then term positions sorted by it
max_tfidf_per_term = X_vectorized.max(0).toarray()[0]
sorted_tfidf_index = max_tfidf_per_term.argsort()

# First 10 positions = smallest maxima; last 10 taken in reverse = largest
smallest_terms = feature_names[sorted_tfidf_index[:10]]
largest_terms = feature_names[sorted_tfidf_index[:-11:-1]]

print('Smallest tfidf:\n{}\n'.format(smallest_terms))
print('Largest tfidf: \n{}'.format(largest_terms))

In [None]:
# Build one document per book by concatenating the text of its verses.
# The original fitted on individual verses and labelled row idx + 1 as the
# "book", so the table listed verses, not books.
book_texts = bible_df_cleaned.groupby('book_id')['text'].agg(' '.join)

# TF-IDF matrix restricted to terms of at least 8 word characters
vectorizer_tfidf = TfidfVectorizer(token_pattern=r'\b\w{8,}\b')
tfidf_matrix = vectorizer_tfidf.fit_transform(book_texts)

# Names of the terms (matrix columns)
feature_names = vectorizer_tfidf.get_feature_names_out()

# For each book, keep the 3 terms with the highest TF-IDF score
tfidf_book_results = []
for row_idx, book_name in enumerate(book_texts.index):
    # Dense score vector of this book (one entry per term)
    tfidf_scores = tfidf_matrix[row_idx, :].toarray()[0]

    # Positions of the 3 largest scores, best first. No extra length filter
    # is needed: the token pattern already guarantees the term length.
    top_positions = tfidf_scores.argsort()[::-1][:3]
    sorted_tfidf_list = [
        (feature_names[pos], round(float(tfidf_scores[pos]), 2))
        for pos in top_positions
    ]

    tfidf_book_results.append((book_name, sorted_tfidf_list))

# Build a dataframe from the results
tfidf_df = pd.DataFrame(tfidf_book_results, columns=['Livre', 'Top 3 Termes TF-IDF'])

tfidf_df


# Clustering

In [None]:
# TF-IDF with fit_transform
# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, max_features=1000, stop_words='english')

# Fit and transform the verses to create the embeddings.
# The token lists of 'cleaned' are joined into strings. The original joined
# the raw 'text' string itself, which inserts a space between every
# character and leaves the vectorizer without any usable token.
tfidf_matrix = tfidf_vectorizer.fit_transform(bible_df_cleaned['cleaned'].str.join(' '))

tfidf_matrix.shape


In [None]:
from sklearn.decomposition import PCA

# Initialize PCA and reduce dimensionality to 2 components.
# random_state is fixed (same seed as the KMeans/LDA steps) so that solvers
# relying on randomness give the same projection on every run.
pca = PCA(n_components=2, random_state=100)
reduced_tfidf = pca.fit_transform(tfidf_matrix.toarray())

reduced_tfidf.shape

In [None]:
# Elbow test to determine the optimal number of clusters
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Determine the optimal number of clusters using the Elbow method
scores = []  # within-cluster sum of squares
# range() excludes its upper bound: 11 is needed to really test up to 10
# clusters (the original range(1, 10) stopped at 9).
cluster_range = range(1, 11)

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=100)
    kmeans.fit(reduced_tfidf)
    scores.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, scores, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.grid(True)
plt.show()

In [None]:
# Perform KMeans clustering with 4 clusters
kmeans = KMeans(n_clusters=4, random_state=100)
clusters = kmeans.fit_predict(reduced_tfidf)

# Plot the clusters
plt.figure(figsize=(10, 6))
plt.scatter(reduced_tfidf[:, 0], reduced_tfidf[:, 1], c=clusters, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, c='black', marker='X', label='Centroids')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
# The data are Bible verses; the previous title mentioned inaugural discourses
plt.title('Clusters of Bible verses')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Pair every verse with the cluster assigned by KMeans
X_train = bible_df_cleaned['text']
y_km = kmeans.fit_predict(reduced_tfidf)
df_km = pd.DataFrame(data={'statements': X_train, 'topic_cluster': y_km})

# Number of verses per cluster, then a preview of the assignments
cluster_sizes = df_km.groupby('topic_cluster').count()
print(cluster_sizes)
df_km.head(10)

# LDA

In [None]:
from gensim import corpora
import gensim

# gensim expects every document as a list of tokens. The 'cleaned' column
# holds those lists; the raw 'text' column used before holds plain strings.
tokenized_verses = bible_df_cleaned['cleaned'].tolist()

## Build the dictionary (token <-> id mapping) of the data
id2word = corpora.Dictionary(tokenized_verses)
## Build the bag-of-words vector of every verse
corpus = [id2word.doc2bow(text) for text in tokenized_verses]

In [None]:
from pprint import pprint

# Number of topics to extract
num_topics = 4

# Build the LDA model from the bag-of-words corpus (seeded)
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
)
lda_model3 = gensim.models.LdaMulticore(**lda_settings)

# Print the keywords of each of the num_topics topics
pprint(lda_model3.print_topics(num_topics))
doc_lda3 = 0

In [None]:
from gensim.models import CoherenceModel

# Compute Coherence Score
# texts must be tokenized documents: the 'cleaned' token lists are passed
# instead of the raw 'text' strings.
coherence_model_lda = CoherenceModel(
    model=lda_model3,
    texts=bible_df_cleaned['cleaned'].tolist(),
    dictionary=id2word,
    coherence='u_mass',
)
# Keep the score in its own variable rather than overwriting the model object
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Prepare the topic visualisation for lda_model3, write it to an HTML file
# and display it in the notebook
vis_data = gensimvis.prepare(lda_model3, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis_data, 'output_topics_LDA.html')
pyLDAvis.display(vis_data)

In [None]:
## Extra, kept for reference
# Choosing number of topics Round 2
topics = list(range(1,20))
coherences = []

# Tokenized documents for the coherence model ('cleaned' holds token lists;
# the raw 'text' strings used before are not tokenized).
cleaned_texts = bible_df_cleaned['cleaned'].tolist()

for no_topics in topics:
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=no_topics, random_state=100)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=cleaned_texts, dictionary=id2word, coherence='u_mass')
    # Compute the coherence once and reuse it for both the list and the print
    coherence = coherence_model_lda.get_coherence()
    coherences.append(coherence)
    print(coherence)
doc_lda = lda_model[corpus]

# 3.Sentiment Analysis avec NLTK (from nltk.sentiment import SentimentIntensityAnalyzer)

In [None]:
# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer




In [None]:
# Display the dataframe before adding the sentiment columns
bible_df_cleaned

In [None]:
# Libraries for the sentiment analysis
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon is fetched before the analyzer is created
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Rebuild one space-separated string per verse from its token list
bible_df_cleaned['cleaned_text'] = bible_df_cleaned['cleaned'].apply(' '.join)




In [None]:
# Define a function to calculate sentiment score for a given verse
def get_sentiment_score(verse):
    """Return the 'compound' polarity score that the `sia` analyzer gives to `verse`."""
    polarity = sia.polarity_scores(verse)
    return polarity['compound']

# Score the ORIGINAL verse text rather than the 'cleaned_text' column.
# The cleaning removes stopwords (which include negations such as "not" and
# "no"), punctuation and casing, so scoring the cleaned text can flip or
# flatten the polarity of a verse.
bible_df_cleaned['sentiment_score'] = bible_df_cleaned['text'].apply(get_sentiment_score)

# Categorize sentiment based on the sign of the compound score
def _sentiment_label(score):
    """Map a compound score to 'positive', 'negative' or 'neutral' (score == 0)."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

bible_df_cleaned['sentiment'] = bible_df_cleaned['sentiment_score'].apply(_sentiment_label)

# Display the DataFrame with sentiment scores and categories
print(bible_df_cleaned[['text', 'sentiment_score', 'sentiment']])


In [None]:
# Separate frame for the plots: the three sentiment-related columns under
# presentation-friendly names
visualization_df = bible_df_cleaned[['text', 'sentiment_score', 'sentiment']].rename(
    columns={
        'text': 'Verse',
        'sentiment_score': 'Sentiment Score',
        'sentiment': 'Sentiment',
    }
)

visualization_df


In [None]:
# Verses whose sentiment category is 'negative'
is_negative = visualization_df['Sentiment'].eq('negative')
negative_verses = visualization_df.loc[is_negative]
negative_verses

# exemple words negative : evil, wicked, hate, kill, death, sin, devil, hell, etc.

In [None]:
import matplotlib.pyplot as plt

# Number of verses in each sentiment category
sentiment_counts = visualization_df['Sentiment'].value_counts()

# One bar per category
fig, ax = plt.subplots()
ax.bar(sentiment_counts.index, sentiment_counts.values)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Sentiment Analysis')
plt.show()


In [None]:
# Add the book identifier so the sentiments can be grouped per book
visualization_df['book_id'] = bible_df_cleaned['book_id']
visualization_df

In [None]:
import matplotlib.pyplot as plt

# Verses per (book, sentiment category); categories absent from a book get 0
sentiment_counts = (
    visualization_df
    .groupby('book_id')['Sentiment']
    .value_counts()
    .unstack()
    .fillna(0)
)

# A book is 'positive' only when it has strictly more positive than negative verses
more_positive = sentiment_counts['positive'].gt(sentiment_counts['negative'])
sentiment_counts['Book Sentiment'] = more_positive.map({True: 'positive', False: 'negative'})

# Stacked bars: positive counts at the bottom, negative counts on top
positive_counts = sentiment_counts['positive']
negative_counts = sentiment_counts['negative']
plt.figure(figsize=(10, 6))
plt.bar(sentiment_counts.index, positive_counts, label='Positive')
plt.bar(sentiment_counts.index, negative_counts, bottom=positive_counts, label='Negative')
plt.xlabel('Book ID')
plt.ylabel('Sentiment Count')
plt.title('Book Sentiments')
plt.legend()
plt.show()


In [None]:
# Shows the sentiment of each book of the Bible (positive or negative):
# a book with more positive than negative verses is positive, otherwise negative

import matplotlib.pyplot as plt

# Verses per (book, sentiment category); categories absent from a book get 0
sentiment_counts = (
    visualization_df
    .groupby('book_id')['Sentiment']
    .value_counts()
    .unstack()
    .fillna(0)
)

# Label each book from the comparison of its positive and negative counts
more_positive = sentiment_counts['positive'].gt(sentiment_counts['negative'])
sentiment_counts['Book Sentiment'] = more_positive.map({True: 'positive', False: 'negative'})

# Green bars for positive books, red bars for the others
colors = sentiment_counts['Book Sentiment'].map({'positive': 'green', 'negative': 'red'}).tolist()

# Bar height is the number of positive verses of the book
plt.figure(figsize=(10, 6))
plt.bar(sentiment_counts.index, sentiment_counts['positive'], color=colors)
plt.xlabel('Book ID')
plt.ylabel('Sentiment Count')
plt.title('Book Sentiments')
plt.show()
