In [None]:
import pandas as pd
import re

# Import NLTK and download the corpora this notebook relies on
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # English stop-word list read by remove_stopwords
nltk.download('wordnet')    # lexical data needed by nltk.WordNetLemmatizer in lemmatize

In [None]:
# Load the Bible CSV file into a DataFrame
bible_df = pd.read_csv(filepath_or_buffer='bible.csv')

In [None]:
# Give the one-letter CSV columns descriptive names.
# rename() returns a new frame, so nothing is mutated in place and the cell
# can be re-run safely (labels that are already renamed are simply ignored).
bible_df = bible_df.rename(columns={
    'b': 'book_id',
    'c': 'chapter_id',
    'v': 'verse_id',
    't': 'text',
})

bible_df

In [None]:
# Now, clean !

# Use lambda to apply the function to each row of the DataFrame
def remove_all_punctuation(text):
    """Replace every punctuation character in ``text`` with a single space.

    A character is treated as punctuation when it is neither a word character
    nor whitespace. The underscore is matched explicitly because the regex
    class ``\\w`` counts it as a word character, which would otherwise let it
    through.

    Each removed character becomes exactly one space, so the length of the
    string is unchanged.
    """
    return re.sub(r'[^\w\s]|_', ' ', text)

def remove_partial_punctuation(text):
    """Replace punctuation with spaces, keeping the sentence marks ``.``, ``!`` and ``?``.

    Word characters and whitespace are left untouched. The underscore is
    matched explicitly because the regex class ``\\w`` counts it as a word
    character, which would otherwise let it through.
    """
    return re.sub(r'[^\w\s.!?]|_', ' ', text)

def carriage_return(text):
    """Replace every line break in ``text`` with a single space.

    Handles Unix (``\\n``), Windows (``\\r\\n``) and bare carriage-return
    (``\\r``) line endings. ``\\r\\n`` is listed first in the pattern so that a
    Windows line break yields one space rather than two.
    """
    return re.sub(r'\r\n|\r|\n', ' ', text)

def remove_double_space(text):
    """Collapse each run of two or more consecutive spaces into one space.

    Only the space character is matched; tabs and newlines are left as they are.
    """
    space_run = re.compile(r'[ ]{2,}')
    return space_run.sub(' ', text)

def remove_empty_strings(text):
    """Return a new list holding only the truthy items of ``text``.

    For a list of tokens this drops the empty strings and keeps the order.
    """
    return [item for item in text if item]

def remove_stopwords(text, stop_words=None):
    """Split ``text`` on single spaces and drop the stop words.

    Args:
        text: The string to filter.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining tokens as a list, in their original order and casing.
        The comparison is case-insensitive. Empty tokens produced by
        consecutive spaces are kept, exactly as ``tokenize`` would return them.
    """
    if stop_words is None:
        # Load the NLTK list once and keep it as a frozenset on the function:
        # calling stopwords.words() for every word re-reads the corpus and
        # scans a list, which dominates the running time on a large corpus.
        stop_words = getattr(remove_stopwords, '_english_stopwords', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stopwords = stop_words
    # Same single-space split rule as tokenize()
    return [word for word in text.split(' ') if word.lower() not in stop_words]

def tokenize(text):
    """Split ``text`` on single space characters and return the list of pieces.

    Consecutive spaces produce empty strings in the result.
    """
    tokens = text.split(sep=' ')
    return tokens

def join(text):
    """Concatenate the strings in ``text`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(text)

def lemmatize(text):
    """Return a list with the WordNet lemma of each token in ``text``.

    ``text`` is iterated token by token; each token is passed to
    ``nltk.WordNetLemmatizer().lemmatize`` with its default arguments.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]

def remove_numbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    numbers = '0123456789'
    # A translation table with only a deletion set removes all digits in one pass
    return text.translate(str.maketrans('', '', numbers))

def remove_short_words(word_list):
    """Return the items of ``word_list`` that are longer than one character, in order."""
    kept = []
    for candidate in word_list:
        if len(candidate) > 1:
            kept.append(candidate)
    return kept

In [None]:
# Build the cleaned token list for every row of the 'text' column.
# Line breaks, case, punctuation and digits are normalised BEFORE the stop-word
# filter: filtering the raw text first lets tokens such as "him," or
# "and\nthe" through, because they do not match any entry of the stop-word list.
bible_df['cleaned'] = (
    bible_df['text']
    .apply(carriage_return)
    .apply(str.lower)
    .apply(remove_all_punctuation)
    .apply(remove_numbers)
    .apply(remove_double_space)
    .apply(remove_stopwords)        # splits on single spaces, then filters
    .apply(remove_empty_strings)
    .apply(lemmatize)
    .apply(remove_short_words)
)

# Test cleaning efficiency printing the dataframe
bible_df

In [None]:
# book_id identifies the book of the Bible (1 = Genesis, 2 = Exodus, 3 = Leviticus, etc.)
bible_df.plot.scatter(x='book_id', y='chapter_id')


In [None]:
#ceci est un test 

# Statistiques du texte (avec visualisations)

In [None]:
# Derive per-row statistics from the cleaned token lists and from the raw text

# Number of whitespace-separated words once the tokens are joined back together
bible_df['word_count'] = bible_df['cleaned'].apply(
    lambda tokens: len(' '.join(tokens).split())
)
# Number of distinct tokens
bible_df['unique_words'] = bible_df['cleaned'].apply(
    lambda tokens: len(set(tokens))
)
# Mean token length; 0 for rows without tokens to avoid dividing by zero
bible_df['avg_word_length'] = bible_df['cleaned'].apply(
    lambda tokens: sum(map(len, tokens)) / len(tokens) if len(tokens) > 0 else 0
)
# Number of '.', '!' and '?' characters in the raw text
bible_df['sentence_count'] = bible_df['text'].apply(
    lambda raw: raw.count('.') + raw.count('!') + raw.count('?')
)

bible_df



In [None]:
# Number of cleaned words per book of the Bible

import matplotlib.pyplot as plt

# The frame has several rows per book_id, so the counts are summed per book
# first; plotting the raw rows would draw one overlapping bar per row on each
# book_id instead of the per-book total announced by the title.
words_per_book = bible_df.groupby('book_id')['word_count'].sum()

# Graph 1: Bar Chart
plt.figure(figsize=(10, 6))
plt.bar(words_per_book.index, words_per_book.values)
plt.xlabel('Book ID')
plt.ylabel('Word Count')
plt.title('Word Count by Book')
plt.show()



In [None]:
# Books with the highest total word count, limited to the top 20 for readability.
# The variable keeps its existing name although it holds 20 entries.
top_10_books = (
    bible_df
    .groupby('book_id')['word_count']
    .sum()
    .nlargest(20)
)
top_10_books.plot.bar(figsize=(10, 6))
plt.xlabel('Book ID')
plt.ylabel('Word Count')
plt.title('Top 20 Books by Word Count (66 at all)')
plt.show()


In [None]:
# Average word length per book of the Bible, longest first
plt.figure(figsize=(10, 6))

# Row-level frame sorted by average word length. Kept under its original name
# in case other cells use it -- TODO confirm and drop if unused.
bible_df_sorted = bible_df.sort_values('avg_word_length')

# One value per book: the mean of the row-level averages, sorted in descending
# order so the chart matches its title.
avg_length_by_book = (
    bible_df
    .groupby('book_id')['avg_word_length']
    .mean()
    .sort_values(ascending=False)
)

# String labels make the x axis categorical, which preserves the sorted order
# (numeric book ids would be placed back in numeric order).
plt.bar(avg_length_by_book.index.astype(str), avg_length_by_book.values)
plt.xticks(rotation=90)
plt.xlabel('Book ID')
plt.ylabel('Average Word Length')
plt.title('Average Word Length by Book (Descending Order)')
plt.show()


## 2. Nettoyage des données

Définition de plusieurs fonctions de nettoyage des données, puis application de ces fonctions pour créer différentes colonnes du dataframe, chacune correspondant à un niveau de nettoyage différent qui sera utile pour les prochaines étapes. 

In [None]:
import re

# Use lambda to apply the function to each row of the DataFrame
def remove_all_punctuation(text):
    """Replace every punctuation character in ``text`` with a single space.

    A character is treated as punctuation when it is neither a word character
    nor whitespace. The underscore is matched explicitly because the regex
    class ``\\w`` counts it as a word character, which would otherwise let it
    through.

    Each removed character becomes exactly one space, so the length of the
    string is unchanged.
    """
    return re.sub(r'[^\w\s]|_', ' ', text)

def remove_partial_punctuation(text):
    """Replace punctuation with spaces, keeping the sentence marks ``.``, ``!`` and ``?``.

    Word characters and whitespace are left untouched. The underscore is
    matched explicitly because the regex class ``\\w`` counts it as a word
    character, which would otherwise let it through.
    """
    return re.sub(r'[^\w\s.!?]|_', ' ', text)

def carriage_return(text):
    """Replace every line break in ``text`` with a single space.

    Handles Unix (``\\n``), Windows (``\\r\\n``) and bare carriage-return
    (``\\r``) line endings. ``\\r\\n`` is listed first in the pattern so that a
    Windows line break yields one space rather than two.
    """
    return re.sub(r'\r\n|\r|\n', ' ', text)

def remove_double_space(text):
    """Collapse each run of two or more consecutive spaces into one space.

    Only the space character is matched; tabs and newlines are left as they are.
    """
    space_run = re.compile(r'[ ]{2,}')
    return space_run.sub(' ', text)

def remove_empty_strings(text):
    """Return a new list holding only the truthy items of ``text``.

    For a list of tokens this drops the empty strings and keeps the order.
    """
    return [item for item in text if item]

def remove_stopwords(text, stop_words=None):
    """Split ``text`` on single spaces and drop the stop words.

    Args:
        text: The string to filter.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining tokens as a list, in their original order and casing.
        The comparison is case-insensitive. Empty tokens produced by
        consecutive spaces are kept, exactly as ``tokenize`` would return them.
    """
    if stop_words is None:
        # Load the NLTK list once and keep it as a frozenset on the function:
        # calling stopwords.words() for every word re-reads the corpus and
        # scans a list, which dominates the running time on a large corpus.
        stop_words = getattr(remove_stopwords, '_english_stopwords', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stopwords = stop_words
    # Same single-space split rule as tokenize()
    return [word for word in text.split(' ') if word.lower() not in stop_words]

def tokenize(text):
    """Split ``text`` on single space characters and return the list of pieces.

    Consecutive spaces produce empty strings in the result.
    """
    tokens = text.split(sep=' ')
    return tokens

def join(text):
    """Concatenate the strings in ``text`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(text)

def lemmatize(text):
    """Return a list with the WordNet lemma of each token in ``text``.

    ``text`` is iterated token by token; each token is passed to
    ``nltk.WordNetLemmatizer().lemmatize`` with its default arguments.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]

def remove_numbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    numbers = '0123456789'
    # A translation table with only a deletion set removes all digits in one pass
    return text.translate(str.maketrans('', '', numbers))

def remove_single_letter_words(text):
    """Return the items of ``text`` whose length is greater than one, in order."""
    return list(filter(lambda token: len(token) > 1, text))

# Work on a copy so the frame built in the earlier cells is left untouched
bible_df_cleaned = bible_df.copy()

# Text without stopwords. Line breaks are replaced first so that words
# separated only by a newline are split apart before the stop-word lookup.
bible_df_cleaned['text_without_stopwords'] = (
    bible_df_cleaned['text']
    .apply(carriage_return)
    .apply(remove_stopwords)
    .apply(join)
    .apply(remove_double_space)
)

# Partially cleaned text (sentence marks . ! ? are kept), lower-cased, for later
bible_df_cleaned['text_partially_cleaned'] = (
    bible_df_cleaned['text_without_stopwords']
    .apply(remove_partial_punctuation)
    .apply(remove_double_space)
    .apply(str.lower)
)

# Advanced cleaning: replaces the 'text' column with a list of lemmatized tokens.
# remove_stopwords does the tokenizing here (it splits on single spaces) and
# also drops the stop words that were hidden behind punctuation during the
# first pass, e.g. "him," only matches the stop-word list once the comma is gone.
bible_df_cleaned['text'] = (
    bible_df_cleaned['text_partially_cleaned']
    .apply(remove_all_punctuation)
    .apply(remove_numbers)
    .apply(remove_double_space)
    .apply(remove_stopwords)
    .apply(remove_single_letter_words)
    .apply(remove_empty_strings)
    .apply(lemmatize)
)

# Test cleaning efficiency printing the dataframe
bible_df_cleaned


In [None]:
# Test cleaning efficiency printing the first row
# Showing the three columns side by side lets the cleaning stages be compared

# .loc[row_label, column] reads the cell in one step instead of chained indexing
print(f"Cleaned text: \n{bible_df_cleaned.loc[0, 'text']}")
print(f"Partially cleaned text (without punctuation except points):\n{bible_df_cleaned.loc[0, 'text_partially_cleaned']}")
print(f"Text without stopwords and carriage return: \n{bible_df_cleaned.loc[0, 'text_without_stopwords']}")


## 3. Analyse des données

### Analyse des mots les plus fréquents

In [None]:
from collections import Counter


def _repeated_long_words(tokens):
    """Return ``(word, count)`` pairs for words longer than 3 characters seen more than once.

    ``Counter.most_common()`` already yields the pairs from most to least
    frequent, so no further sorting is needed.
    """
    return [
        (word, count)
        for word, count in Counter(tokens).most_common()
        if len(word) > 3 and count > 1
    ]


# Compute the pairs for every row in one pass. Assigning the whole column at
# once keeps it aligned with the frame's own index, whereas writing with
# .loc[position] inside an enumerate loop only works for a default 0..n-1 index.
repeated_words_per_row = bible_df_cleaned['text'].apply(_repeated_long_words)
bible_df_cleaned['most_common_words'] = repeated_words_per_row.apply(str)

# Alphabetical list of every word that is repeated within at least one row
combined_common_words = sorted(
    {word for pairs in repeated_words_per_row for word, _ in pairs}
)
print(combined_common_words)
print('Unique repeated words count: ' + str(len(combined_common_words)))




### Création d'un dataframe contenant les mots les plus fréquents
Ce n'était pas forcément demandé, et cela rejoint un peu le sac de mots (bag of words, BOW) qui sera fait plus tard d'une manière plus simple, mais cela permet de visualiser les mots les plus fréquents et de les comparer entre eux.

In [None]:
from collections import Counter

# Count the words of every row once. Each dict becomes one record, so the
# table is built with a single DataFrame call instead of one pd.concat per
# row, which re-copies the growing frame on every iteration.
row_word_counts = [dict(Counter(tokens)) for tokens in bible_df_cleaned['text']]

# Records give one line per text row; transpose so that the index holds the
# words and each column holds one text row.
word_df = pd.DataFrame(row_word_counts).T
word_df.columns = [f'Phrase {position + 1}' for position in range(word_df.shape[1])]

# Fill NaN values with 0 (a word that does not occur in a given row)
word_df = word_df.fillna(0)

# Reorder the COLUMNS so that the rows of text with the highest total word
# count come first.
# NOTE(review): if the goal is to list the most frequent words first, the rows
# should be sorted by word_df.sum(axis=1) instead -- confirm the intent.
word_df = word_df.reindex(word_df.sum().sort_values(ascending=False).index, axis=1)

# Print the word dataframe
word_df
