<a href="https://colab.research.google.com/github/mkane968/Extracted-Features/blob/master/Text_Sectioning_and_Disaggregation_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Text Files and Add to DataFrame

In [None]:
#Mount Google Drive
#Mounts at the path '/content/drive'
#NOTE(review): none of the cells shown here read from that path; the texts come from files.upload() below
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Select all files to upload
from google.colab import files

#The result is later passed to DataFrame.from_dict(..., orient='index'), i.e. it is used as a
#dict-like object whose keys become row labels
#presumably keys are file names and values are the raw file contents - confirm against the Colab docs
uploaded = files.upload()

In [None]:
#Put files into dataframe
import pandas as pd

#orient='index' makes each key of `uploaded` a row label and its value that row's data
books = pd.DataFrame.from_dict(uploaded, orient='index')
#Bare expression: displays the dataframe as the cell output
books

In [None]:
#Move the row labels out of the index into an ordinary column, then name the two columns
books = books.reset_index().set_axis(["Title", "Text"], axis=1)
books

#Clean Titles and Text 

In [None]:
#Change data type to string
#Casting bytes values with astype(str) can store their Python repr ("b'...\\n...'", with
#escaped newlines and an escaped BOM) instead of the text, depending on the pandas version.
#Decode bytes explicitly instead; 'utf-8-sig' also drops a leading byte-order mark, and
#errors='replace' keeps a file with a few invalid bytes from aborting the whole run.
def _as_text(value):
    """Return value as str, decoding bytes-like values as UTF-8 (BOM-aware)."""
    if isinstance(value, (bytes, bytearray)):
        return value.decode('utf-8-sig', errors='replace')
    return str(value)

#Apply column by column so every cell (titles and texts) ends up as str
books = books.apply(lambda column: column.map(_as_text))
books

In [None]:
#Remove .txt from titles
#The dot is escaped and the match is anchored to the end of the title, so only a literal
#trailing ".txt" is removed (an unescaped "." matches any character, e.g. the "stxt" in "bestxt").
#It is replaced with an empty string rather than a space so titles do not end in a blank.
books['Title'] = books['Title'].str.replace(r'\.txt$', '', regex=True)

In [None]:
#Remove newline characters
#Collapse every run of whitespace into a single space. The run may also contain the
#two-character escape sequences backslash-n, backslash-r and backslash-t, which show up
#when the text was produced from the repr of a bytes object; handling all three inside one
#repeated group avoids leftover "\r" fragments and runs of several spaces.
books['Text'] = books['Text'].str.replace(r'(?:\s|\\[nrt])+', ' ', regex=True)
books

In [None]:
#Remove BOM characters
import codecs
#The Text column already holds str values here, so Series.str.decode() has nothing to decode,
#and a Series has no .encode() method (the old second line raised AttributeError).
#Strip the byte-order mark from the strings directly instead. Two spellings are covered:
#a real U+FEFF character, and the escaped form "\xef\xbb\xbf" that is left behind when the
#original bytes were turned into text via their repr.
books['Text'] = books['Text'].str.replace(r'\ufeff|\\xef\\xbb\\xbf', '', regex=True)

In [None]:
#Remove punctuation (optional)
#Deletes every run of characters that are neither word characters (\w: letters, digits, underscore)
#nor whitespace; underscores and digits are therefore kept
books['Text'] = books['Text'].str.replace(r'[^\w\s]+', '', regex = True)

#Remove numbers (optional)
#NOTE(review): no number-removal code follows this comment, so digits stay in the text
books

In [None]:
#Spot-check the cleaned text of the first book (shown as the cell output)
books['Text'].iloc[0]

In [None]:
#Define cleaned dataframe
#Take an independent copy: a plain assignment would only create a second name for the same
#object, so columns added to books_cleaned later would silently appear in books as well
books_cleaned = books.copy()

#Chunk Texts by Chapter

In [None]:
#Count number of chapters in each text
#str.count counts every match of the pattern 'chapter' in each string; the match is
#case-sensitive and not limited to whole words, so "Chapter"/"CHAPTER" are not counted while
#"chapter" inside running prose is
chapter_counts = books_cleaned['Text'].str.count('chapter')

#Append chapter counts to dataframe
books_cleaned["Chapters"] = chapter_counts
books_cleaned

In [None]:
#Split every text at each "chapter" marker: one row per book (labelled by its title),
#one column per piece
new = books_cleaned.set_index('Title')['Text'].str.split("chapter", expand=True)
new

In [None]:
#Flatten dataframe so each chapter is on own row, designated by book and chapter
chapters_df = new.stack().reset_index()
chapters_df.columns = ["Book", "Chapter", "Text"]
#Books with fewer pieces than the longest one have empty padding cells in `new`.
#Actually drop any such rows: the bare attribute access `chapters_df.dropna` never called the
#method, and dropna() returns a new frame that must be assigned. Renumber the rows afterwards
#so the index stays consecutive.
chapters_df = chapters_df.dropna(subset=["Text"]).reset_index(drop=True)
chapters_df

#Disaggregate Words in Each Chapter

In [None]:
#Alphabetize words in each chapter string
def _sorted_words(chapter_text):
    """Return the whitespace-separated words of chapter_text in sorted order, joined by spaces."""
    words = chapter_text.split()
    words.sort()
    return ' '.join(words)

#Replace each chapter's running text with its sorted word list
chapters_df['Text'] = chapters_df['Text'].map(_sorted_words)
chapters_df

In [None]:
#Combine book and chapter labels into one column
chapters_df['Book + Chapter'] = chapters_df['Book'].astype(str) + '_Chapter_' + chapters_df['Chapter'].astype(str)

#Remove individual book and chapter columns
#drop() returns a new dataframe, so the result has to be assigned back to take effect
chapters_df = chapters_df.drop(columns=['Book', 'Chapter'])

#Reindex so book + chapter is first column
column_names = ["Book + Chapter", "Text"]
chapters_df = chapters_df.reindex(columns=column_names)
chapters_df

#Download CSV with Disaggregated Chapter Text

In [None]:
from google.colab import files

#Write the table to a CSV file in the runtime's working directory, then hand that file to
#files.download
#encoding 'utf-8-sig' writes a byte-order mark at the start of the file
#No index argument is passed, so to_csv also writes the row index as the first column
chapters_df.to_csv('bag_of_words_output.csv', encoding = 'utf-8-sig') 
files.download('bag_of_words_output.csv')