In [1]:
# Read the whole novel into memory as a list of lines (line endings are kept),
# then report how many lines were read.
with open(
    file='/kaggle/input/melville-moby-dick/melville-moby_dick.txt',
    mode='r',
    encoding='utf-8',
) as input_fp:
    lines = list(input_fp)
print(len(lines))

22423


In [2]:
import pandas as pd
# Split the book into one document per chapter and collect simple length statistics.
# A chapter begins at every line that starts with 'CHAPTER'; anything before the
# first such line is not part of any document.
starts = [index for index, line in enumerate(lines) if line.startswith('CHAPTER')]

# Each chapter ends where the next one begins; the final chapter runs to the end
# of the file. Using len(lines) here (rather than -1) matters: lines[start:-1]
# would silently drop the last line of the book.
ends = starts[1:] + [len(lines)]
documents = [' '.join(lines[start:end]) for start, end in zip(starts, ends)]

document_df = pd.DataFrame(data={'chapter': documents})
# Whitespace-delimited word count per chapter.
document_df['length (word)'] = document_df['chapter'].str.split().str.len()
# Character count per chapter, whitespace and line breaks included.
document_df['length (char)'] = document_df['chapter'].str.len()
# Average characters per word (whitespace included in the numerator).
document_df['ratio'] = document_df['length (char)'] / document_df['length (word)']
document_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   chapter        135 non-null    object 
 1   length (word)  135 non-null    int64  
 2   length (char)  135 non-null    int64  
 3   ratio          135 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 4.3+ KB


In [3]:
from plotly.express import scatter
# Chapter length in words vs. characters on log-log axes, with box plots on both
# margins; points are labelled and coloured by chapter position in document_df.
scatter(
    data_frame=document_df,
    x='length (word)',
    y='length (char)',
    marginal_x='box',
    marginal_y='box',
    log_x=True,
    log_y=True,
    hover_name=document_df.index.tolist(),
    color=document_df.index.tolist(),
    height=800,
)

Our chapters vary widely in length however we measure them, and ranking them by word count or by character count gives much the same order.

In [4]:
from plotly.express import histogram
# Distribution of the per-chapter characters-per-word ratio.
histogram(
    data_frame=document_df,
    x='ratio',
    nbins=65,
)

In [5]:
# Mean and median of the per-chapter characters-per-word ratio, displayed as a tuple.
ratio_by_chapter = document_df['ratio']
(ratio_by_chapter.mean(), ratio_by_chapter.median())

(5.873533235336259, 5.865627319970304)

Mean characters per word (whitespace included, so slightly more than the true mean token length) doesn't vary much from chapter to chapter; most ratios cluster pretty tightly around the mean.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Build one word-count Series per chapter. Each chapter gets its own vectorizer,
# fitted on that chapter's lines, with English stop words removed and text
# lower-cased; only single words (unigrams) are counted.
counts = []
for document in documents:
    model = CountVectorizer(
        encoding='utf-8',
        stop_words='english',
        min_df=1,
        max_df=1.0,
        lowercase=True,
        ngram_range=(1, 1),
    )
    count_result = model.fit_transform(raw_documents=document.split('\n'))
    names = model.get_feature_names_out().tolist()
    # Collapse the per-line counts into one total per word for the chapter.
    occurences = count_result.toarray().sum(axis=0).tolist()
    counts.append(pd.Series(data=dict(zip(names, occurences))))

# Rows are chapters, columns are words; a word absent from a chapter counts as 0.
count_df = pd.DataFrame(data=counts).fillna(value=0)
count_df.head()

Unnamed: 0,abandon,abominate,absent,account,act,activity,affghanistan,afternoon,ago,ah,...,upheaved,veer,ventilated,voiced,voicelessly,vortex,weeps,whelmings,wink,writhed
0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Total each word's count over all chapters, take the twenty most frequent words,
# then drop ten hand-picked ones so that the ten remaining are mostly nouns.
total_df = count_df.sum(axis=0).to_frame().reset_index()
top_twenty = total_df.sort_values(by=0, ascending=False).head(n=20)
excluded = {'like', 'man', 'ye', 'said', 'way', 'thou', 'did', 'old', 'long', 'great'}
words = [word for word in top_twenty['index'].tolist() if word not in excluded]
words

['whale',
 'ahab',
 'ship',
 'sea',
 'head',
 'time',
 'boat',
 'captain',
 'white',
 'stubb']

In [8]:
from plotly.express import scatter
# Per-chapter counts of the selected top words, on a log-scaled count axis.
scatter(
    data_frame=count_df,
    y=words,
    log_y=True,
)

Because top word counts vary so much from chapter to chapter, this looks more like noise than curves. Maybe plotting fewer words as lines will give us actual curves?

In [9]:
from plotly.express import line
# Narrow the comparison to the two headline terms and draw them as lines
# across chapters.
words = ['whale', 'ahab']
line(
    data_frame=count_df,
    y=words,
)

It's a story about Ahab and a whale, and they take turns being the most important thing in a chapter.

In [10]:
# Same line plot for the two archaic pronouns.
words = ['ye', 'thou']
line(
    data_frame=count_df,
    y=words,
)

In [11]:
from plotly.express import scatter
# For every chapter, find its most frequent word and that word's count.
# Both Series are unnamed, so the concatenated frame has columns 0 (word) and 1 (count).
top_word = count_df.idxmax(axis=1)
top_count = count_df.max(axis=1)
top_df = pd.concat(objs=[top_word, top_count], axis=1)

# Plot the count per chapter on a log axis, labelling each point with its word;
# the markers are shrunk to size 1 so the text labels carry the plot.
scatter(
    data_frame=top_df,
    x=top_df.index,
    y=1,
    text=0,
    log_y=True,
).update_traces(marker={'size': 1})

If we just look at the top word from each chapter as a function of the chapter, we get something close to what we were hoping to see.