In [4]:
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import pandas as pd 
import numpy as np


In [7]:
# Load the per-text style metrics table; the path is relative to the
# notebook's working directory.
metrics_csv_path = '../metrics.csv'
df = pd.read_csv(metrics_csv_path)

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

FileNotFoundError: [Errno 2] File b'../metrics.csv' does not exist: b'../metrics.csv'

## Distribution of Style Metrics

In [6]:
# Overlaid distribution plot of the 'sttr' and 'hapax_legomenon' columns.
hist_data = [df[column] for column in ('sttr', 'hapax_legomenon')]
group_labels = ['Standardized Type-Token<br>Ratio', 'Hapax Legomenon']
trace_colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.01,
    colors=trace_colors,
)
f.update_layout(title_text='Measures of Vocabulary Richness')
f.show()

# Static PNG export at 2x scale.
# NOTE(review): the other export cells in this notebook write under 'plots/';
# confirm that writing this file to the working directory is intentional.
pio.write_image(f, 'sttr_hapax.png', format='png', scale=2)

ValueError: 
The orca executable is required in order to export figures as static images,
but the executable that was found at '/home/michaeleby1/anaconda3/envs/gutenberg/bin/orca'
does not seem to be a valid plotly orca executable. Please refer to the end of
this message for details on what went wrong.

If you haven't installed orca yet, you can do so using conda as follows:

    $ conda install -c plotly plotly-orca

Alternatively, see other installation methods in the orca project README at
https://github.com/plotly/orca

After installation is complete, no further configuration should be needed.

If you have installed orca, then for some reason plotly.py was unable to
locate it. In this case, set the `plotly.io.orca.config.executable`
property to the full path of your orca executable. For example:

    >>> plotly.io.orca.config.executable = '/path/to/orca'

After updating this executable property, try the export operation again.
If it is successful then you may want to save this configuration so that it
will be applied automatically in future sessions. You can do this as follows:

    >>> plotly.io.orca.config.save()

If you're still having trouble, feel free to ask for help on the forums at
https://community.plot.ly/c/api/python

Here is the error that was returned by the command
    $ /usr/bin/xvfb-run --auto-servernum --server-args -screen 0 640x480x24 +extension RANDR +extension GLX /home/michaeleby1/anaconda3/envs/gutenberg/bin/orca --help

[Return code: 127]

Note: When used on Linux, orca requires an X11 display server, but none was
detected. Please install Xvfb and configure plotly.py to run orca using Xvfb
as follows:

    >>> import plotly.io as pio
    >>> pio.orca.config.use_xvfb = True
    
You can save this configuration for use in future sessions as follows:

    >>> pio.orca.config.save() 
    
See https://www.x.org/releases/X11R7.6/doc/man/man1/Xvfb.1.xhtml
for more info on Xvfb


In [None]:
# Distribution plot of the 'yules_k' column.
hist_data = [df['yules_k']]
group_labels = ["Yule's K Characteristic"]
trace_colors = ['rgb(50, 50, 255)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=25,
    colors=trace_colors,
)
f.update_layout(title_text='')

# Fix the visible x-range; values beyond 5000 fall outside the view.
f.update_xaxes(range=[-100, 5000])
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/yules_k.png', format='png', scale=2)

In [None]:
# Distribution plot of the 'avg_sentence_length_word' column.
hist_data = [df['avg_sentence_length_word']]
group_labels = ['Average Sentence<br>Length in Words']
trace_colors = ['rgb(50, 50, 255)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.75,
    colors=trace_colors,
)
f.update_layout(title_text='Measures of Lexical Complexity')
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/length_words.png', format='png', scale=2)

In [None]:
# Distribution plot of the 'avg_sentence_length_chars' column.
hist_data = [df['avg_sentence_length_chars']]
group_labels = ['Average Sentence<br>Length in Characters']
trace_colors = ['rgb(255, 0, 125)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=5,
    colors=trace_colors,
)
f.update_layout(title_text='')
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/length_chars.png', format='png', scale=2)

In [None]:
# Distribution plot of the 'avg_syllables_per_word' column.
hist_data = [df['avg_syllables_per_word']]
group_labels = ['Average Number of<br>Syllables Per Word']
trace_colors = ['rgb(0, 200, 200)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.025,
    colors=trace_colors,
)
f.update_layout(title_text='Measures of Lexical Complexity')
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/syllables.png', format='png', scale=2)

In [None]:
# Overlaid distribution plot of the three part-of-speech ratio columns.
hist_data = [df[column] for column in ('noun_to_verb', 'noun_to_adj', 'verb_to_adv')]
group_labels = [
    'Noun to Verb Ratio',
    'Noun to Adjective Ratio',
    'Verb to Adverb Ratio',
]
trace_colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)', 'rgb(50, 50, 255)']

f = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.0075,
    colors=trace_colors,
)
f.update_layout(title_text='Part of Speech Ratios')
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/pos_ratios.png', format='png', scale=2)

## Style Metrics Across Time

In [None]:
# Build `df_time`: mean style metrics per 10-year bin, for years >= 1700.
# Steps: average every metric per value of the 'year' column, keep rows whose
# year is at least 1700, turn the year into a timestamp index, then average
# within 10-year resampling bins.
# Assumes 'year' holds calendar-year numbers (or numeric strings) -- TODO confirm.
time_metric_cols = [
    'sttr', 'hapax_legomenon', 'yules_k',
    'function_words', 'avg_sentence_length_word',
    'avg_sentence_length_chars', 'avg_syllables_per_word',
    'punctuation_sentence', 'shannon_entropy',
    'simpsons_d', 'average_nps', 'noun_to_verb',
    'noun_to_adj', 'verb_to_adv',
    'avg_dependency_distance',
]

yearly_means = df.groupby('year')[time_metric_cols].mean().reset_index()

# Compare years as numbers. Comparing strings against '1700' is lexicographic,
# so values such as '800' or '999' would wrongly pass the filter.
year_num = pd.to_numeric(yearly_means['year'], errors='coerce')
from_1700 = year_num >= 1700  # unparseable years are NaN and compare False

# `.assign` on the filtered frame returns a new object, so nothing is written
# into a slice of `yearly_means` (no SettingWithCopy hazard).
yearly_means = yearly_means.loc[from_1700].assign(
    year=pd.to_datetime(
        year_num[from_1700].astype(int).astype(str),
        format='%Y',
        errors='coerce',
    )
)

# Years that cannot be represented as timestamps were coerced to NaT above;
# drop them explicitly so the index handed to resample contains only valid dates.
yearly_means = yearly_means.dropna(subset=['year'])

# NOTE(review): newer pandas releases spell the year-end alias 'YE'; confirm
# '10y' against the installed pandas version.
df_time = yearly_means.set_index('year').resample('10y').mean()
df_time.head(10)

In [None]:
# Line chart of the three part-of-speech ratios from `df_time`, one trace each.
pos_traces = [
    # (df_time column, legend name, line colour)
    ('noun_to_verb', 'Noun to Verb Ratio', 'rgb(0, 0, 100)'),
    ('noun_to_adj', 'Noun to Adjective Ratio', 'rgb(0, 200, 200)'),
    ('verb_to_adv', 'Verb to Adverb Ratio', 'rgb(0, 100, 50)'),
]

f = go.Figure()
for metric, label, color in pos_traces:
    f.add_trace(
        go.Scatter(
            x=df_time.index,
            y=df_time[metric],
            line_color=color,
            name=label,
        )
    )

# Axis titles are deliberately blank; only the figure title is shown.
f.update_layout(
    title_text='Part of Speech Ratios',
    yaxis=dict(title_text=''),
    xaxis=dict(title_text=''),
    height=600,
    width=1100,
)
f.update_yaxes(side='left')
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/pos_time.png', format='png', scale=2)

In [None]:
# 2x2 grid of line charts, one `df_time` metric per panel.
# Each panel's subplot title doubles as its trace name; legends are hidden
# because the titles already identify the series.
panel_specs = [
    # (row, col, df_time column, panel title / trace name, line colour)
    (1, 1, 'avg_sentence_length_word',
     'Average Sentence Length in Words', 'rgb(10, 150, 250)'),
    (1, 2, 'avg_sentence_length_chars',
     'Average Sentence Length in Characters', 'rgb(100, 150, 20)'),
    (2, 1, 'avg_syllables_per_word',
     'Average Number of Syllables Per Word', 'rgb(0, 200, 200)'),
    (2, 2, 'sttr',
     'Standardized Type Token Ratio', 'rgb(0, 0, 100)'),
]

# make_subplots builds the figure itself, so no separate go.Figure() is needed.
# Titles are supplied in the same order as panel_specs (row by row).
f = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=tuple(title for _, _, _, title, _ in panel_specs),
)

for row, col, metric, title, color in panel_specs:
    f.add_trace(
        go.Scatter(
            x=df_time.index,
            y=df_time[metric],
            line_color=color,
            showlegend=False,
            name=title,
        ),
        row=row,
        col=col,
    )

f.update_layout(height=800, width=1400)

# Apply one shared date window to every panel's x-axis.
f.update_xaxes(range=['1730-01-01','2020-01-01'])
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/style_time_1.png', format='png', scale=2)

In [None]:
# 2x2 grid of line charts, one `df_time` metric per panel.
# Each panel's subplot title doubles as its trace name; legends are hidden
# because the titles already identify the series.
panel_specs = [
    # (row, col, df_time column, panel title / trace name, line colour)
    (1, 1, 'avg_dependency_distance', 'Average Dependency Distance', 'rgb(255, 0, 125)'),
    (1, 2, 'function_words', 'Function Words', 'rgb(0, 100, 50)'),
    (2, 1, 'hapax_legomenon', 'Hapax Legomenon', 'rgb(0, 200, 200)'),
    (2, 2, 'shannon_entropy', 'Entropy', 'rgb(0, 0, 100)'),
]

# make_subplots builds the figure itself, so no separate go.Figure() is needed.
# Titles are supplied in the same order as panel_specs (row by row).
f = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=tuple(title for _, _, _, title, _ in panel_specs),
)

for row, col, metric, title, color in panel_specs:
    f.add_trace(
        go.Scatter(
            x=df_time.index,
            y=df_time[metric],
            line_color=color,
            showlegend=False,
            name=title,
        ),
        row=row,
        col=col,
    )

f.update_layout(height=800, width=1400)

# Apply one shared date window to every panel's x-axis.
f.update_xaxes(range=['1730-01-01','2020-01-01'])
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/style_time_2.png', format='png', scale=2)

In [None]:
# Per-author subsets of the metrics table; `dickens_df` is displayed on its
# own further down the notebook, so both names are kept.
dickens_df = df.loc[df['author'].eq('Charles Dickens')]
stein_df = df.loc[df['author'].eq('Gertrude Stein')]

# Metrics compared between the two authors.
comparison_cols = [
    'sttr',
    'noun_to_verb',
    'noun_to_adj',
    'shannon_entropy',
    'avg_syllables_per_word',
    'avg_dependency_distance',
]

# One row per author holding the mean of each comparison metric.
dickens_stein_df = (
    pd.concat([dickens_df, stein_df])
    .groupby('author')[comparison_cols]
    .mean()
    .reset_index()
)

In [None]:
# Display the per-author mean metrics table (one row per author).
dickens_stein_df

In [None]:
# Grouped bar chart comparing the two authors on three metrics.
# One go.Bar trace per metric, with the authors along the x-axis.
bar_specs = [
    # (dickens_stein_df column, legend name, bar colour)
    ('sttr', 'Standardized Type-Token<BR>Ratio', 'rgb(0, 0, 100)'),
    ('noun_to_verb', 'Noun to Verb Ratio', 'rgb(0, 200, 200)'),
    ('noun_to_adj', 'Noun to Adjective Ratio', 'rgb(50, 50, 255)'),
]

# Build the figure once, directly from its traces; an empty go.Figure()
# created beforehand would simply be overwritten.
f = go.Figure(data=[
    go.Bar(
        x=dickens_stein_df['author'],
        y=dickens_stein_df[metric],
        name=label,
        marker_color=color,
    )
    for metric, label, color in bar_specs
])

f.update_layout(
    height=600,
    width=1100,
    xaxis_tickfont_size=18,
    title_text='Charles Dickens and Gertrude Stein',
)
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/dickens_stein_1.png', format='png', scale=2)

In [None]:
# Grouped bar chart comparing the two authors on two further metrics.
# One go.Bar trace per metric, with the authors along the x-axis.
bar_specs = [
    # (dickens_stein_df column, legend name, bar colour)
    ('avg_dependency_distance', 'Average Dependency<br>Distance', 'rgb(255, 0, 125)'),
    ('avg_syllables_per_word', 'Average Number of<br>Syllables Per Word', 'rgb(0, 0, 100)'),
]

# Build the figure once, directly from its traces; an empty go.Figure()
# created beforehand would simply be overwritten.
f = go.Figure(data=[
    go.Bar(
        x=dickens_stein_df['author'],
        y=dickens_stein_df[metric],
        name=label,
        showlegend=True,
        marker_color=color,
    )
    for metric, label, color in bar_specs
])

f.update_layout(
    height=600,
    width=1100,
    xaxis_tickfont_size=18,
)
f.show()

# Static PNG export at 2x scale.
pio.write_image(f, 'plots/dickens_stein_2.png', format='png', scale=2)

In [None]:
# Display the rows of `df` whose author is 'Charles Dickens'.
dickens_df