In [658]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, paired_euclidean_distances
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from plotly.graph_objs import *

# Load the per-book style metrics and discard every row with a missing value.
df = pd.read_csv('metrics.csv')
# drop=True discards the old index directly instead of materialising it as an
# 'index' column and dropping it afterwards (which would also remove a genuine
# data column if the CSV ever contained one named 'index').
df = df.dropna().reset_index(drop=True)
print(df.shape)
df.head()

(19176, 19)


Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.624214,0.452214,528.796852,0.39884,20.889773,111.791794,1.735272,4.017759,10.845536,0.998584,232.7,0.640382,0.79291,0.717273,2.338342
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.585125,0.415,378.878385,0.39712,14.213115,77.20765,1.752356,2.763206,10.506952,0.997945,232.130435,0.603311,0.811363,0.718462,2.134627
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.62,0.444,1298.919399,0.428477,27.3003,146.405405,1.720949,4.384384,10.003001,0.99836,222.8,0.656081,0.777569,0.638938,2.403897
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.627,0.486,1703.701946,0.324357,8.565289,49.239669,1.79413,2.67438,9.511655,0.996898,221.166667,0.586852,0.823569,0.692679,2.098842
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.686821,0.54175,269.095482,0.372508,21.56223,122.567266,1.868661,4.204856,11.781284,0.998751,241.057143,0.716877,0.837261,0.751203,2.331626


## Recommendation System

In [471]:
## min-max scaling all style metrics; each column is rescaled to the [0, 1] range
## (MinMaxScaler does NOT standardise: columns do not end up with mean 0 / std 1)
# Columns from position 4 onward are the metrics; the first four columns are
# carried over unscaled.
metrics = df.iloc[:, 4:].values
scaler = preprocessing.MinMaxScaler()
metrics_scaled = scaler.fit_transform(metrics)
# Reuse df's index explicitly so the column-wise concat below always lines up
# row-for-row instead of relying on both frames having the same default index.
df_scaled = pd.DataFrame(metrics_scaled, columns=df.columns[4:], index=df.index)
df_scaled = pd.concat([df.iloc[:, :4], df_scaled], axis=1)
df_scaled.head()

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.580262,0.421767,0.050518,0.78476,0.255357,0.229973,0.450676,0.074117,0.70858,0.99693,0.678248,0.404288,0.517212,0.502914,0.32343
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.536514,0.382376,0.035444,0.781371,0.164938,0.150698,0.461159,0.050926,0.673516,0.995543,0.676566,0.3428,0.560263,0.505006,0.247857
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.575546,0.413072,0.127953,0.843135,0.342173,0.309316,0.441888,0.080895,0.621327,0.996444,0.649016,0.430328,0.481421,0.365188,0.347749
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.58338,0.457528,0.168654,0.638047,0.088452,0.086589,0.486792,0.049284,0.570443,0.993273,0.644193,0.315501,0.588741,0.459674,0.234581
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.650332,0.516539,0.024405,0.732892,0.264464,0.254673,0.532525,0.077576,0.805486,0.997291,0.702925,0.531166,0.620684,0.56257,0.320938


In [None]:
# Scratch cell: a nested list holding the names of the style-metric columns.
# The expression is not assigned to anything, so it has no effect on later cells.
[['sttr', 'hapax_legomenon', 'yules_k', 'function_words', 
  'avg_sentence_length_word', 'avg_sentence_length_chars', 
  'avg_syllables_per_word', 'punctuation_sentence', 
  'shannon_entropy', 'simpsons_d', 'average_nps', 'noun_to_verb', 
  'noun_to_adj', 'verb_to_adv', 'avg_dependency_distance']]

In [716]:
# Feature matrix for the similarity / embedding steps: the scaled metric
# columns listed below, selected by name.
df_scaled_subset = df_scaled.loc[:, [
    'sttr',
    'hapax_legomenon',
    'yules_k',
    'avg_sentence_length_word',
    'avg_sentence_length_chars',
    'avg_syllables_per_word',
    'punctuation_sentence',
    'shannon_entropy',
    'simpsons_d',
    'average_nps',
    'noun_to_verb',
    'noun_to_adj',
    'verb_to_adv',
    'avg_dependency_distance',
]]

In [717]:
## pairwise cosine similarity between the scaled style vectors of all books
# With a single argument, cosine_similarity compares the rows of the input
# against themselves, giving a square (n_rows x n_rows) matrix.
cosine_sim = cosine_similarity(df_scaled_subset)

In [590]:
# NOTE(review): this cell reads `df_TSNE`, which is only created in a cell further
# down the notebook, so it raises NameError on Restart & Run All. It also rebinds
# `cosine_sim`, replacing the matrix computed from the scaled metrics in the
# previous cell with one computed from the columns of `df_TSNE` at position 4
# onward. Move this cell below the t-SNE cell or give the result its own name.
cosine_sim = cosine_similarity(df_TSNE.iloc[:, 4:], df_TSNE.iloc[:, 4:])

In [718]:
# Lookup table: book title -> row index of that book in `df`.
indices = pd.Series(df.index, index=df['title'])
# Titles are not guaranteed to be unique. A repeated title would make
# `indices[title]` return a Series instead of a single integer, so keep only
# the first row for each title.
indices = indices[~indices.index.duplicated(keep='first')]

In [719]:
# takes book title as input and outputs most similar books
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the books most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the module-level `indices` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of book
        ``i`` to every book. The default is bound when this function is
        defined, so re-run this cell after recomputing `cosine_sim`, or pass
        the matrix explicitly.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The 'author' and 'title' columns of `df` for the `top_n` most similar
        books, most similar first, never including the queried book itself.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # gets index of book that matches title
    idx = indices[title]
    # a title that occurs more than once yields a Series; use its first row
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # similarity of every book to the queried book
    scores = np.asarray(cosine_sim[idx])

    # positions ordered from most to least similar; the stable sort keeps
    # tied books in their original row order
    ranked = np.argsort(-scores, kind='stable')

    # drop the queried book explicitly rather than assuming it sorts first
    # (another book with an identical score could otherwise take its place)
    book_indices = ranked[ranked != idx][:top_n]

    # returns the top_n most similar books
    return df[['author', 'title']].iloc[book_indices]

In [721]:
# Example query: show the books ranked most similar to 'A Tale of Two Cities'.
get_recommendations('A Tale of Two Cities')

Unnamed: 0,author,title
14545,Eugene Sue,"The Mysteries of Paris, Volume 4 of 6"
8568,Florence L. Barclay,The Following of the Star
14554,Eugene Sue,"The Mysteries of Paris, Volume 1 of 6"
10813,E. D. E. N. Southworth,The Haunted Homestead
16790,R M Ballantyne,Under the Waves
16982,R.M. Ballantyne,Fighting the Flames
9843,C. N. Williamson,Lord Loveland Discovers America
5042,Elizabeth C. Gaskell,Round the Sofa; vol. 2
14197,John Galsworthy,"The Forsyte Saga, The Man Of Property"
7014,Leo Tolstoy,The Invaders and other Stories


In [650]:
# Inspect the raw (unscaled) metrics of every book attributed to Henry James.
df.loc[df['author'].eq('Henry James')]

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
1548,project_gutenberg/898-0.txt,Henry James,The Lesson of the Master,1888,0.655667,0.501667,717.461352,0.399856,14.27516,78.265852,1.786234,1.979058,10.516844,0.998238,234.2,0.550937,0.713527,0.689624,2.286546
1727,project_gutenberg/55078.txt,Henry James,The Birthplace,1903,0.690143,0.532714,967.990872,0.408503,16.171492,88.242019,1.815834,4.342242,10.648486,0.99869,221.444444,0.543304,0.70637,0.618783,2.370881
1801,project_gutenberg/60040-0.txt,Henry James,Notes on Novelists,1914,0.746167,0.594926,286.36149,0.449153,33.467397,189.370316,2.123273,3.870073,12.005247,0.999393,224.436709,0.669764,0.702581,0.585115,2.551686
2752,project_gutenberg/59462-0.txt,Henry James,Index of the Project Gutenberg Works of Henry ...,1909,0.44625,0.28975,589.199579,0.180617,7.759945,40.106996,1.51845,1.55144,9.135852,0.995703,256.166667,0.985984,0.977872,0.844828,2.063298
2835,project_gutenberg/8080.txt,Henry James,A Passionate Pilgrim,1609,0.721273,0.580182,936.506695,0.393779,13.746507,74.931637,1.771811,3.081337,11.159966,0.998903,239.212121,0.589021,0.703575,0.704545,2.275783
3191,project_gutenberg/7813.txt,Henry James,Madame de Mauves,1874,0.706692,0.558846,733.328413,0.411967,20.140051,111.281369,1.876931,3.479087,11.061817,0.998809,238.972973,0.577513,0.715057,0.693397,2.341077
3645,project_gutenberg/58938-0.txt,Henry James,English Hours,1905,0.748455,0.603515,503.794119,0.424029,27.375587,156.225352,1.980507,3.608523,12.029171,0.999388,224.784091,0.698254,0.691256,0.633211,2.500688
3824,project_gutenberg/7529.txt,Henry James,The Reverberator,1888,0.626619,0.477619,402.078106,0.402444,14.500929,79.302202,1.824111,3.578137,10.723042,0.997868,236.656716,0.549357,0.740854,0.690978,2.279344
3991,project_gutenberg/7433.txt,Henry James,The Awkward Age,1898,0.61698,0.46216,180.494834,0.397649,11.613459,63.084685,1.817694,3.682937,10.855918,0.998006,224.40678,0.529596,0.734526,0.640692,2.2591
4429,project_gutenberg/58471-0.txt,Henry James,Partial Portraits,1888,0.707415,0.557341,353.240743,0.435108,24.013571,135.075238,2.102387,3.569762,11.853174,0.999313,237.793103,0.691084,0.717982,0.648937,2.424613


In [339]:
# Project the scaled style metrics into three dimensions with t-SNE.
# t-SNE is stochastic, so random_state is fixed to make the embedding (and the
# plot / similarities derived from it) reproducible across kernel restarts.
metrics_embedded = TSNE(n_components=3, random_state=42).fit_transform(df_scaled_subset)

# Attach the three embedding coordinates to the first four columns of `df`.
# The explicit index keeps the column-wise concat aligned row-for-row with `df`.
df_TSNE = pd.concat([df.iloc[:, :4],
                     pd.DataFrame(metrics_embedded,
                                  columns=['TSNE_1',
                                           'TSNE_2',
                                           'TSNE_3'],
                                  index=df.index)], axis=1)

In [428]:
# Hover label per point: bold title, a blank line, the author, then the year.
# Every field is read from df_TSNE (the frame being plotted) so the labels
# cannot drift out of step with the coordinates.
hover_text = ('<b>' + df_TSNE['title'] + '</b><br>'
              + '<br>' + df_TSNE['author'] + '<br>'
              + df_TSNE['year'].astype('str'))

# 3-D scatter of the t-SNE embedding, one marker per book, coloured by the
# third embedding coordinate.
f = go.Figure(data=go.Scatter3d(x=df_TSNE.TSNE_1,
                                y=df_TSNE.TSNE_2,
                                z=df_TSNE.TSNE_3,
                                mode='markers',
                                marker=dict(size=2,
                                            color=df_TSNE['TSNE_3'],
                                            colorscale='PRGn'),
                                text=hover_text,
                                hovertemplate='%{text}'
                               ))

f.update_layout(title='Stylometric Analysis of Project Gutenberg',
                height=1100,
                width=1100,
                template='plotly_white')
f.show()

0        1822
1        1941
2        1972
3        1970
4        2013
         ... 
19171    1810
19172    1860
19173    1952
19174    1934
19175    1215
Name: year, Length: 19176, dtype: object