In [36]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, paired_euclidean_distances
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.graph_objects as go
# wildcard import kept last so the names it binds are unchanged
from plotly.graph_objs import *

# Load the per-book style metrics table.
df = pd.read_csv(filepath_or_buffer='metrics.csv')

# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(5)

(22481, 19)


Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.624214,0.452214,528.796852,0.39884,20.889773,111.791794,1.735272,4.017759,10.845536,0.998584,232.7,0.640382,0.79291,0.717273,2.338342
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.585125,0.415,378.878385,0.39712,14.213115,77.20765,1.752356,2.763206,10.506952,0.997945,232.130435,0.603311,0.811363,0.718462,2.134627
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.62,0.444,1298.919399,0.428477,27.3003,146.405405,1.720949,4.384384,10.003001,0.99836,222.8,0.656081,0.777569,0.638938,2.403897
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.627,0.486,1703.701946,0.324357,8.565289,49.239669,1.79413,2.67438,9.511655,0.996898,221.166667,0.586852,0.823569,0.692679,2.098842
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.686821,0.54175,269.095482,0.372508,21.56223,122.567266,1.868661,4.204856,11.781284,0.998751,241.057143,0.716877,0.837261,0.751203,2.331626


## Recommendation System

In [2]:
## min-max scaling all style metrics; each column is rescaled to the [0, 1] range
# The raw array is intentionally not named `metrics`: that name is bound to the
# `sklearn.metrics` module by the import cell and is still needed further down
# (`metrics.silhouette_score`). Rebinding it here breaks a top-to-bottom run.
metric_values = df.iloc[:, 4:].values            # every column after the first four
scaler = preprocessing.MinMaxScaler()
metrics_scaled = scaler.fit_transform(metric_values)

# Keep df's own index on the scaled frame so the column-wise concat below lines
# rows up by label instead of relying on both frames having a default index.
scaled_metrics_frame = pd.DataFrame(metrics_scaled,
                                    columns=df.columns[4:],
                                    index=df.index)

# First four (unscaled) columns followed by the scaled metrics.
df_scaled = pd.concat([df.iloc[:, :4], scaled_metrics_frame], axis=1)
df_scaled.head()

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.582054,0.424011,0.050913,0.770105,0.255357,0.229973,0.450676,0.073988,0.704748,0.996935,0.670464,0.404288,0.517212,0.502914,0.32343
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.53817,0.384411,0.035721,0.766781,0.164938,0.150698,0.461159,0.050793,0.669873,0.995548,0.668741,0.3428,0.560263,0.505006,0.247857
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.577322,0.41527,0.128954,0.827354,0.342173,0.309316,0.441888,0.080766,0.617966,0.996449,0.640524,0.430328,0.481421,0.365188,0.347749
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.585181,0.459963,0.169973,0.626222,0.088452,0.086589,0.486792,0.049151,0.567358,0.993278,0.635585,0.315501,0.588741,0.459674,0.234581
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.65234,0.519287,0.024596,0.719238,0.264464,0.254673,0.532525,0.077447,0.80113,0.997296,0.695737,0.531166,0.620684,0.56257,0.320938


In [None]:
# Scratch reference: the 15 style-metric column names, wrapped in a nested list.
# The expression is not assigned to anything, so running this cell only echoes it.
[['sttr', 'hapax_legomenon', 'yules_k', 'function_words', 
  'avg_sentence_length_word', 'avg_sentence_length_chars', 
  'avg_syllables_per_word', 'punctuation_sentence', 
  'shannon_entropy', 'simpsons_d', 'average_nps', 'noun_to_verb', 
  'noun_to_adj', 'verb_to_adv', 'avg_dependency_distance']]

In [3]:
# Scaled metrics used as features for the similarity and clustering steps.
# 'function_words' is the one scaled metric that is not part of this selection.
style_feature_columns = [
    'sttr',
    'hapax_legomenon',
    'yules_k',
    'avg_sentence_length_word',
    'avg_sentence_length_chars',
    'avg_syllables_per_word',
    'punctuation_sentence',
    'shannon_entropy',
    'simpsons_d',
    'average_nps',
    'noun_to_verb',
    'noun_to_adj',
    'verb_to_adv',
    'avg_dependency_distance',
]
df_scaled_subset = df_scaled[style_feature_columns]

In [4]:
## scaled cosine sim
# Pairwise cosine similarity of every book against every other book.
# The result is a dense n_books x n_books matrix, so it is large for the
# 22,481 rows reported when the CSV was loaded.
cosine_sim = cosine_similarity(X=df_scaled_subset, Y=df_scaled_subset)

In [5]:
# Lookup table: book title -> row label in df.
indices = pd.Series(
    data=df.index,
    index=df['title'],
)

In [103]:
# takes book title as input and outputs most similar books
def get_recommendations(title, cosine_sim=cosine_sim, n_recommendations=10):
    """Return the books whose style metrics are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of book ``i``
        to every book. Defaults to the module-level ``cosine_sim``.
    n_recommendations : int, optional
        Maximum number of similar books to return (default 10).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(recommendations, matches)``: the ``author``/``title`` columns of the
        most similar books ordered from most to least similar, and the full
        ``df`` row(s) whose title equals ``title``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # gets index of book that matches title
    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the
    # first occurrence so the rest of the function always handles a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # gets pairwise similarity scores of all books with that book
    scores = np.asarray(cosine_sim[idx])

    # sorts books from most to least similar; a stable sort keeps the original
    # row order among books with exactly equal scores
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query book by its row number rather than assuming it sorted
    # first: another book with the same score can otherwise take that slot.
    ranked = ranked[ranked != idx]

    # gets indices of the most similar books
    book_indices = ranked[:n_recommendations]

    # returns the most similar books plus the matching input row(s)
    return df[['author', 'title']].iloc[book_indices], df[df['title'] == title]

In [105]:
# Example query: recommendations for one title, plus the matching input row(s).
recs, input_ = get_recommendations(title='The Logic of Hegel')

In [110]:
# Row(s) of df matching the queried title, as returned by get_recommendations.
input_

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
1647,project_gutenberg/55108-0.txt,G. W. F. Hegel,The Logic of Hegel,1770,0.579323,0.416516,159.650794,0.431336,20.615007,120.761814,2.30144,3.710745,11.185264,0.998725,226.19774,0.718977,0.732031,0.613337,2.319899


In [109]:
# Author and title of the recommended books, as returned by get_recommendations.
recs

Unnamed: 0,author,title
11307,Georg Wilhelm Friedrich Hegel,Hegel’s Philosophy of Mind
4893,August Weismann,"Studies in the Theory of Descent, Volume II (o..."
15548,Frederick Engels,The Origin of the Family Private Property and ...
13338,William E. Kellicott,The Social Direction of Evolution
16955,Ernst Haeckel,Evolution in Modern Thought
7096,Norman Kemp Smith,A Commentary to Kant's 'Critique of Pure Reason'
17371,William Platt Ball,Are the Effects of Use and Disuse Inherited?
12940,Friedrich Engels,Landmarks of Scientific Socialism
19754,St. George Mivart,On the Genesis of Species
17661,A.C. Seward and Others,Darwin and Modern Science


## Clustering

In [111]:
# Feature matrix for the clustering cells; this is the same object as
# df_scaled_subset, not a copy.
X = df_scaled_subset

In [45]:
## elbow method
# For each candidate k in 2..29, fit k-means and record the fitted model's
# inertia_ (within-cluster sum of squared distances).
distorsions = []
for k in range(2, 30):
    kmeans = KMeans(n_clusters=k, random_state=301).fit(X)
    distorsions += [kmeans.inertia_]

In [46]:
# Line plot of inertia against k for the elbow method.
f = go.Figure(
    data=[
        go.Scatter(
            x=list(range(2, 30)),
            y=distorsions,
            line=dict(color='rgb(0, 200, 200)'),
            showlegend=False,
            name='Elbow curve',
        )
    ]
)

# Figure-level layout first, then the two axes.
f.update_layout(title='Elbow curve', height=600, width=500)
f.update_xaxes(title='k', nticks=29)
f.update_yaxes(title='WSS')

f.show()

In [42]:
## silhouette score
# Mean silhouette coefficient of a k-means clustering for each k in 2..29.
# silhouette_score is called through its own import rather than via the name
# `metrics`, because the scaling cell rebinds `metrics` to a NumPy array and
# `metrics.silhouette_score` then fails on a top-to-bottom run.
silhouette_plot = []
for k in range(2, 30):
    clusters = KMeans(n_clusters=k, random_state=10)
    cluster_labels = clusters.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_plot.append(silhouette_avg)

In [43]:
# Line plot of the mean silhouette coefficient against k.
f = go.Figure(
    data=[
        go.Scatter(
            x=list(range(2, 30)),
            y=silhouette_plot,
            line=dict(color='rgb(0, 0, 100)'),
            showlegend=False,
            name='Silhouette coefficients over k',
        )
    ]
)

# Figure-level layout first, then the two axes.
f.update_layout(title='Silhouette coefficients over k', height=600, width=500)
f.update_xaxes(title='k', nticks=29)
f.update_yaxes(title='silhouette coefficient')

f.show()

In [62]:
## performing k-means clustering
# A fixed random_state (the same seed the elbow scan uses) makes the cluster
# labels, and so the colours in the 3-D plot, reproducible across runs.
k_means = KMeans(n_clusters=7, random_state=301)

# fit_predict fits the model and returns the label of every row of X in one call.
clusters = k_means.fit_predict(X)

In [92]:
## TSNE for dimensionality reduction
# t-SNE is stochastic; fixing random_state keeps the 3-D layout the same
# between runs so the figure can be regenerated.
metrics_embedded = TSNE(n_components=3, random_state=301).fit_transform(df_scaled_subset)

# Give the embedding df's index so the column-wise concat lines rows up by
# label with the first four columns of df.
tsne_coordinates = pd.DataFrame(metrics_embedded,
                                columns=['TSNE_1', 'TSNE_2', 'TSNE_3'],
                                index=df.index)

df_TSNE = pd.concat([df.iloc[:, :4], tsne_coordinates], axis=1)

In [96]:
## visualizing clusters
# Hover label per point: bold title on the first line, then "author, year".
hover_text = ('<b>' + df_TSNE['title'] + '</b><br>'
              + df_TSNE['author'] + ', '
              + df['year'].astype('str'))

# Points are coloured by their k-means cluster label.
marker_style = dict(size=2,
                    color=clusters,
                    colorscale='PRGn',
                    reversescale=True,
                    opacity=.5)

cluster_trace = go.Scatter3d(
    x=df_TSNE['TSNE_1'],
    y=df_TSNE['TSNE_2'],
    z=df_TSNE['TSNE_3'],
    mode='markers',
    marker=marker_style,
    text=hover_text,
    hovertemplate='%{text} <extra></extra>',
    hoverlabel=dict(bgcolor='black', align='left'),
)

f = go.Figure(data=cluster_trace)
f.update_layout(
    title='Stylometric Analysis of Project Gutenberg',
    height=1100,
    width=1100,
    template='plotly_white',
)
f.show()