In [1]:
import os
import json
import pandas as pd

df = pd.DataFrame()
for filename in os.listdir('section_dicts'):
    if filename.endswith('.json'):
        with open(f'section_dicts/{filename}', 'r') as file:
            section_dict = json.load(file)

    title_df = pd.DataFrame.from_dict(
        section_dict, orient='index', 
        columns=['section_text']).reset_index().rename(
        columns={'index': 'section'})
    title_df['title'] = filename[:-5]
    title_df = title_df[['title', 'section', 'section_text']]
    
    df = df.append(title_df)
    df = df.reset_index(drop=True)

print(df.shape)
df['title'].value_counts()

(6768, 3)


Remarks_on_the_Philosophy_of_Psychology_Volume_I                 1137
Last_Writings_on_the_Philosophy_of_Psychology_Volume_I            979
Remarks_on_the_Philosophy_of_Psychology_Volume_II                 737
Zettel                                                            717
Philosophical_Investigations_Part_I                               693
On_Certainty                                                      676
Philosophical_Investigations_Part_II                              372
Remarks_on_Colour_Part_III                                        350
Philosophical_Remarks                                             238
Remarks_on_the_Foundations_of_Mathematics_Part_I                  171
Philosophical_Grammar_Part_I                                      142
Remarks_on_the_Foundations_of_Mathematics_Part_III                 90
Remarks_on_Colour_Part_I                                           88
Remarks_on_the_Foundations_of_Mathematics_Part_VII                 74
Remarks_on_the_Found

In [2]:
import spacy
from tqdm import tqdm

nlp = spacy.load('en_core_web_lg')

def tokenize(text):
    tokens = []
    doc = nlp(text)
    for token in doc:
        if ((token.is_stop == False) and (token.is_punct == False)
           ) and (token.pos_ != 'PRON'):
            tokens.append(token.lemma_)
    return tokens

tqdm.pandas()
df['tokens'] = df['section_text'].progress_apply(tokenize)

df.head()

  from pandas import Panel
100%|██████████| 6768/6768 [02:09<00:00, 52.23it/s]


Unnamed: 0,title,section,section_text,tokens
0,Remarks_on_the_Philosophy_of_Psychology_Volume_I,1,Let's consider what is said about such a pheno...,"[let, consider, say, phenomenon, see, figure, ..."
1,Remarks_on_the_Philosophy_of_Psychology_Volume_I,2,Is it introspection that tells me whether I ha...,"[introspection, tell, genuine, seeing, act, in..."
2,Remarks_on_the_Philosophy_of_Psychology_Volume_I,3,"I should like to say: ""I see the figure as the...","[like, figure, mirror, image, F, indirect, des..."
3,Remarks_on_the_Philosophy_of_Psychology_Volume_I,4,Suppose we were to ask someone: What similarit...,"[suppose, ask, similarity, figure, f, person, ..."
4,Remarks_on_the_Philosophy_of_Psychology_Volume_I,5,"Don't I see the figure sometimes this way, som...","[figure, way, react, word, sign, way, word, ri..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['joined_tokens'] = df['tokens'].apply(' '.join)

tfidf = TfidfVectorizer(stop_words = 'english')
sparse = tfidf.fit_transform(df['joined_tokens'])
dtm = pd.DataFrame(sparse.todense(), columns=tfidf.get_feature_names())

dtm.head()

Unnamed: 0,000,000th,001001001,0127649,05,10,100,1000,10000,1000000,...,ﬁt,ﬁtte,ﬁve,ﬂat,ﬂoat,ﬂoate,ﬂoating,ﬂoats,ﬂourishe,ﬂower
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=len(df), algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6768, p=2,
                 radius=1.0)

In [5]:
query_string = (
    'What I hold fast to is not one proposition but a nest of propositions.')
query = [query_string]

new = tfidf.transform(query)

filtered_results = []
for number in nn.kneighbors(new.todense())[1].tolist()[0]:
    number_range = set(range(number, (number + 20)))
    if filtered_results == []:
        filtered_results.append(number)
    add = True
    for filtered_result in filtered_results:
        filtered_result_range = set(range(
            filtered_result, (filtered_result + 20)))
        if number_range.intersection(filtered_result_range) != set():
            add = False
            break
    if add == True:
        filtered_results.append(number)
            
for result in filtered_results[:10]:
    print(df['title'][result], df['section'][result])
    print(df['section_text'][result])
    print()

On_Certainty 225
What I hold fast to is not one proposition but a nest of propositions.

On_Certainty 116
Instead of "I know...", couldn't Moore have said: "It stands fast for me that..."? And further: "It stands fast for me and many others...."

On_Certainty 51
What sort of proposition is: "What could a mistake here be like!"? It would have to be a logical proposition. But is it a logic that is not used, because what it tells us is not taught by means of propositions.--It is a logical proposition; for it does describe the conceptual (linguistic) situation.

On_Certainty 144
The child learns to believe a host of things. I.e. it learns to act according to these beliefs. Bit by bit there forms a system of what is believed, and in that system some things stand unshakeably fast and some are more or less liable to shift. What stands fast does so, not because it is intrinsically obvious or convincing; it is rather held fast by what lies around it.

Remarks_on_the_Foundations_of_Mathematics_P

In [6]:
import pickle

pickle.dump(tfidf, open('wittgenstein_tfidf.pkl', 'wb'))
pickle.dump(nn, open('wittgenstein_nn.pkl', 'wb'))

In [7]:
df.to_csv('wittgenstein_df.csv', index=False)

In [8]:
pd.read_csv('wittgenstein_df.csv')

Unnamed: 0,title,section,section_text,tokens,joined_tokens
0,Remarks_on_the_Philosophy_of_Psychology_Volume_I,1,Let's consider what is said about such a pheno...,"['let', 'consider', 'say', 'phenomenon', 'see'...",let consider say phenomenon see figure mirror ...
1,Remarks_on_the_Philosophy_of_Psychology_Volume_I,2,Is it introspection that tells me whether I ha...,"['introspection', 'tell', 'genuine', 'seeing',...",introspection tell genuine seeing act interpre...
2,Remarks_on_the_Philosophy_of_Psychology_Volume_I,3,"I should like to say: ""I see the figure as the...","['like', 'figure', 'mirror', 'image', 'F', 'in...",like figure mirror image F indirect descriptio...
3,Remarks_on_the_Philosophy_of_Psychology_Volume_I,4,Suppose we were to ask someone: What similarit...,"['suppose', 'ask', 'similarity', 'figure', 'f'...",suppose ask similarity figure f person answer ...
4,Remarks_on_the_Philosophy_of_Psychology_Volume_I,5,"Don't I see the figure sometimes this way, som...","['figure', 'way', 'react', 'word', 'sign', 'wa...",figure way react word sign way word right use ...
...,...,...,...,...,...
6763,Remarks_on_the_Foundations_of_Mathematics_Part_VI,45,Suppose however there were a tribe whose peopl...,"['suppose', 'tribe', 'people', 'apparently', '...",suppose tribe people apparently understanding ...
6764,Remarks_on_the_Foundations_of_Mathematics_Part_VI,46,"When I say: ""If you follow the rule, this must...","['follow', 'rule', 'come', 'mean', 'come', 'fo...",follow rule come mean come foundation \n\n com...
6765,Remarks_on_the_Foundations_of_Mathematics_Part_VI,47,"""But at every step I know absolutely what I ha...","['step', 'know', 'absolutely', 'rule', 'demand...",step know absolutely rule demand rule conceive...
6766,Remarks_on_the_Foundations_of_Mathematics_Part_VI,48,"One person makes a bidding gesture, as if he m...","['person', 'make', 'bidding', 'gesture', 'mean...",person make bidding gesture mean slink frighte...
