In [1]:
from spacy.lang.en import English
import pandas as pd

filename = 'oscar_wilde_intentions.txt'

with open(filename) as file:
    raw_text = file.read()
    
raw_text = raw_text.replace('CYRIL.', 'CYRIL:').replace(
    'VIVIAN.', 'VIVIAN:').replace('GILBERT.', 'GILBERT:').replace(
    'ERNEST.', 'ERNEST:').replace('_', '')
    
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(raw_text)
sentences = [' '.join(sent.string.split()).strip() for sent in doc.sents]
sentences = sentences[8:2559]

sentences[:50]

['Persons: Cyril and Vivian.',
 'Scene: the Library of a country house in Nottinghamshire.',
 'CYRIL (coming in through the open window from the terrace).',
 'My dear Vivian, don’t coop yourself up all day in the library.',
 'It is a perfectly lovely afternoon.',
 'The air is exquisite.',
 'There is a mist upon the woods, like the purple bloom upon a plum.',
 'Let us go and lie on the grass and smoke cigarettes and enjoy Nature.',
 'VIVIAN: Enjoy Nature!',
 'I am glad to say that I have entirely lost that faculty.',
 'People tell us that Art makes us love Nature more than we loved her before; that it reveals her secrets to us; and that after a careful study of Corot and Constable we see things in her that had escaped our observation.',
 'My own experience is that the more we study Art, the less we care for Nature.',
 'What Art really reveals to us is Nature’s lack of design, her curious crudities, her extraordinary monotony, her absolutely unfinished condition.',
 'Nature has good inte

In [2]:
import pandas as pd

df = pd.DataFrame([(f'oscar_wilde_intentions', sentences[i]) for i in 
                   range(len(sentences))], columns=['text', 'sentence'])

df.head()

Unnamed: 0,text,sentence
0,oscar_wilde_intentions,Persons: Cyril and Vivian.
1,oscar_wilde_intentions,Scene: the Library of a country house in Notti...
2,oscar_wilde_intentions,CYRIL (coming in through the open window from ...
3,oscar_wilde_intentions,"My dear Vivian, don’t coop yourself up all day..."
4,oscar_wilde_intentions,It is a perfectly lovely afternoon.


In [3]:
import spacy
from tqdm import tqdm

nlp = spacy.load('en_core_web_lg')

def tokenize(text):
    tokens = []
    doc = nlp(text)
    for token in doc:
        if ((token.is_stop == False) and (token.is_punct == False)
           ) and (token.pos_ != 'PRON'):
            tokens.append(token.lemma_)
    return tokens

tqdm.pandas()
df['tokens'] = df['sentence'].progress_apply(tokenize)

df.head()

  from pandas import Panel
100%|██████████| 2551/2551 [01:30<00:00, 28.19it/s]


Unnamed: 0,text,sentence,tokens
0,oscar_wilde_intentions,Persons: Cyril and Vivian.,"[person, Cyril, Vivian]"
1,oscar_wilde_intentions,Scene: the Library of a country house in Notti...,"[scene, Library, country, house, Nottinghamshire]"
2,oscar_wilde_intentions,CYRIL (coming in through the open window from ...,"[CYRIL, come, open, window, terrace]"
3,oscar_wilde_intentions,"My dear Vivian, don’t coop yourself up all day...","[dear, Vivian, coop, day, library]"
4,oscar_wilde_intentions,It is a perfectly lovely afternoon.,"[perfectly, lovely, afternoon]"


In [7]:
for i in range(20):
    df[f'sentence_{i}'] = df["sentence"].shift(-i)
    df[f'tokens_{i}'] = df["tokens"].shift(-i).apply(
        lambda x: x if isinstance(x, list) else [])
    df[f'sentence_{i}'] = df[f'sentence_{i}'].fillna('').astype(str)
    
df['range'] = [(f'{i}-{i + 19}') for i in df.index]
df['passage'] = df[[f'sentence_{i}' for i in range(20)]].agg(' '.join, axis=1)

df['tokens'] = df[[f'tokens_{i}' for i in range(20)]].values.tolist()
df['tokens'] = df['tokens'].apply(
    lambda x: [item for sublist in x for item in sublist])

df = df[['text', 'range', 'passage', 'tokens']]

df.head()

Unnamed: 0,text,range,passage,tokens
0,oscar_wilde_intentions,0-19,Persons: Cyril and Vivian. Scene: the Library ...,"[person, Cyril, Vivian, scene, Library, countr..."
1,oscar_wilde_intentions,1-20,Scene: the Library of a country house in Notti...,"[scene, Library, country, house, Nottinghamshi..."
2,oscar_wilde_intentions,2-21,CYRIL (coming in through the open window from ...,"[CYRIL, come, open, window, terrace, dear, Viv..."
3,oscar_wilde_intentions,3-22,"My dear Vivian, don’t coop yourself up all day...","[dear, Vivian, coop, day, library, perfectly, ..."
4,oscar_wilde_intentions,4-23,It is a perfectly lovely afternoon. The air is...,"[perfectly, lovely, afternoon, air, exquisite,..."


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['joined_tokens'] = df['tokens'].apply(' '.join)

tfidf = TfidfVectorizer(stop_words = 'english')
sparse = tfidf.fit_transform(df['joined_tokens'])
dtm = pd.DataFrame(sparse.todense(), columns=tfidf.get_feature_names())

dtm.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,000,10,12,12th,1485,14th,1645,1794,18,1803,...,ηδονή,θεωρητικος,κάθαρσις,καλος,λαμπροτάτου,μονόχρονος,οινοψ,πόντος,των,χαρμιδης
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=len(sentences), algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=2551, p=2,
                 radius=1.0)

In [10]:
query_string = 'Life imitates Art'
query = [query_string]

new = tfidf.transform(query)

filtered_results = []
for number in nn.kneighbors(new.todense())[1].tolist()[0]:
    number_range = set(range(number, (number + 20)))
    if filtered_results == []:
        filtered_results.append(number)
    add = True
    for filtered_result in filtered_results:
        filtered_result_range = set(range(
            filtered_result, (filtered_result + 20)))
        if number_range.intersection(filtered_result_range) != set():
            add = False
            break
    if add == True:
        filtered_results.append(number)
            
for result in filtered_results[:10]:
    print(df['text'][result], df['range'][result])
    print(df['passage'][result])
    print()

oscar_wilde_intentions 608-627
In no case does it reproduce its age. To pass from the art of a time to the time itself is the great mistake that all historians commit. The second doctrine is this. All bad art comes from returning to Life and Nature, and elevating them into ideals. Life and Nature may sometimes be used as part of Art’s rough material, but before they are of any real service to art they must be translated into artistic conventions. The moment Art surrenders its imaginative medium it surrenders everything. As a method Realism is a complete failure, and the two things that every artist should avoid are modernity of form and modernity of subject-matter. To us, who live in the nineteenth century, any century is a suitable subject for art except our own. The only beautiful things are the things that do not concern us. It is, to have the pleasure of quoting myself, exactly because Hecuba is nothing to us that her sorrows are so suitable a motive for a tragedy. Besides, it is o