In [34]:
import re
import os
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
Path.cwd()

WindowsPath('C:/Users/lakj/Documents/GitHub/Lighting in French Literature/code')

# Read the data from a csv file

In [6]:
input_dir = Path.cwd() / '../data/csv_files' # input directory
df = pd.read_csv(input_dir /'text_data230826.csv', sep='|')

In [7]:
df

Unnamed: 0,year,author,title,text,clean_text
0,1838,Balzac,Splendeurs et misères des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,splendeurs et misèresdes courtisanes laissez-m...
1,1838,Balzac,Splendeurs et misères des courtisanes 2 A comb...,"Depuis huit jours, Nucingen allait marchander ...",depuis huit jours nucingen allait marchander l...
2,1838,Balzac,Splendeurs et misères des courtisanes 3 Où mèn...,"Le lendemain, à six heures, deux voitures mené...",le lendemain à six heures deux voitures menées...
3,1838,Balzac,Splendeurs et misères des courtisanes 4 La der...,"— Qu’y a-t-il, Madeleine? dit madame Camusot e...",qu’y a-t-il madeleine dit madame camusot en vo...
4,1842,Balzac,La Rabouilleuse,"LES CÉLIBATAIRES. Voici, mon cher Nodier, un o...",les célibataires voici mon cher nodier un ouvr...
...,...,...,...,...,...
96,1910,Colette,La Vagabonde,LA VAGABONDE \n\n\n\nParu dans Le Livre de Poc...,la vagabonde paru dans le livre de poche l'ing...
97,1910,DelarueMardrus,Comme tout le monde,Première partie C’est un wagon de deuxième...,première partie c’est un wagon de deuxième cla...
98,1910,Maupassant,Les Dimanches d'un bourgeois à Paris,"Monsieur Patissot, né à Paris, après avoir fai...",monsieur patissot né à paris après avoir fait ...
99,1911,Daudet,Rose et Ninette,"au poëte et au philosophe, je dédie cette page...",au poëte et au philosophe je dédie cette page ...


# Build a Word2vec language models with the Gensim library

Word2Vec is an algorithm that learns relationships between words using large amounts of text. The Word2Vec algorithm produces a language model where words with similar meanings based on context are close together and words with different meanings based on context are far apart. For example, Copenhagen and Denmark would be close together, while Copenhagen and cheese would be relatively far apart.

Word2vec models can thus be used to find words that are similar in meaning and syntactic position and that have a common relationship.

To build the Word2vec models, you can the library Gensim, which stands for generating similarities. Building the models is a labor-intensive process that takes many hours, so once the model is built, it is a good idea to save it for later use. Documentation for and description of how models are built and saved can be found on this page: https://radimrehurek.com/gensim/auto_examples/index.html

Below we'll build a model of the corpus for French literature. One can debate whether it is responsible to build a model from novels from a wide year span, or whether a division into subsets would make more sense. For example would this model not take into account the historical and cultural changes of terms through the period.

In [8]:
# Build a GenSim language models

#Sources:
## https://radimrehurek.com/gensim/auto_examples/index.html#documentation

## https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/

## https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.most_similar.html

import nltk
from gensim.models import Word2Vec
import time

def build_w2v_model(clean_text):
    
    # data preperation
    
    ## split the text into sentences using the sent_tokenizer of the nltk library
    sent_list = nltk.sent_tokenize(clean_text) 
    
    ## split each sentence into list of words using the nltk word_tokenizer
    tok_lists = [nltk.word_tokenize(sent) for sent in sent_list]


    ### filter for the shortest words. Here this is words equal to a lenght of 1
    tok_lists = [w for w in tok_lists if len(w) >=1]


    # Build the W2V model
    ## The value of min_count does so that only words that appears at least the value are included
    
    # w2v model based on word lists
    word2vec_tokens = Word2Vec(tok_lists, min_count=4)
    
    return word2vec_tokens

# Build w2v models

We build three different models:
- one model from the novels of Balzac
- one model from the novels of Zola
- one model from all the novels

In [19]:
# build w2v models

def apply_w2v(data_series):
    startTime = time.time()
    word2vec_tokens = build_w2v_model(' '.join(data_series))
    executionTime = (time.time() - startTime)
    print('Building time in sec.: ' + str(executionTime))
    return word2vec_tokens

balsac = df[df['author'] == 'Balzac']['clean_text']
zola = df[df['author'] == 'Zola']['clean_text']
all_text = df['clean_text']

# Save the balzac model for later use                                      
balzac_model = apply_w2v(balsac)                                  
balzac_model.save('balzac_model.model')

# Save the zola model for later use                                      
zola_model = apply_w2v(zola)
zola_model.save('zola_model.model')
                                      
# Save the total model for later use                                      
all_text_model = apply_w2v(all_text)
all_text_model.save('all_text_model.model')

Building time in sec.: 1.8003251552581787
Building time in sec.: 8.346299886703491
Building time in sec.: 26.394368171691895


# Reduce the language model for the purpose of visualisation.

In [20]:
# Load the model

# load models
balsac_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\balzac_model.model')
zola_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\zola_model.model')
all_text_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\all_text_model.model')

In [22]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

In [23]:
x_vals, y_vals, labels = reduce_dimensions(balzac_model)
# Send the x, y, and label values to a dataframe and save it for later use
df = pd.DataFrame({'x_vals':x_vals, 'y_vals':y_vals, 'labels':labels})
df.to_csv('balzac_model_values_labels_2D.csv', index=False)

x_vals, y_vals, labels = reduce_dimensions(zola_model)
# Send the x, y, and label values to a dataframe and save it for later use
df = pd.DataFrame({'x_vals':x_vals, 'y_vals':y_vals, 'labels':labels})
df.to_csv('zola_model_values_labels_2D.csv', index=False)

x_vals, y_vals, labels = reduce_dimensions(all_text_model)
# Send the x, y, and label values to a dataframe and save it for later use
df = pd.DataFrame({'x_vals':x_vals, 'y_vals':y_vals, 'labels':labels})
df.to_csv('all_text_model_values_labels_2D.csv', index=False)

# Load the Gensim model and make an analyse of word similarity

We make an analyse of the similarity of the keywords within each of the language models to capture semantic relationships between keywords and other words.

In [35]:
# load the Libraries 
import nltk
from gensim.models import Word2Vec

In [5]:
os.getcwd()

'C:\\Users\\lakj\\Documents\\GitHub\\Lighting in French Literature\\code'

In [11]:
# load models
balsac_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\balzac_model.model')
zola_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\zola_model.model')
all_text_model = Word2Vec.load(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\all_text_model.model')

# read keyword lists
input_dir = Path.cwd() / '../data/key_word_lists' # input directory
with open(input_dir / 'sensation_list.txt', 'r', encoding='utf-8-sig') as f:
    emo_keywords = f.read()
emo_keyword_list = emo_keywords.lower().split('\n')
emo_keyword_list = [i.strip() for i in emo_keyword_list]
emo_keyword_list = list(set(emo_keyword_list))
emo_keyword_list.sort()

input_dir = Path.cwd() / '../data/key_word_lists' # input directory
with open(input_dir / 'lightning_list.txt', 'r', encoding='utf-8-sig') as f:
    lightning_keywords = f.read()
lightning_keyword_list = lightning_keywords.lower().split('\n')
lightning_keyword_list = [i.strip() for i in lightning_keyword_list]
lightning_keyword_list = list(set(lightning_keyword_list))
lightning_keyword_list.sort()


# Function that takes a model, a list of keywords, and returns the five most similar word for each keyword

def multiple_similar_words(model, word_list, no = 5):
    
    for term in word_list:
        try:

            print ('Keyword:', term, '\n')
            print ('Most similar words:\n', model.wv.most_similar(term, topn=no))
            print ('\n')

        except KeyError:

            print ("Term or frase is not present in the model")
            print ('\n')
            

#############            
# run the function

balsac_sim_emo_keywords = multiple_similar_words(balsac_model, emo_keyword_list, no = 5)
print (f'Keyword similarity in Balsac corpus {balsac_sim_emo_keywords}')

zola_sim_emo_keywords = multiple_similar_words(zola_model, emo_keyword_list, no = 5)
print (f'Similar words in Zola corpus {zola_sim_emo_keywords}')

all_text_sim_emo_keywords = multiple_similar_words(all_text_model, emo_keyword_list, no = 5)
print (f'Keyword similarity in all text corpus {all_text_sim_emo_keywords}')



Keyword: agonie 

Most similar words:
 [('condamné', 0.37197813391685486), ('croirait', 0.368137389421463), ('fit-elle', 0.36025068163871765), ('fondé', 0.3441506028175354), ('périr', 0.3369585871696472)]


Keyword: attendri 

Most similar words:
 [('humain', 0.6869267821311951), ('voulaient', 0.6630045771598816), ('plusieurs', 0.6466014385223389), ('premières', 0.6461528539657593), ('mots', 0.642706573009491)]


Keyword: bouleverser 

Term or frase is not present in the model


Keyword: caressant 

Most similar words:
 [('rosaces', 0.8065230846405029), ('argent', 0.7998543977737427), ('raphaël', 0.7952196002006531), ('vulgaires', 0.7886896729469299), ('seule', 0.787570595741272)]


Keyword: caresse 

Term or frase is not present in the model


Keyword: chagrin 

Most similar words:
 [('rubempré', 0.8779054284095764), ('maison', 0.8746100664138794), ('m', 0.8734118342399597), ('papier', 0.8731304407119751), ('eut', 0.8725594878196716)]


Keyword: confus 

Most similar words:
 [('compag

In [13]:
##############
# If you will look up a single word then 
# replace the text string after the variable 'term'
# and adjust the integer after the variable no to get x amount of simiar words 
#############
print (10 * '*-*')
term = 'lumière'
no = 5
try:
    print ('french_literature_model:', term, '\n')
    print (all_text_model.wv.most_similar(term, topn=no))
    print ('\n')

except KeyError:

    print ("Term or frase is not present in the model")
    print ('\n')

*-**-**-**-**-**-**-**-**-**-*
french_literature_model: lumière 

[('chambre', 0.9978148341178894), ('front', 0.9973626136779785), ('vieille', 0.9971561431884766), ('rue', 0.9971288442611694), ('célèbre', 0.9969614148139954)]




# Visual analysis of the language models values and labels

Below I will use the language model that I reduced and saved for the purpose of visualisation.

I need to do a lot of scripting to get to the visualisation. 


The goal is a graph that I can use for a visual analysis of the relation between keywords of two different themes.


In [16]:
# Load the reducered model in a dataframe
file = r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\w2v_models\all_text_model_values_labels_2D.csv'
df = pd.read_csv(file)


# Open two word lists
# 1
of = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\key_word_lists\sensation_list.txt', 'r', encoding='utf-8-sig')
sensation_word_list = of.read().lower().split('\n')
sensation_word_list = [i.strip() for i in sensation_word_list]
sensation_word_list = list(set(sensation_word_list))
sensation_word_list.sort()
of.close()

#
of = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\key_word_lists\technology_list.txt', 'r', encoding='utf-8-sig')
technology_list = of.read().lower().split('\n')
technology_list = [i.strip() for i in technology_list]
technology_list = list(set(technology_list))
technology_list.sort()
of.close()


In [17]:
# Function that take a keyword and returns x and y positions and the word if the word is in the model
def get_label_values(word):
    for index, row in df.iterrows():
        if row['labels'] == word:
            return index, row['x_vals'], row['y_vals'], row['labels']
        

# Function that take an list of keywords and returns both the word and its x and y values       
def get_words_labels(key_word_list): 
    values_labels = [get_label_values(word) for word in key_word_list]
    values_labels_list = [i for i in values_labels if type(i) != 'NoneType']
    return values_labels_list


# Build two lists of keywords
values_labels_list1 = get_words_labels(sensation_word_list)
values_labels_list2 = get_words_labels(technology_list)

In [18]:
# Function that takes a values_labels_list and a theme word that represents the theme of the values_labels_list
# the function returns a dataframe with x and y values and a column with the theme word.

def find_x_y_values(values_labels_list, theme = 'missing'):
    index = []
    x_vals = []
    y_vals = []

    for i in values_labels_list:
        try:
            index.append(i[0])
            x_vals.append(i[1])
            y_vals.append(i[2])
        except:
            i == 'Nonetype'
            pass

    values_labels_df = pd.DataFrame({'x_vals': x_vals, 'y_vals': y_vals, 'themes':theme}, index=index)
    
    # https://www.statology.org/pandas-merge-on-index/
    merges_df = pd.merge(df, values_labels_df, left_index=True, right_index=True)
    merges_df = merges_df[['x_vals_x', 'y_vals_x', 'labels', 'themes']].rename(columns={'x_vals_x': 'x_vals', 'y_vals_x': 'y_vals'} )
    
    
    return merges_df  

I use the function to store a dataframe that holds x values, y values, labels (keywords), and the theme that the keywords relates to.


I merge the two dataframe and store them together in the variable 'merged_df'.

In [23]:
find_x_y_values_list1 =  find_x_y_values(values_labels_list1, 'sensations')
find_x_y_values_list2 =  find_x_y_values(values_labels_list2, 'technology')

In [24]:
merged_df = pd.concat([find_x_y_values_list1,find_x_y_values_list2])

In [25]:
# Scatter plot documentation: https://plotly.com/python-api-reference/generated/plotly.express.scatter.html

import plotly.express as px
fig = px.scatter(merged_df, x = 'x_vals', y = 'y_vals', color='themes', hover_data=['labels', 'themes'], 
                 title='Visual analysis of the relation between keywords in two themes')
fig.show()

Save the graph as a html file for later use.

In [26]:
import plotly.io as io

html_snippet_start = '<!DOCTYPE html> <html> <head> <title>Title</title> </head> <body>' 
html_snippet_end = ' </body></html> '

html_as_string = io.to_html(fig, full_html=False)

vis_in_html = html_snippet_start + html_as_string + html_snippet_end

of = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\visualisations\sens_and_tech_gensim_2d.htm', 'w', encoding='utf-8-sig')
of.write(vis_in_html)

3694041

# Sentiment analysis of sentences holding two keywords from different themes

with combinations of emotions and technology or ligthning words

In [36]:
import pandas as pd
import os
from afinn import Afinn
import re

In [37]:
os.getcwd()

'C:\\Users\\lakj\\Documents\\GitHub\\Lighting in French Literature\\code'

In [38]:
input_dir = Path.cwd() / '../data/csv_files' # input directory
df = pd.read_csv(input_dir /'text_data230826.csv', sep='|')

In [39]:
df

Unnamed: 0,year,author,title,text,clean_text
0,1838,Balzac,Splendeurs et misères des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,splendeurs et misèresdes courtisanes laissez-m...
1,1838,Balzac,Splendeurs et misères des courtisanes 2 A comb...,"Depuis huit jours, Nucingen allait marchander ...",depuis huit jours nucingen allait marchander l...
2,1838,Balzac,Splendeurs et misères des courtisanes 3 Où mèn...,"Le lendemain, à six heures, deux voitures mené...",le lendemain à six heures deux voitures menées...
3,1838,Balzac,Splendeurs et misères des courtisanes 4 La der...,"— Qu’y a-t-il, Madeleine? dit madame Camusot e...",qu’y a-t-il madeleine dit madame camusot en vo...
4,1842,Balzac,La Rabouilleuse,"LES CÉLIBATAIRES. Voici, mon cher Nodier, un o...",les célibataires voici mon cher nodier un ouvr...
...,...,...,...,...,...
96,1910,Colette,La Vagabonde,LA VAGABONDE \n\n\n\nParu dans Le Livre de Poc...,la vagabonde paru dans le livre de poche l'ing...
97,1910,DelarueMardrus,Comme tout le monde,Première partie C’est un wagon de deuxième...,première partie c’est un wagon de deuxième cla...
98,1910,Maupassant,Les Dimanches d'un bourgeois à Paris,"Monsieur Patissot, né à Paris, après avoir fai...",monsieur patissot né à paris après avoir fait ...
99,1911,Daudet,Rose et Ninette,"au poëte et au philosophe, je dédie cette page...",au poëte et au philosophe je dédie cette page ...


In [40]:
# text string input
work = 'La Cousine Bette'
text = df[df['title'] == work].iloc[0]['text'].lower()



# split the text in sentences
def sent_tokenizer(text):
    tokenized_sentences = re.split('\.', text)
    tokenized_sentences = [s.lstrip() for s in tokenized_sentences]
    return tokenized_sentences

sentence_list = sent_tokenizer(text)


# Open two word lists
# 1
of = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\key_word_lists\sensation_list.txt', 'r', encoding='utf-8-sig')
sensation_word_list = of.read().lower().split('\n')
sensation_word_list = [i.strip() for i in sensation_word_list]
sensation_word_list = list(set(sensation_word_list))
sensation_word_list.sort()
of.close()

#
of = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\key_word_lists\technology_list.txt', 'r', encoding='utf-8-sig')
technology_list = of.read().lower().split('\n')
technology_list = [i.strip() for i in technology_list]
technology_list = list(set(technology_list))
technology_list.sort()
of.close()



#####################
# add two word lists and run to get a sentiment score
####################
def sentiment_analysis(word_list1 = sensation_word_list, word_list2 = technology_list):
    
    
    from_word_list_one = []
    from_word_list_two = []
    sentiment_scores = []
    sentences = []
    
    for word1 in word_list1:
        for sent in sentence_list:
            if word1 in sent:
                for word2 in word_list2:
                    if word2 in sent:
                        from_word_list_one.append(word1)
                        from_word_list_two.append(word2)
                        afinn = Afinn()
                        sent_score = afinn.score(str(sent))
                        sentiment_scores.append(sent_score)
                        sentences.append(sent)
    
    senti_dataframe = pd.DataFrame({'word1': from_word_list_one, 'word2': from_word_list_two, 
                                        'sentiment_score': sentiment_scores, 'sentence': sentences})
    
    return senti_dataframe

# add positive, negative, or neutral cetegory
def apply_sentiment_cat(row):
    if row < 0:
        return 'neg'
    elif row == 0:
        return 'neu'
    elif row > 0:
        return 'pos'
                
sentiment_dataframe = sentiment_analysis(sensation_word_list, technology_list)
sentiment_dataframe['sentiment_cat'] = sentiment_dataframe['sentiment_score'].apply(lambda x : apply_sentiment_cat(x) )

sentiment_dataframe

Unnamed: 0,word1,word2,sentiment_score,sentence,sentiment_cat
0,chagrin,feu,0.0,"—oui, je n’ai pas un sou dans ce moment à donn...",neu
1,confus,feu,-4.0,la rougeur subite qui colora sa fille rendit l...,neg
2,doucement,lustre,6.0,"—et vous croyez, ma petite mère, dit crevel, q...",pos
3,doucement,feu,0.0,— hulot est le premier exemple de l’amour quan...,neu
4,désir,gaz,3.0,"à quoi bon des bas de soie gris tout neufs, de...",pos
5,heureux,feu,0.0,"—oui, je n’ai pas un sou dans ce moment à donn...",neu
6,heureux,feu,5.0,"le visage heureux et souriant dans le miroir, ...",pos
7,las,feu,-4.0,"—des gens qui veulent mettre l’europe en feu, ...",neg
8,las,lustre,-4.0,"dix jours après ces événements, on publia le p...",neg
9,sentiment,lanterne,-1.0,conseillée par la redoutable intelligence de l...,neg


In [43]:
sentiment_dataframe [sentiment_dataframe['word2'] == 'gaz']

Unnamed: 0,word1,word2,sentiment_score,sentence,sentiment_cat
4,désir,gaz,3.0,"à quoi bon des bas de soie gris tout neufs, de...",pos


Do we get mostly positive, negative, or neutral sentenses from the combination of words describing sensations and technology?

In [44]:
sentiment_dataframe['sentiment_cat'].value_counts()

sentiment_cat
neg    8
neu    4
pos    3
Name: count, dtype: int64

The keywords in the positive sentences. 

In [54]:
pos_sentiment = sentiment_dataframe[sentiment_dataframe['sentiment_cat'] == 'pos']

The table of the sensation keyword in the sentiments sorted by frequency. 

In [55]:
pos_sensation_words = pos_sentiment['word1'].value_counts()
pos_sensation_words

word1
doucement    1
désir        1
heureux      1
Name: count, dtype: int64

The table of the technology keywords in the sentiments sorted by frequency.

In [56]:
pos_tech_words = pos_sentiment['word2'].value_counts()
pos_tech_words

word2
lustre    1
gaz       1
feu       1
Name: count, dtype: int64

The keywords in the negative sentences. 

In [57]:
neg_sentiment = sentiment_dataframe[sentiment_dataframe['sentiment_cat'] == 'neg']

In [58]:
neg_sen_words = neg_sentiment['word1'].value_counts()
neg_sen_words

word1
las          2
sentiment    2
vertige      2
confus       1
voix         1
Name: count, dtype: int64

In [59]:
neg_tech_words = neg_sentiment['word2'].value_counts()
neg_tech_words

word2
tige        3
feu         2
lustre      1
lanterne    1
lumière     1
Name: count, dtype: int64

A comparision. Which emo words are only in positive sentences? 

In [62]:
sen_words_compare = [w for w in list(pos_sensation_words.index) if w not in list(neg_sen_words.index)]
sen_words_compare

['doucement', 'désir', 'heureux']

A comparision. Which emo words are only in negative sentences? 

In [64]:
sen_words_compare = [w for w in list(neg_sen_words.index) if w not in list(pos_sensation_words.index)]
sen_words_compare

['las', 'sentiment', 'vertige', 'confus', 'voix']

# Parts of speech tagging with Spacy

Source: https://spacy.io/models/fr

In [66]:
import spacy
# Load the spacy model
nlp = spacy.load('fr_core_news_md')


#########
# single text string input
########
work = 'La Cousine Bette'
text = df[df['title'] == work].iloc[0]['clean_text']


# pos tagging
doc = nlp(text)

In [68]:
for token in doc:
    if 'VERB' in token.pos_:
        print(token.text, token.pos_, token.dep_)

mises VERB acl
nommées VERB conj
cheminait VERB acl
portant VERB acl
accusés VERB acl
trouve VERB advcl
croient VERB ccomp
uniforme VERB xcomp
supposent VERB conj
imaginer VERB advcl
appartenant VERB acl
respirait VERB ccomp
faisait VERB acl:relcl
resplendir VERB xcomp
acquise VERB acl
met VERB advcl
retirés VERB acl
devinait VERB ccomp
croyez VERB punct
manquait VERB ccomp
bombée VERB acl
campé VERB acl
décoré VERB acl
laissait VERB appos
errer VERB xcomp
recueillent VERB acl:relcl
adressés VERB acl
arrêta VERB acl
comprise VERB acl
bâtie VERB acl
respecté VERB conj
demeurait VERB acl:relcl
diminuée VERB acl
accepta VERB acl
descendre VERB advcl
reconnu VERB acl:relcl
a VERB advcl
a VERB ccomp
remit VERB advcl
demander VERB xcomp
dirigea VERB advcl
disait VERB acl:relcl
ont VERB parataxis
arrêtent VERB parataxis
décorés VERB acl
connaissent VERB parataxis
occupé VERB advcl
nommé VERB conj
distinguer VERB advcl
créé VERB acl:relcl
chargé VERB acl
prendre VERB xcomp
placé VERB acl
obtin

In [69]:
for token in doc:
    if token.pos_ == 'NOUN':
        print(token.text, token.pos_, token.dep_)

milieu NOUN obl:mod
mois NOUN nmod
juillet NOUN nmod
année NOUN nmod
voitures NOUN nmod
circulation NOUN obl:arg
places NOUN nmod
milords NOUN obj
rue NOUN obj
université NOUN nmod
homme NOUN obj
taille NOUN nmod
uniforme NOUN nmod
capitaine NOUN nmod
garde NOUN nmod
nombre NOUN nmod
parisiens NOUN nmod
habits NOUN obl:mod
femmes NOUN obl:arg
goûts NOUN nmod
impressionnées NOUN advcl
aspect NOUN obl:mod
bonnet NOUN nmod
poil NOUN nmod
harnais NOUN conj
physionomie NOUN nsubj
capitaine NOUN nmod
légion NOUN obl:arg
contentement NOUN obj
teint NOUN obj
figure NOUN conj
auréole NOUN obl:mod
richesse NOUN nsubj
commerce NOUN obl:mod
front NOUN obl:arg
boutiquiers NOUN nmod
élus NOUN nmod
adjoint NOUN obj
arrondissement NOUN nmod
ruban NOUN nsubj
légion NOUN nmod
honneur NOUN nmod
poitrine NOUN obl:arg
prussienne NOUN obl:arg
coin NOUN obl:mod
milord NOUN nmod
homme NOUN nmod
regard NOUN obj
passants NOUN obl:arg
sourires NOUN nsubj
yeux NOUN obl:mod
milord NOUN nmod
partie NOUN obl:mod
rue

feu NOUN obj
plaisir NOUN obj
cure NOUN nmod
jouissance NOUN obj
famille NOUN obj
horreurs NOUN nmod
faim NOUN nmod
dépravation NOUN nmod
misère NOUN nmod
travail NOUN obl:arg
vie NOUN nmod
magistrat NOUN obj
commissaire NOUN appos
police NOUN nmod
vie NOUN obj
combinaisons NOUN obj
intérêt NOUN obl:arg
monstre NOUN nsubj
regret NOUN obj
moitié NOUN nsubj
société NOUN nmod
vie NOUN obj
ami NOUN obl:mod
- NOUN obl:mod
temps NOUN obl:mod
ans NOUN obl:mod
notaires NOUN conj
clients NOUN obl:arg
adversaires NOUN nsubj
clients NOUN nmod
monsieur NOUN nmod
fils NOUN nmod
avocat NOUN nsubj:pass
défense NOUN obj
souriant NOUN xcomp
victorin NOUN obj
mal NOUN advmod
baronne NOUN obj
manque NOUN nmod
religion NOUN nmod
médecin NOUN obj
envahissement NOUN conj
finance NOUN nmod
chose NOUN nsubj
égoïsme NOUN nsubj
argent NOUN nsubj
supériorités NOUN obj
noblesse NOUN obj
talent NOUN obj
services NOUN nmod
état NOUN obl:arg
loi NOUN nsubj
argent NOUN obl:arg
étalon NOUN nmod
base NOUN obl:mod
capac

In [70]:
# See Keyword in context 
import re
context = re.findall(r'.{0,50}\blampe.{0,80}|.{0,50}\blustre.{0,80}', text)
context

for sent in context:
    doc2 = nlp(sent)
    for token in doc2:
        if 'VERB' in token.pos_:
            print(token.text, token.pos_, token.dep_)
        elif token.pos_ == 'NOUN':
            print(token.text, token.pos_, token.dep_)
    print ('*'*15)

madame NOUN nsubj
dit VERB ROOT
montrait VERB conj
lampe NOUN obj
lustre NOUN nmod
dédoré VERB acl
cordes NOUN nmod
tapis NOUN nmod
haillons NOUN dep
opulence NOUN nmod
faisa VERB acl:relcl
***************
plâtre NOUN nmod
jouant VERB acl
bronze NOUN obj
lustre NOUN obj
mis VERB xcomp
couleur NOUN obl:arg
bobèches NOUN obl:arg
cristal NOUN obl:mod
fondu VERB ROOT
tapis NOUN obj
***************
trouva VERB ROOT
travaillant VERB xcomp
lueur NOUN obl:arg
lampe NOUN nmod
clarté NOUN nsubj
augmentait VERB acl:relcl
passant VERB advcl
travers NOUN fixed
globe NOUN obl:arg
eau NOUN nmod
héros NOUN nmod
***************
aidé VERB ROOT
femme NOUN obj
nettoyer VERB xcomp
meubles NOUN obj
rendre VERB acl
lustre NOUN obl:arg
objets NOUN obl:mod
savonnant NOUN acl
brossant VERB advcl
époussetant VERB advcl
***************
dustrie NOUN obl:mod
flambeaux NOUN nmod
bras NOUN nmod
lustre NOUN nsubj
pendule NOUN nmod
appartenaient VERB ROOT
genre NOUN obl:arg
table NOUN obj
milieu NOUN nmod
*************

# Keyword in context and N-grams

Source: https://programminghistorian.org/en/lessons/keywords-in-context-using-n-grams#from-text-to-n-grams

In [74]:
# Given a list of words and a number n, return a list
# of n-grams.

def getNGrams(wordlist, n):
    return [wordlist[i:i+n] for i in range(len(wordlist)-(n-1))]

stopwords = open(r'C:\Users\lakj\Documents\GitHub\Lighting in French Literature\data\stopwords\fr_stopwords.txt','r', encoding='utf-8-sig').read().split('\n')
text = ' '.join(df[df['author'] == 'Balzac']['clean_text'])
wordlist = re.findall(r'\b\S+\b', text)
wordlist_filtered = [w for w in wordlist if w not in stopwords]

bigrams = getNGrams(wordlist_filtered, 3)

words = ['lumière', 'clarté']

bi_grams_filtered = []
for i in words:
    bi_grams  = [w for w in bigrams if i in w]
    bi_grams_filtered.append(bi_grams)
    
    
bi_grams_filtered = [x for y in bi_grams_filtered for x in y]
bi_grams_filtered

[['cabaret', 'malpropre', 'lumière'],
 ['malpropre', 'lumière', 'boutique'],
 ['lumière', 'boutique', 'lingère'],
 ['matérielles', 'l’air', 'lumière'],
 ['l’air', 'lumière', 'locaux'],
 ['lumière', 'locaux', 'moraliste'],
 ['attestaient', 'force', 'lumière'],
 ['force', 'lumière', 'entrait'],
 ['lumière', 'entrait', 'fois'],
 ['noir', 'rouge', 'lumière'],
 ['rouge', 'lumière', 'buste'],
 ['lumière', 'buste', 'd’athlète'],
 ['hasard', 'rayonne', 'lumière'],
 ['rayonne', 'lumière', 'divine'],
 ['lumière', 'divine', 'venu'],
 ['racines', 'perdues', 'lumière'],
 ['perdues', 'lumière', 'glissant'],
 ['lumière', 'glissant', 'sillon'],
 ['naissais', 'vie', 'lumière'],
 ['vie', 'lumière', 'milieu'],
 ['lumière', 'milieu', 'acclamations'],
 ['fut', 'illuminé', 'lumière'],
 ['illuminé', 'lumière', 'intérieure'],
 ['lumière', 'intérieure', 'voyant'],
 ['peyrade', 'mit', 'lumière'],
 ['mit', 'lumière', 'fenêtre'],
 ['lumière', 'fenêtre', 'chose'],
 ['obtenir', 'moindre', 'lumière'],
 ['moindre', '