## Import

In [None]:
import pandas as pd
import spacy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import igraph as ig
import nltk
import pickle
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from scipy.stats import spearmanr, ks_2samp
from sklearn.metrics.pairwise import cosine_similarity
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF
from wordcloud import WordCloud
from stop_words import get_stop_words

In [None]:
from statsmodels.stats.proportion import proportions_ztest

## Config

In [None]:
# Pickle file that is loaded into `texts_df` below.
dataframe_input_path = "data/texts.p"
# Directory containing the per-language "negative_words_<lang>.txt" and
# "positive_words_<lang>.txt" files; it is used as a plain string prefix.
sentiment_dir = "data/sentiment/" # use / at the end

In [None]:
# Register tqdm with pandas so that `progress_apply` can be used below.
tqdm.pandas()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use the new name when it exists
# and fall back to the legacy one on older installations.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)

# Font handling for the PostScript/PDF output plus a larger default font size.
plt.rc('ps',fonttype = 42)
plt.rc('pdf',fonttype = 42)
plt.rcParams.update({'font.size': 20})
plt.rcParams['ps.useafm'] = True
plt.rcParams['pdf.use14corefonts'] = True
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Languages that are analysed; the same names select the lexicon files, the
# spaCy pipelines and the rows of `texts_df`.
languages = ["French", "German", "Italian", "Portuguese", "Spanish"]

## Load data

### Texts

In [None]:
# NOTE: unpickling can execute arbitrary code - only load a file you trust.
texts_df = pd.read_pickle(dataframe_input_path)

### Spacy NLP

In [None]:
# spaCy model to load for every analysed language.
_spacy_model_names = {
    "French": "fr_core_news_sm",
    "German": "de_core_news_sm",
    "Italian": "it_core_news_sm",
    "Portuguese": "pt_core_news_sm",
    "Spanish": "es_core_news_sm",
}

# Language name -> loaded spaCy pipeline.
nlp_to_use = {
    language_name: sc.load(model_name)
    for language_name, model_name in _spacy_model_names.items()
}

# Keep the individual pipeline names available as well.
de_nlp = nlp_to_use["German"]
fr_nlp = nlp_to_use["French"]
es_nlp = nlp_to_use["Spanish"]
it_nlp = nlp_to_use["Italian"]
pt_nlp = nlp_to_use["Portuguese"]

### Sentiment dictionaries

In [None]:
def _read_word_list(path):
    """Return the lines of the text file at *path* as a list of strings.

    The file is decoded as UTF-8 explicitly so that accented words are read
    the same way on every platform instead of with the locale's default.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# sentiment_lexica[language] -> {"neg": [words], "pos": [words]}
sentiment_lexica = {}
for lang in languages:
    lang_key = lang.lower()
    sentiment_lexica[lang] = {
        "neg": _read_word_list("{}negative_words_{}.txt".format(sentiment_dir, lang_key)),
        "pos": _read_word_list("{}positive_words_{}.txt".format(sentiment_dir, lang_key)),
    }

## Prepare data

### Fix authors

In [None]:
# Author spelling found in the data -> spelling to use instead.
author_fix = {
    "Bachiller D. P. Gatell": "Bachiller D. P. Gatell.",
    "Eliza Haywood": "Eliza Fowler Haywood",
}
# Only exact matches of a key are replaced; all other authors stay unchanged.
texts_df["author"] = texts_df["author"].replace(author_fix)

### Fix language

In [None]:
# Fold the "Spanish; Castilian" label into "Spanish" so it matches `languages`.
texts_df["language"] = texts_df["language"].replace("Spanish; Castilian", "Spanish")

### Fix years

In [None]:
def _strip_date_suffixes(raw_date):
    """Cut *raw_date* at the first "-", then at " [", then at " bzw."."""
    for separator in ("-", " [", " bzw."):
        raw_date = raw_date.partition(separator)[0]
    return raw_date


# Keep only the leading part of every date string.
texts_df["date"] = texts_df["date"].apply(_strip_date_suffixes)

### Reduce to defined languages

In [None]:
# Keep only the analysed languages. Take an explicit copy: new columns are
# added to this frame later, which would otherwise write to a slice of the
# original frame and trigger pandas' SettingWithCopy warning.
texts_df = texts_df[texts_df["language"].isin(languages)].copy()

### Calculate Sentiment

In [None]:
def analyze_sentiment(text, nl, pl):
    """Score the polarity of *text* against a negative and a positive lexicon.

    The text is tokenized with ``nltk.word_tokenize`` and the tokens are
    compared case-insensitively with the lexicon entries.

    Args:
        text: The raw text to score.
        nl: Iterable of negative words.
        pl: Iterable of positive words.

    Returns:
        ``(positive - negative) / (positive + negative)``, where both counts
        are the number of token matches, or ``0`` when no lexicon word occurs
        in the text.
    """
    from collections import Counter

    # Lower-case the tokens too: the lexicon entries are lower-cased below, so
    # capitalised tokens (sentence starts, German nouns) could never match.
    # Counting once also avoids scanning the token list per lexicon word.
    token_counts = Counter(token.lower() for token in nltk.word_tokenize(text))
    num_negative = sum(token_counts[nw.lower()] for nw in nl)
    num_positive = sum(token_counts[pw.lower()] for pw in pl)

    total = num_positive + num_negative
    if total == 0:
        return 0
    return (num_positive - num_negative) / total

In [None]:
# Start from a float column: the scores are ratios, not integers.
texts_df["sentiment"] = 0.0
for language in languages:
    language_mask = texts_df["language"] == language
    neg_lexicon = sentiment_lexica[language]["neg"]
    pos_lexicon = sentiment_lexica[language]["pos"]
    scores = texts_df.loc[language_mask, "text"].progress_apply(
        analyze_sentiment, args=(neg_lexicon, pos_lexicon)
    )
    # Assign through .loc on the frame itself. Updating the column object
    # (texts_df["sentiment"].update(...)) is chained assignment, which does
    # not write back to the frame under pandas' copy-on-write semantics.
    # The scores are in the same row order as the mask, so assign by position.
    texts_df.loc[language_mask, "sentiment"] = scores.to_numpy()

## Dataset statistics

In [None]:
# Print per-language corpus statistics.
for language in languages:
    lang_df = texts_df.loc[texts_df["language"] == language]
    journal_group = lang_df.groupby("filename")
    authors = lang_df["author"].unique()

    # "Anonym" is not counted as an author; instead count the files in which
    # every passage is attributed to "Anonym".
    if "Anonym" in authors:
        num_authors = len(authors) - 1
        num_anonymous = journal_group["author"].apply(
            lambda file_authors: int((file_authors == "Anonym").all())
        ).sum()
    else:
        num_authors = len(authors)
        num_anonymous = 0

    # One row per (passage, topic) pair.
    topic_columns = lang_df["topics"].apply(lambda x: pd.Series(list(x)))
    long_topics = topic_columns.reset_index().melt(id_vars="index").dropna()
    topics = long_topics[["index", "value"]].set_index("index")

    years = lang_df["date"].unique()

    print(language)
    print("num authors:", num_authors)
    print("num_anonymous:", num_anonymous)
    print("num journals:", len(journal_group))
    print("num text passages:", lang_df.shape[0])
    print("num topics:", len(np.unique(topics)))
    print("years:", np.min(years), np.max(years))
    print()

## Text Sentiment

### Years

In [None]:
# One line plot per language: sentiment over the "date" column.
for language in languages:
    yearly_df = texts_df[texts_df["language"] == language]
    fig, ax = plt.subplots(figsize=(10, 2.5))
    sns.lineplot(x="date", y="sentiment", data=yearly_df, ax=ax)
    # Render once so the tick labels exist before they are re-applied rotated.
    plt.draw()
    ax.set(xlabel="Years", ylabel="Mean Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()
    plt.show()
    plt.close()

### Narrative forms

In [None]:
# One point plot per language: standardized sentiment per narrative form.
for language in languages:
    language_df = texts_df[texts_df["language"] == language].copy()

    # z-score within the language: centre first, then scale by the std.
    centered = language_df["sentiment"] - language_df["sentiment"].mean()
    language_df["sentiment"] = centered / centered.std()

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.pointplot(x="ndf", y="sentiment", data=language_df, ax=ax, marker="s", join=False)
    # Render once so the tick labels exist before they are re-applied rotated.
    plt.draw()
    ax.set(xlabel="Narrative Form", ylabel="Mean Standardized Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()
    plt.show()
    plt.close()

### Topics

In [None]:
# One row per (passage, topic): spread every passage's topics over columns,
# melt them back into long form and drop the NaN padding.
topic_columns = texts_df["topics"].apply(lambda x: pd.Series(list(x)))
long_topics = topic_columns.reset_index().melt(id_vars="index").dropna()
topics = long_topics[["index", "value"]].set_index("index")

# Attach sentiment and language of the passage to each of its topics.
t_s_df = topics.merge(texts_df[["sentiment", "language"]], left_index=True, right_index=True)

In [None]:
# One point plot per language: standardized sentiment per topic.
for language in languages:
    language_df = t_s_df[t_s_df["language"] == language].copy()

    # z-score within the language: centre first, then scale by the std.
    centered = language_df["sentiment"] - language_df["sentiment"].mean()
    language_df["sentiment"] = centered / centered.std()

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.pointplot(x="value", y="sentiment", data=language_df, ax=ax, marker="s", join=False)
    # Render once so the tick labels exist before they are re-applied rotated.
    plt.draw()
    ax.set(xlabel="Topic", ylabel="Mean Standardized Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()
    plt.show()
    plt.close()

## Sentiment Networks

## Sentiment Word Networks

### Lemmatize

In [None]:
def lemmatize(row):
    """Return the lemmas of ``row["text"]`` joined by single spaces.

    The spaCy pipeline is chosen by ``row["language"]``; a language without a
    pipeline in ``nlp_to_use`` yields an empty string.
    """
    pipeline = nlp_to_use.get(row["language"])
    if pipeline is None:
        return ""
    return " ".join(token.lemma_ for token in pipeline(row["text"]))


texts_df["tokens"] = texts_df.progress_apply(lemmatize, axis=1)

### Create graphs

In [None]:
# For every language: cosine similarity between the document-count vectors of
# all sentiment words, plus a permutation p-value per word pair.
occurrences = {}
num_random_runs = 1000
for language in languages:
    language_df = texts_df[texts_df["language"] == language]
    stop_words = get_stop_words(language.lower())
    vectorizer = CountVectorizer(max_df=0.8, stop_words=stop_words)
    # Transpose so that rows are words and columns are text passages.
    frequencies = vectorizer.fit_transform(language_df["tokens"]).T

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    frequencies_df = pd.DataFrame(frequencies.toarray(), index=vocabulary)

    # Keep only words that occur in one of the two sentiment lexica.
    sentiment_words = set(sentiment_lexica[language]["neg"]) | set(sentiment_lexica[language]["pos"])
    frequencies_df = frequencies_df[frequencies_df.index.isin(sentiment_words)]

    # Upper triangle (diagonal included) in row-major order. The index pairs
    # are computed once and reused for every permutation.
    words = frequencies_df.index.to_numpy()
    upper_rows, upper_cols = np.triu_indices(len(words))

    observed = np.round(cosine_similarity(frequencies_df)[upper_rows, upper_cols], 5)
    pairwise_df = pd.DataFrame({
        "source": words[upper_rows],
        "target": words[upper_cols],
        "similarity": observed,
    })

    # Number of permutations whose similarity is at least the observed one.
    exceed_counts = np.zeros(len(observed), dtype=int)
    for rr in tqdm(range(num_random_runs)):
        # Shuffle the rows while the word labels keep their positions.
        shuffled = frequencies_df.sample(frac=1).to_numpy()
        random_similarity = np.round(cosine_similarity(shuffled)[upper_rows, upper_cols], 5)
        exceed_counts += random_similarity >= observed

    p_values_df = pairwise_df[["source", "target"]].copy()
    # Add-one estimate so that a p-value is never exactly zero.
    p_values_df["p_value"] = (exceed_counts + 1) / (num_random_runs + 1)
    occurrences[language] = p_values_df

In [None]:
# Build one undirected graph per language from the significant word pairs and
# keep its largest connected component.
graphs = {}
for language in languages:
    graph = ig.Graph(directed=False)
    pair_table = occurrences[language]
    significant_pair_df = pair_table[pair_table["p_value"] < 0.01]
    vertices = list(set(significant_pair_df["source"]) | set(significant_pair_df["target"]))

    negative_words = sentiment_lexica[language]["neg"]
    positive_words = sentiment_lexica[language]["pos"]
    for vertex in vertices:
        # The two checks are independent: a word listed in both lexica is
        # added once per lexicon.
        if vertex in negative_words:
            graph.add_vertex(vertex, sent=1, color="red", sentiment="negative")
        if vertex in positive_words:
            graph.add_vertex(vertex, sent=2, color="green", sentiment="positive")

    edge_columns = zip(
        significant_pair_df["source"],
        significant_pair_df["target"],
        significant_pair_df["p_value"],
    )
    for source_word, target_word, edge_p_value in edge_columns:
        graph.add_edge(source_word, target_word, p_value=edge_p_value)

    components = graph.clusters()
    # Print the size of every connected component on one line.
    print(language)
    for component in components:
        print(len(component), end=" ")
    print()

    graphs[language] = components.giant()

### Save graphs

In [None]:
# Persist the per-language graphs so the steps above need not be re-run.
with open("results/graphs.p", "wb") as handle:
    pickle.dump(graphs, handle)

### Load graphs

In [None]:
# Restore the per-language graphs written by the save step.
# NOTE: unpickling can execute arbitrary code - only load a file you trust.
with open("results/graphs.p", "rb") as handle:
    graphs = pickle.load(handle)

### Graph statistics

In [None]:
# Print node, edge and sentiment-share statistics for every graph.
for language in languages:
    language_graph = graphs[language]
    vertex_sentiments = language_graph.vs["sentiment"]

    num_nodes = len(language_graph.vs)
    num_positive_nodes = vertex_sentiments.count("positive")
    num_negative_nodes = vertex_sentiments.count("negative")

    print(language)
    print("num nodes:", num_nodes)
    print("num positive nodes:", num_positive_nodes, round(num_positive_nodes/num_nodes, 2))
    print("num negative nodes:", num_negative_nodes, round(num_negative_nodes/num_nodes, 2))
    print("num edges:", len(language_graph.es))
    print()

### Plot French graph

In [None]:
# Draw the French graph to a PDF with a Fruchterman-Reingold layout.
french_graph = graphs["French"]
visual_style = {
    "vertex_size": 20,
    "vertex_label": french_graph.vs["name"],
    "vertex_label_size": 15,
    "layout": french_graph.layout_fruchterman_reingold(),
    "bbox": (1400, 1000),
    "margin": 50,
}
ig.plot(french_graph, "results/plots/sentiment_word_network_french.pdf", **visual_style)
plt.close()

### Plot German graph

In [None]:
# Draw the German graph to a PDF with a Fruchterman-Reingold layout.
german_graph = graphs["German"]
visual_style = {
    "vertex_size": 20,
    "vertex_label": german_graph.vs["name"],
    "vertex_label_size": 15,
    "layout": german_graph.layout_fruchterman_reingold(niter=100000),
    "bbox": (1400, 1000),
    "margin": 50,
}
ig.plot(german_graph, "results/plots/sentiment_word_network_german.pdf", **visual_style)
plt.close()

### Plot Portuguese graph

In [None]:
# Draw the Portuguese graph to a PDF with a Fruchterman-Reingold layout.
portuguese_graph = graphs["Portuguese"]
visual_style = {
    "vertex_size": 20,
    "vertex_label": portuguese_graph.vs["name"],
    "vertex_label_size": 15,
    "layout": portuguese_graph.layout_fruchterman_reingold(niter=100000),
    "bbox": (1400, 1000),
    "margin": 50,
}
ig.plot(portuguese_graph, "results/plots/sentiment_word_network_portuguese.pdf", **visual_style)
plt.close()

### Most significant edges

In [None]:
# LaTeX table with the ten lowest-p-value edges of every language graph.
top_sig_list = []
for language in languages:
    language_graph = graphs[language]
    vertex_names = language_graph.vs["name"]
    edges_list = [
        {
            "source": vertex_names[edge.source],
            "target": vertex_names[edge.target],
            "p_value": edge["p_value"],
        }
        for edge in language_graph.es
    ]
    edges_df = pd.DataFrame(edges_list)

    top_edges = edges_df.sort_values("p_value").head(10).reset_index()
    top_sig_s = top_edges.apply(lambda x: "{} - {}".format(x["source"], x["target"]), axis=1)
    top_sig_s.name = language
    top_sig_list.append(top_sig_s)

top_sig_df = pd.concat(top_sig_list, axis=1)
print(top_sig_df.to_latex(index=False))

### Calculate metrics

In [None]:
# Per-language network metrics:
#   degree_results / lcc_results: {"all", "neg", "pos"} -> list of values
#   assortativity_results:        {"degree", "sentiment"} -> coefficient
#   centrality_results_df:        one row per word with its centralities
degree_results = {}
lcc_results = {}
assortativity_results = {}
centrality_frames = []
for language, graph in graphs.items():
    # Vertex ids of the negative and the positive words, computed once.
    sentiments = np.array(graph.vs["sentiment"])
    negative_ids = np.where(sentiments == "negative")[0]
    positive_ids = np.where(sentiments == "positive")[0]

    # degree
    degree_results[language] = {
        "all": graph.degree(),
        "neg": graph.degree(negative_ids),
        "pos": graph.degree(positive_ids),
    }

    # centralities
    centrality_df = pd.DataFrame()
    centrality_df["word"] = graph.vs["name"]
    centrality_df["sentiment"] = graph.vs["sentiment"]
    centrality_df["degree"] = graph.degree()
    centrality_df["betweenness"] = graph.betweenness(directed=False)
    centrality_df["closeness"] = graph.closeness()
    centrality_df["language"] = language
    centrality_frames.append(centrality_df)

    # clustering coefficient
    lcc_results[language] = {
        "all": graph.transitivity_local_undirected(mode="0"),
        "neg": graph.transitivity_local_undirected(negative_ids, mode="0"),
        "pos": graph.transitivity_local_undirected(positive_ids, mode="0"),
    }

    # assortativity
    assortativity_results[language] = {
        "degree": graph.assortativity_degree(directed=False),
        "sentiment": graph.assortativity("sent", directed=False),
    }

# DataFrame.append was removed in pandas 2.0; concatenate the per-language
# frames once instead (pd.concat rejects an empty list, hence the guard).
centrality_results_df = pd.concat(centrality_frames) if centrality_frames else pd.DataFrame()

### Degree

#### CDF Plots

In [None]:
# Cumulative KDE of the degrees: all words (black), negative (red),
# positive (green); one PDF per language.
for language in languages:
    fig, ax = plt.subplots(figsize=(5, 5))

    for subset, line_color in (("all", "black"), ("neg", "red"), ("pos", "green")):
        sns.kdeplot(degree_results[language][subset], color=line_color, cumulative=True, ax=ax)

    ax.set_ylabel("CDF")
    ax.set_xlabel("Degree")

    plt.tight_layout()
    plt.savefig("results/plots/cdf_degree_{}.pdf".format(language))
    plt.close()

#### KS tests

In [None]:
# Two-sample KS test: degrees of negative vs. positive words.
for language in languages:
    language_degrees = degree_results[language]

    print(language)
    print(ks_2samp(language_degrees["neg"], language_degrees["pos"]))
    print()

### Centralities

#### Positive/negative ratio

In [None]:
# Share of each sentiment among the 50 highest-ranked words per centrality.
for language in languages:
    language_centralities = centrality_results_df[centrality_results_df["language"] == language]

    print(language)
    for metric in ("degree", "betweenness", "closeness"):
        top_words = language_centralities.sort_values(metric, ascending=False).head(50)
        print(metric)
        print(top_words["sentiment"].value_counts() / 50)
    print()

#### Wordclouds

In [None]:
# For every language and centrality: word cloud of the 50 highest-ranked
# words, written as SVG and then converted to PDF.
for language in tqdm(languages):
    def color_words(word, font_size, position, orientation, random_state, font_path):
        """Return the "color" attribute of the graph vertex named *word*."""
        vertex_seq = graphs[language].vs
        return vertex_seq[vertex_seq["name"].index(word)]["color"]

    language_centralities = centrality_results_df[centrality_results_df["language"] == language]

    for metric in ("degree", "betweenness", "closeness"):
        top_words = language_centralities.sort_values(metric, ascending=False).head(50)
        # word -> centrality value, used as the word's weight in the cloud
        word_weights = top_words.set_index("word")[metric].to_dict()

        cloud = WordCloud(
            font_path="arial.ttf",
            width=2000,
            height=1000,
            background_color="white",
            color_func=color_words,
        ).generate_from_frequencies(word_weights)

        svg_path = "results/plots/wordcloud_{}_{}.svg".format(metric, language)
        pdf_path = "results/plots/wordcloud_{}_{}.pdf".format(metric, language)

        with open(svg_path, "w") as svg_file:
            svg_file.write(cloud.to_svg())

        renderPDF.drawToFile(svg2rlg(svg_path), pdf_path)

#### Proportion tests

In [None]:
# One-sided z-test per (centrality, language): does the share of negative
# words among the top-ranked words differ from their share in the whole graph?
top_n = 50
num_smaller_sig = 0
num_larger_sig = 0
num_smaller_non_sig = 0
num_larger_non_sig = 0
for centrality in ["degree", "betweenness", "closeness"]:
    for language in languages:
        language_df = centrality_results_df[centrality_results_df["language"] == language]
        top_sentiments = language_df.sort_values(centrality, ascending=False).head(top_n)["sentiment"]

        # The number of observations is the size of the sample actually drawn.
        # It has to be the denominator of the sample ratio AND the `nobs` of
        # the test; a fixed nobs=100 halves the tested proportion.
        nobs = len(top_sentiments)
        # Count by comparison so that "no negative word" yields 0, not KeyError.
        neg_count = int((top_sentiments == "negative").sum())
        neg_net_ratio = neg_count / nobs

        graph_sentiments = graphs[language].vs["sentiment"]
        neg_graph_ratio = graph_sentiments.count("negative") / graphs[language].vcount()

        # Test in the direction in which the sample deviates from the graph.
        alternative = "smaller" if neg_net_ratio < neg_graph_ratio else "larger"
        pvalue = proportions_ztest(count=neg_count, nobs=nobs, value=neg_graph_ratio, alternative=alternative)[1]

        if pvalue < 0.05:
            sig = "significant"
        else:
            sig = "non-significant"

        if alternative == "smaller" and sig == "significant": num_smaller_sig += 1
        if alternative == "larger" and sig == "significant": num_larger_sig += 1
        if alternative == "smaller" and sig == "non-significant": num_smaller_non_sig += 1
        if alternative == "larger" and sig == "non-significant": num_larger_non_sig += 1

        print(centrality, language, neg_net_ratio, round(neg_graph_ratio, 2), alternative, pvalue)
print()
print(num_smaller_sig)
print(num_larger_sig)
print(num_smaller_non_sig)
print(num_larger_non_sig)

#### Correlations

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the three centralities.
for language in tqdm(languages):
    is_language = centrality_results_df["language"] == language
    language_df = centrality_results_df.loc[is_language, ["degree", "betweenness", "closeness"]]
    language_df = language_df.rename(
        columns={"degree": "Degree", "betweenness": "Betweenness", "closeness": "Closeness"}
    )

    sns.pairplot(language_df, diag_kind="kde")
    plt.tight_layout()
    plt.savefig("results/plots/centrality_corr_{}.pdf".format(language))
    plt.close()

In [None]:
# Spearman rank correlation between every pair of centralities.
for language in languages:
    language_df = centrality_results_df[centrality_results_df["language"] == language]
    degree = language_df["degree"]
    betweenness = language_df["betweenness"]
    closeness = language_df["closeness"]

    print(language)
    print("deg2bet", spearmanr(degree, betweenness))
    print("deg2clo", spearmanr(degree, closeness))
    print("bet2clo", spearmanr(betweenness, closeness))
    print()

### Local clustering coefficient

#### CDF plots

In [None]:
# Cumulative KDE of the local clustering coefficients: all words (black),
# negative (red), positive (green); one PDF per language.
for language in languages:
    fig, ax = plt.subplots(figsize=(5, 5))

    for subset, line_color in (("all", "black"), ("neg", "red"), ("pos", "green")):
        sns.kdeplot(lcc_results[language][subset], color=line_color, cumulative=True, ax=ax)

    ax.set_ylabel("CDF")
    ax.set_xlabel("Local Clustering Coefficient")

    plt.tight_layout()
    plt.savefig("results/plots/cdf_lcc_{}.pdf".format(language))
    plt.close()

#### Mean and median

In [None]:
# Mean and median local clustering coefficient per word group.
for language in languages:
    print(language)
    for label, subset in (("all", "all"), ("negative", "neg"), ("positive", "pos")):
        values = lcc_results[language][subset]
        print("{} mean:".format(label), np.mean(values))
        print("{} median:".format(label), np.median(values))
    print()

#### KS tests

In [None]:
# Two-sample KS test: clustering coefficients of negative vs. positive words.
for language in languages:
    language_lcc = lcc_results[language]

    print(language)
    print(ks_2samp(language_lcc["neg"], language_lcc["pos"]))
    print()

### Assortativity

In [None]:
# Print the degree and sentiment assortativity of every language graph.
for language in languages:
    language_assortativity = assortativity_results[language]

    print(language)
    print("degree assortativity:", language_assortativity["degree"])
    print("sentiment assortativity:", language_assortativity["sentiment"])
    print()