## Import

In [None]:
import pandas as pd
import spacy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import igraph as ig
import nltk
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from scipy.stats import spearmanr, ks_2samp

## Config

In [None]:
# Pickled pandas DataFrame with the text passages; loaded with
# `pd.read_pickle` in the "Load data" section.
dataframe_input_path = "data/texts.p"
# Directory holding the per-language positive/negative word lists. The value
# is prepended verbatim to the file names, hence the trailing slash.
sentiment_dir = "data/sentiment/" # use / at the end

In [None]:
# Register tqdm with pandas so that `progress_apply` (used further down) is
# available on Series and DataFrames.
tqdm.pandas()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names. Prefer the new name when it exists and
# fall back to the historical one so older installations keep working.
_whitegrid_candidates = [
    style_name
    for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
    if style_name in plt.style.available
]
# If neither name is known, pass the original one so the error stays visible.
plt.style.use(_whitegrid_candidates[0] if _whitegrid_candidates else "seaborn-whitegrid")

# Font handling for PostScript/PDF export.
plt.rc('ps', fonttype=42)
plt.rc('pdf', fonttype=42)
plt.rcParams.update({'font.size': 20})
plt.rcParams['ps.useafm'] = True
plt.rcParams['pdf.use14corefonts'] = True
# Render minus signs as ASCII hyphens instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Languages covered by the analysis; the names double as keys of
# `nlp_to_use` and `sentiment_lexica` and as values of the "language" column.
languages = ["French", "German", "Italian", "Portuguese", "Spanish"]

## Load data

### Texts

In [None]:
# Load the corpus DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
texts_df = pd.read_pickle(dataframe_input_path)

### Spacy NLP

In [None]:
# Load one small spaCy news pipeline per language (same order as before:
# German, French, Spanish, Italian, Portuguese).
de_nlp, fr_nlp, es_nlp, it_nlp, pt_nlp = (
    sc.load(model_name)
    for model_name in (
        "de_core_news_sm",
        "fr_core_news_sm",
        "es_core_news_sm",
        "it_core_news_sm",
        "pt_core_news_sm",
    )
)

# Lookup from the value of the "language" column to the matching pipeline.
nlp_to_use = dict(
    French=fr_nlp,
    German=de_nlp,
    Italian=it_nlp,
    Portuguese=pt_nlp,
    Spanish=es_nlp,
)

### Sentiment dictionaries

In [None]:
# sentiment_lexica[language]["neg" | "pos"] -> list of words, one per line of
# the corresponding "<negative|positive>_words_<language>.txt" file.
sentiment_lexica = {}
for lang in languages:
    sentiment_lexica[lang] = {}
    for polarity_key, file_prefix in (("neg", "negative"), ("pos", "positive")):
        lexicon_path = "{}{}_words_{}.txt".format(sentiment_dir, file_prefix, lang.lower())
        # Decode explicitly as UTF-8: the word lists contain accented
        # characters, and the platform default encoding is not UTF-8
        # everywhere (e.g. on Windows).
        # NOTE(review): assumes the lexicon files are stored as UTF-8 - confirm.
        with open(lexicon_path, "r", encoding="utf-8") as lexicon_file:
            sentiment_lexica[lang][polarity_key] = lexicon_file.read().splitlines()

## Prepare data

### Fix authors

In [None]:
# Exact-match corrections that merge spelling variants of the same author.
author_fix = {}
author_fix["Bachiller D. P. Gatell"] = "Bachiller D. P. Gatell."
author_fix["Eliza Haywood"] = "Eliza Fowler Haywood"

corrected_authors = texts_df["author"].replace(author_fix)
texts_df["author"] = corrected_authors

### Fix language

In [None]:
# Collapse the "Spanish; Castilian" label into "Spanish" so that it matches
# the entry in `languages`.
texts_df["language"] = texts_df["language"].replace("Spanish; Castilian", "Spanish")

### Fix years

In [None]:
def _strip_date_suffix(raw_date):
    """Return the part of ``raw_date`` before the first "-", " [" or " bzw."."""
    for separator in ("-", " [", " bzw."):
        raw_date = raw_date.split(separator)[0]
    return raw_date


# The three truncations are applied in the same order as before, but in a
# single pass over the column.
texts_df["date"] = texts_df["date"].apply(_strip_date_suffix)

### Reduce to defined languages

In [None]:
# Keep only the configured languages. The explicit copy makes `texts_df` an
# independent frame, so the columns that later cells add are not written to a
# slice of the unfiltered data (which triggers SettingWithCopyWarning).
texts_df = texts_df[texts_df["language"].isin(languages)].copy()

### Calculate Sentiment

In [None]:
def analyze_sentiment(text, nl, pl):
    """Score the polarity of ``text`` against a negative and a positive word list.

    The text is tokenized with ``nltk.word_tokenize``; every occurrence of a
    lower-cased lexicon entry among the tokens counts as one hit. A word that
    is listed several times in a lexicon is counted once per listing.

    Args:
        text: The text to score.
        nl: Iterable of negative words.
        pl: Iterable of positive words.

    Returns:
        ``(positive - negative) / (positive + negative)``, a value in
        ``[-1, 1]``, or ``0`` when no lexicon word occurs in the text.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    # Count every token once up front; the previous version rescanned the full
    # token list for each lexicon entry.
    # NOTE(review): lexicon entries are lower-cased but the tokens are not, so
    # capitalised tokens never match - confirm whether that is intended.
    token_counts = Counter(nltk.word_tokenize(text))
    num_negative = sum(token_counts[nw.lower()] for nw in nl)
    num_positive = sum(token_counts[pw.lower()] for pw in pl)

    total_hits = num_positive + num_negative
    if total_hits == 0:
        # No sentiment-bearing word found: neutral score.
        return 0
    return (num_positive - num_negative) / total_hits

In [None]:
# Start from a float column so the scores are stored without dtype juggling.
texts_df["sentiment"] = 0.0
for language in languages:
    language_mask = texts_df["language"] == language
    lang_df = texts_df.loc[language_mask]
    neg_lexicon = sentiment_lexica[language]["neg"]
    pos_lexicon = sentiment_lexica[language]["pos"]
    scores = lang_df["text"].progress_apply(analyze_sentiment, args=(neg_lexicon, pos_lexicon))
    # Write through `.loc` on the frame itself. The previous
    # `texts_df["sentiment"].update(scores)` modified a temporary Series,
    # which is silently lost when pandas Copy-on-Write is active.
    # `scores` has exactly the rows selected by the mask, in the same order,
    # so a positional assignment is safe even with a non-unique index.
    texts_df.loc[language_mask, "sentiment"] = scores.to_numpy()

## Dataset statistics

In [None]:
# Print per-language corpus statistics: authors, anonymous journals, journals,
# text passages, distinct topics and the covered range of years.
for language in languages:
    lang_df = texts_df.loc[texts_df["language"] == language]
    journal_group = lang_df.groupby("filename")
    authors = lang_df["author"].unique()

    # "Anonym" is a placeholder, not an author: exclude it from the author
    # count and instead count the journals whose passages are all anonymous.
    num_authors = len(authors)
    num_anonymus = 0
    if "Anonym" in authors:
        num_authors -= 1
        num_anonymus = journal_group.apply(
            lambda journal: 1 if all(journal["author"] == "Anonym") else 0
        ).sum()

    # Long table of the topic values: one row per (passage, topic) pair.
    topics = (
        lang_df["topics"]
        .apply(lambda x: pd.Series(list(x)))
        .reset_index()
        .melt(id_vars="index")
        .dropna()[["index", "value"]]
        .set_index("index")
    )
    years = lang_df["date"].unique()

    print(language)
    print("num authors:", num_authors)
    print("num_anonymous:", num_anonymus)
    print("num journals:", len(journal_group))
    print("num text passages:", lang_df.shape[0])
    print("num topics:", len(np.unique(topics)))
    print("years:", np.min(years), np.max(years))
    print()
    print()

## Text Sentiment

In [None]:
# Cast the sentiment column to float for the plots and statistics below.
texts_df["sentiment"] = texts_df["sentiment"].astype(float)

### Years

In [None]:
# One line plot per language: sentiment over the "date" column.
for language in languages:
    language_df = texts_df.loc[texts_df["language"] == language]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    sns.lineplot(x="date", y="sentiment", data=language_df, ax=ax)
    # Draw once so that the tick labels exist before they are rotated.
    plt.draw()
    ax.set(xlabel="Years", ylabel="Mean Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

    plt.tight_layout()
    plt.show()
    plt.close()

### Narrative forms

In [None]:
# One point plot per language: standardized sentiment per narrative form.
for language in languages:
    language_df = texts_df.loc[texts_df["language"] == language].copy()

    # z-score within the language: centre first, then scale by the standard
    # deviation of the centred values.
    centred_sentiment = language_df["sentiment"] - language_df["sentiment"].mean()
    language_df["sentiment"] = centred_sentiment
    language_df["sentiment"] = centred_sentiment / centred_sentiment.std()

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.pointplot(x="ndf", y="sentiment", data=language_df, ax=ax, marker="s", join=False)
    # Draw once so that the tick labels exist before they are rotated.
    plt.draw()
    ax.set(xlabel="Narrative Form", ylabel="Mean Standardized Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

    plt.tight_layout()
    plt.show()
    plt.close()

### Topics

In [None]:
# Expand the per-passage topic collections into a long table: one row per
# (passage index, topic value) pair, indexed by the passage index.
topic_columns = texts_df["topics"].apply(lambda x: pd.Series(list(x)))
long_topics = topic_columns.reset_index().melt(id_vars="index").dropna()
topics = long_topics[["index", "value"]].set_index("index")

# Attach sentiment and language of the owning passage to every topic row.
t_s_df = topics.merge(
    texts_df[["sentiment", "language"]],
    left_index=True,
    right_index=True,
)

In [None]:
# One point plot per language: standardized sentiment per topic.
for language in languages:
    language_df = t_s_df.loc[t_s_df["language"] == language].copy()

    # z-score within the language: centre first, then scale by the standard
    # deviation of the centred values.
    centred_sentiment = language_df["sentiment"] - language_df["sentiment"].mean()
    language_df["sentiment"] = centred_sentiment
    language_df["sentiment"] = centred_sentiment / centred_sentiment.std()

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.pointplot(x="value", y="sentiment", data=language_df, ax=ax, marker="s", join=False)
    # Draw once so that the tick labels exist before they are rotated.
    plt.draw()
    ax.set(xlabel="Topic", ylabel="Mean Standardized Sentiment")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

    plt.tight_layout()
    plt.show()
    plt.close()

## Sentiment Networks

## Sentiment Word Networks

### Lemmatize

In [None]:
def lemmatize(row):
    """Return the space-joined lemmas of ``row["text"]``.

    The spaCy pipeline is picked from ``nlp_to_use`` by ``row["language"]``;
    rows in a language without a pipeline yield an empty string.
    """
    pipeline = nlp_to_use.get(row["language"])
    if pipeline is None:
        return ""
    return " ".join(token.lemma_ for token in pipeline(row["text"]))


# Row-wise application with a progress bar; the result is one string per row.
texts_df["tokens"] = texts_df.progress_apply(lemmatize, axis=1)

### Create graphs

In [None]:
# graphs[language] is meant to hold one word network per language.
# TODO(review): this cell stops after building the document-term table and
# never stores anything in `graphs`, so every cell below has nothing to
# iterate over. The graph construction still has to be added here.
graphs = {}
for language in languages:
    language_df = texts_df[texts_df["language"] == language]
    # Ignore terms that occur in more than 90% of the documents.
    vectorizer = CountVectorizer(max_df=0.9)
    frequencies = vectorizer.fit_transform(language_df["tokens"])

    # `get_feature_names` was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # `fit_transform` returns a SciPy sparse matrix, which the plain DataFrame
    # constructor cannot turn into a documents x terms table.
    frequencies_df = pd.DataFrame.sparse.from_spmatrix(frequencies, columns=feature_names)
    
    
    

### Calculate metrics

In [None]:
# Per-language network metrics:
#   degree_results[language]        -> {"all" | "neg" | "pos": list of degrees}
#   lcc_results[language]           -> {"all" | "neg" | "pos": local clustering}
#   assortativity_results[language] -> {"degree" | "sentiment": coefficient}
#   centrality_results_df           -> one row per word and language
degree_results = {}
lcc_results = {}
assortativity_results = {}
centrality_frames = []
for language, graph in graphs.items():
    # Vertex ids of the negative / positive words, computed once and converted
    # to plain Python ints for igraph.
    sentiment_labels = np.array(graph.vs["sentiment"])
    negative_ids = np.where(sentiment_labels == "negative")[0].tolist()
    positive_ids = np.where(sentiment_labels == "positive")[0].tolist()

    # degree
    # The inner dict is created here; assigning into
    # `degree_results[language][...]` without it raises KeyError.
    degree_results[language] = {
        "all": graph.degree(),
        "neg": graph.degree(negative_ids),
        "pos": graph.degree(positive_ids),
    }

    # centralities
    centrality_df = pd.DataFrame()
    centrality_df["word"] = graph.vs["name"]
    centrality_df["degree"] = graph.degree()
    centrality_df["betweenness"] = graph.betweenness(directed=False)
    centrality_df["closeness"] = graph.closeness()
    centrality_df["language"] = language
    centrality_frames.append(centrality_df)

    # clustering coefficient
    lcc_results[language] = {
        "all": graph.transitivity_local_undirected(),
        "neg": graph.transitivity_local_undirected(negative_ids),
        "pos": graph.transitivity_local_undirected(positive_ids),
    }

    # assortativity
    # NOTE(review): the vertex attribute read above is "sentiment", here it is
    # "sent" - confirm that a numeric "sent" attribute exists on the graphs.
    assortativity_results[language] = {
        "degree": graph.assortativity_degree(directed=False),
        "sentiment": graph.assortativity("sent", directed=False),
    }

# `DataFrame.append` was removed in pandas 2.0; concatenate once at the end.
# `pd.concat` rejects an empty list, so keep the empty frame in that case.
centrality_results_df = pd.concat(centrality_frames) if centrality_frames else pd.DataFrame()

### Degree

#### CDF Plots

In [None]:
# Cumulative degree distribution per language: all words (black), negative
# words (red) and positive words (green) on shared axes.
for language in languages:
    fig, ax = plt.subplots(figsize=(5, 5))

    degree_all, degree_neg, degree_pos = (
        degree_results[language][subset] for subset in ("all", "neg", "pos")
    )

    curves = ((degree_all, "black"), (degree_neg, "red"), (degree_pos, "green"))
    for degrees, curve_color in curves:
        sns.kdeplot(degrees, color=curve_color, cumulative=True, ax=ax)

    ax.set(ylabel="CDF", xlabel="Degree")

    plt.tight_layout()
    plt.show()
    plt.close()

#### KS tests

In [None]:
# Two-sample Kolmogorov-Smirnov test: degrees of negative vs. positive words.
for language in languages:
    language_degrees = degree_results[language]
    degree_neg = language_degrees["neg"]
    degree_pos = language_degrees["pos"]

    print(language)
    # The trailing blank line separates the languages in the output.
    print(ks_2samp(degree_neg, degree_pos), end="\n\n")

### Centralities

### Local clustering coefficient

#### Mean and median

In [None]:
# Mean and median local clustering coefficient per language and polarity.
for language in languages:
    lcc_all = lcc_results[language]["all"]
    lcc_neg = lcc_results[language]["neg"]
    lcc_pos = lcc_results[language]["pos"]

    # igraph reports NaN for vertices with fewer than two neighbours (default
    # mode of `transitivity_local_undirected`). A single NaN turns np.mean and
    # np.median into NaN, so use the NaN-aware variants; without NaNs they
    # give the same values as before.
    print(language)
    print("all mean:", np.nanmean(lcc_all))
    print("all median:", np.nanmedian(lcc_all))
    print("negative mean:", np.nanmean(lcc_neg))
    print("negative median:", np.nanmedian(lcc_neg))
    print("positive mean:", np.nanmean(lcc_pos))
    print("positive median:", np.nanmedian(lcc_pos))
    print()

#### KS tests

In [None]:
# Two-sample Kolmogorov-Smirnov test: local clustering of negative vs.
# positive words.
for language in languages:
    lcc_neg = np.asarray(lcc_results[language]["neg"], dtype=float)
    lcc_pos = np.asarray(lcc_results[language]["pos"], dtype=float)
    # igraph reports NaN for vertices with fewer than two neighbours; drop
    # those undefined values so they cannot distort the test statistic.
    lcc_neg = lcc_neg[~np.isnan(lcc_neg)]
    lcc_pos = lcc_pos[~np.isnan(lcc_pos)]

    print(language)
    if lcc_neg.size and lcc_pos.size:
        print(ks_2samp(lcc_neg, lcc_pos))
    else:
        # ks_2samp rejects empty samples; report instead of crashing.
        print("KS test skipped: no defined clustering coefficients")
    print()

### Assortativity

In [None]:
# Report degree and sentiment assortativity of each language's network.
for language in languages:
    language_assortativity = assortativity_results[language]
    print(language)
    for label, metric in (
        ("degree assortativity:", "degree"),
        ("sentiment assortativity:", "sentiment"),
    ):
        print(label, language_assortativity[metric])
    print()