## Introduction

In a world where the negative seems to come out more easily than the positive, one wonders whether this has an impact on the population. To investigate, we can use literature as a barometer, since it gives us an idea of which emotion is most prevalent in the population.
We ask: which lexical field is used more in French literature, that of love or that of hate?

# Dataprocessing

## Config

In [None]:
# Set to True if working in Amazon SageMaker
SAGEMAKER = False

In [None]:
%%capture

import os

APPS_HOME      = os.getcwd() + "/apps"

SPARK_VERSION  = "3.0.0"
HADOOP_VERSION = "2.7"
AUT_VERSION    = "0.91.0"
JAVA_VERSION   = "11"

SPARK_HADOOP_VERSION = "spark-{}-bin-hadoop{}".format(SPARK_VERSION, HADOOP_VERSION)

if SAGEMAKER:
    !sudo amazon-linux-extras install java-openjdk11 -y
    os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-11.0.16.0.8-1.amzn2.0.1.x86_64"
else:
    !apt-get install openjdk-"$JAVA_VERSION"-jdk-headless
    os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-{}-openjdk-amd64".format(JAVA_VERSION)

!pip install -q findspark

!wget https://archive.apache.org/dist/spark/spark-"$SPARK_VERSION"/"$SPARK_HADOOP_VERSION".tgz
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION".zip
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION"-fatjar.jar

!tar -xf "$SPARK_HADOOP_VERSION".tgz
!mkdir -p "$APPS_HOME"
!mv spark-* aut-* "$APPS_HOME"

!rm -rf sample_data "$APPS_HOME"/"$SPARK_HADOOP_VERSION".tgz


## Spark init

In [None]:
import os
import findspark

SPARK_DRIVER_MEMORY = "8g"

# Location of the Spark distribution stored under APPS_HOME.
os.environ["SPARK_HOME"] = "{home}/{dist}".format(
    home=APPS_HOME, dist=SPARK_HADOOP_VERSION
)

# Arguments for the PySpark launcher: driver memory, the AUT fat jar and the
# AUT Python zip, followed by the mandatory "pyspark-shell" token.
_submit_args = [
    "--driver-memory {}".format(SPARK_DRIVER_MEMORY),
    "--jars {}/aut-{}-fatjar.jar".format(APPS_HOME, AUT_VERSION),
    "--py-files {}/aut-{}.zip".format(APPS_HOME, AUT_VERSION),
    "pyspark-shell",
]
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join(_submit_args)

findspark.init()

In [None]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, col, udf
from pyspark.sql.types import StringType

# Reuse the active SparkContext if there is one, otherwise create it.
sc = pyspark.SparkContext.getOrCreate()
# SQLContext built on that SparkContext; both are handed to WebArchive below.
sqlContext = SQLContext(sc)

# Bare expression on the last line of the cell: the notebook displays it.
sc

## Downloading datasets

In [None]:
%%capture 
!pip install -q gdown

# Only the "repo-ecritures-num" folder is downloaded (into ./LIFRANUM); the
# three other folders below are kept commented out.
# !gdown https://drive.google.com/drive/folders/1xqDsY5KOeK5OMhW39EH37l79Pn-v59B_?usp=sharing -O ./LIFRANUM/autre --folder
# !gdown https://drive.google.com/drive/folders/170j3r23YJBlOpGsKrcZRSs3bqrS03qhi?usp=sharing -O ./LIFRANUM/cartoweb --folder
# !gdown https://drive.google.com/drive/folders/1NLuWLOldfmpwPeAr9Th_HCeH6ZoSw0zr?usp=sharing -O ./LIFRANUM/lifranum-method --folder
!gdown https://drive.google.com/drive/folders/1wehg3nnCks9iVIvuXMZ5u685ocq__dQe?usp=sharing -O ./LIFRANUM/repo-ecritures-num --folder

## Creating dataframe

In [None]:
!pip install tldextract

In [None]:
from aut import *
import tldextract
WARCs_path = "LIFRANUM/repo-ecritures-num/*.warc*"

In [None]:
from urllib.parse import urlparse

@udf("string")
def extract_hostname(s):
    """Spark UDF: return the ``hostname`` attribute of ``urlparse(s)``."""
    parsed_url = urlparse(s)
    return parsed_url.hostname

@udf("string")
def extract_url_domain(s):
    """Spark UDF: return the ``domain`` field of ``tldextract.extract(s)``."""
    extracted = tldextract.extract(s)
    return extracted.domain

@udf("string")
def extract_url_subdomain(s):
    """Spark UDF: return the ``subdomain`` field of ``tldextract.extract(s)``."""
    extracted = tldextract.extract(s)
    return extracted.subdomain

@udf("string")
def extract_url_tld(s):
    """Spark UDF: return the ``suffix`` field of ``tldextract.extract(s)``."""
    extracted = tldextract.extract(s)
    return extracted.suffix

@udf("string")
def extract_url_registered_domain(s):
    """Spark UDF: return ``tldextract.extract(s).registered_domain``."""
    extracted = tldextract.extract(s)
    return extracted.registered_domain

@udf("string")
def extract_url_domain_reversed(s):
    """Spark UDF: return the hostname of URL ``s`` with its labels reversed.

    For example a hostname ``a.b.c`` becomes ``c.b.a``.

    Returns ``None`` when ``urlparse(s)`` yields no hostname (e.g. a URL
    without a network location); previously that case raised
    ``AttributeError`` on ``None.split``.
    """
    hostname = urlparse(s).hostname
    if hostname is None:
        return None
    return '.'.join(reversed(hostname.split('.')))

@udf("string")
def extract_port(s):
    """Spark UDF: return the explicit port of URL ``s`` as a string.

    Returns ``None`` when the URL carries no port, or when parsing the URL or
    its port raises ``ValueError`` (``urlparse(...).port`` does so for a
    malformed port), so that one bad URL does not fail the whole job.
    """
    try:
        port = urlparse(s).port
    except ValueError:
        return None
    # The UDF is declared "string", so hand back text rather than an int.
    return None if port is None else str(port)

@udf("string")
def extract_requete(s):
    """Spark UDF: return the ``scheme`` attribute of ``urlparse(s)``."""
    parsed_url = urlparse(s)
    return parsed_url.scheme

# Build df2 from the WARC files matched by WARCs_path and add one column per
# URL-derived UDF defined above.
df2 = (
    WebArchive(sc, sqlContext, WARCs_path)
    .webpages()
    .withColumn("hostname", extract_hostname("url"))
    .withColumn("url_domain", extract_url_domain("url"))
    .withColumn("url_subdomain", extract_url_subdomain("url"))
    .withColumn("url_tld", extract_url_tld("url"))
    .withColumn("url_domain_reversed", extract_url_domain_reversed("url"))
    .withColumn("url_registered_domain", extract_url_registered_domain("url"))
    .withColumn("port", extract_port("url"))
    .withColumn("requete", extract_requete("url"))
)

# Print the first row only (second argument: truncate long values).
df2.show(1, True)

## Data formatting

In [None]:
from aut import remove_html, remove_http_header
import re

def count_words(res, wordsLists):
    """Count lexicon words per domain and per word.

    Parameters
    ----------
    res : iterable
        Rows indexable as ``row[0]`` (domain) and ``row[1]`` (page text).
        A ``None`` domain or text is treated as an empty string.
    wordsLists : list of list of str
        One word list per lexical field. Fields are identified in the
        results by their position in this list, as a string ("0", "1", ...).

    Returns
    -------
    tuple
        ``(ws_feeling, lists_counts)`` where

        * ``ws_feeling[domain]`` is a dict with ``"tot_word"`` (number of
          whitespace-separated tokens of the cleaned text, summed over the
          domain's rows) and one key per word list holding the number of
          matches (0 when none);
        * ``lists_counts[index][word]`` is the total number of matches of
          ``word`` over all rows. Every list index is present, possibly
          mapped to an empty dict.

    Notes
    -----
    Text and lexicon words go through the same normalisation (lower-casing,
    punctuation replaced by spaces, a few accents folded) so that accented
    or hyphenated lexicon entries can match the cleaned text. Words are
    matched as whole words; empty lexicon entries are ignored.
    """
    # Single-pass equivalent of the chained replacements: punctuation becomes
    # a space, the listed accented letters lose their accent.
    fold_table = str.maketrans({
        "'": " ",
        ",": " ",
        "/": " ",
        ".": " ",
        ":": " ",
        ";": " ",
        "!": " ",
        "?": " ",
        "\"": " ",
        "-": " ",
        "é": "e",
        "è": "e",
        "ê": "e",
        "à": "a",
    })

    def normalize(text):
        return text.lower().translate(fold_table)

    # Compile one pattern per lexicon word, once, instead of once per row.
    compiled_lists = []
    for word_list in wordsLists:
        patterns = []
        for word in word_list:
            folded = normalize(word).strip()
            if not folded:
                # An empty entry would otherwise match at every word start.
                continue
            # Look-arounds give whole-word matching that also works at the
            # start of the text and after newlines/tabs; re.escape keeps
            # regex metacharacters in a word literal.
            pattern = re.compile(
                r"(?<!\w)" + re.escape(folded) + r"(?!\w)", re.IGNORECASE
            )
            patterns.append((word, pattern))
        compiled_lists.append(patterns)

    ws_feeling = {}
    # Pre-create every list entry so callers can index lists_counts["0"],
    # lists_counts["1"], ... even when nothing matched.
    lists_counts = {str(index): {} for index in range(len(wordsLists))}

    for row in res:
        domain = (row[0] or "").lower()
        cleaned_text = normalize(row[1] or "")

        domain_entry = ws_feeling.setdefault(domain, {"tot_word": 0})
        domain_entry["tot_word"] += len(cleaned_text.split())

        for index, patterns in enumerate(compiled_lists):
            index_str = str(index)
            word_counts = lists_counts[index_str]
            for word, pattern in patterns:
                hits = len(pattern.findall(cleaned_text))
                if hits:
                    word_counts[word] = word_counts.get(word, 0) + hits
                    domain_entry[index_str] = domain_entry.get(index_str, 0) + hits

    # Make sure every domain has a count (possibly 0) for every word list.
    for domain_entry in ws_feeling.values():
        for index in range(len(wordsLists)):
            domain_entry.setdefault(str(index), 0)

    return (ws_feeling, lists_counts)

In [None]:
# Fetching data from df

# Add a 'text' column by applying remove_http_header then remove_html to the
# 'content' column, keep only (url_domain, text) and bring at most 5000 rows
# back to the driver as a list.
res = (
    df2
    .withColumn('text', remove_html(remove_http_header('content')))
    .select(['url_domain', 'text'])
    .take(5000)
)

In [None]:
# Fetching love & hate lexical fields

!gdown https://drive.google.com/drive/folders/1M2-DSdt641PMtG1yLGVDERszuigh2h_t?usp=share_link -O ./champsLexical --folder

def _load_lexicon(path):
    """Read one lexical-field file and return its entries as a list.

    The file is split on newlines, the lines are sorted, and the last
    character of every line is dropped. Entries that end up empty (blank
    lines, or the empty string after a trailing newline) are discarded so
    they cannot be used as search words later on.
    """
    with open(path, encoding="utf-8") as lexicon_file:
        lines = sorted(lexicon_file.read().split('\n'))
    # NOTE(review): the unconditional [:-1] is kept from the original code;
    # presumably every line ends with one separator character. Confirm
    # against the files, in particular for a last line with no newline.
    entries = [line[:-1] for line in lines]
    return [entry for entry in entries if entry]


loveList = _load_lexicon("./champsLexical/AmourTmp.txt")
hateList = _load_lexicon("./champsLexical/HaineTmp.txt")

In [None]:
# count_words() inputs
wordsLists = [loveList, hateList] # loveList = 0, hateList = 1

In [None]:
dic_website_feeling, lists_counts = count_words(res, wordsLists)
# Index "0" is the love list and "1" the hate list (order of wordsLists).
# Fall back to an empty dict if count_words() returned no entry for a list,
# instead of failing with KeyError.
dic_nb_love_word = lists_counts.get('0', {})
dic_nb_hate_word = lists_counts.get('1', {})

In [None]:
print(dic_website_feeling,"\n",dic_nb_love_word)

# Datavisualisation

## Import

In [None]:
!pip install matplotlib

In [None]:
import matplotlib.pyplot as plt
import numpy as np

## Data manipulation

Exemples de dictionnaires (penser à y mettre les vraies informations)

In [None]:
#Ce dictionnaire contient pour chaque site le nombre d'occurence de mots du champ lexical amour et haine
#dic_website_feeling = {"www.google.com" : {"love" : 90, "hate" : 7, "tot_word" : 150},"www.facebook.com" : {"love" : 10, "hate" : 22, "tot_word" : 568},"www.facebak.com" : {"love" : 13, "hate" : 12, "tot_word" : 243}}

#Ces deux dictionnaires doivent contenir le top 5 des mots les plus représentés de leur champ lexical respectif. (/!\ Ne pas mettre tous les mots sinon le graphe ne sera pas lisible /!\)
#dic_nb_love_word = {"heart" : 19, "happy" : 2, "joy" : 31}
#dic_nb_hate_word = {"kill" : 13, "murder" : 8, "hate" : 54}

In [None]:
# Build the data for the third chart (bubble chart).
#
# top_occurence_website holds one entry per website;
# top_occurence_bubble holds only the websites with at least one love or
# hate word.

top_occurence_website = {
    'website': [],
    'occurence': [],
    'color': [],
}
top_occurence_bubble = {
    'website': [],
    'occurence': [],
    'color': [],
}

for website, counts in dic_website_feeling.items():
    love_count = counts["0"]
    hate_count = counts["1"]
    lexicon_total = love_count + hate_count
    total_words = counts["tot_word"]

    # Share of the website's words that belong to either lexical field.
    # A website whose pages contain no word at all gets 0 instead of raising
    # ZeroDivisionError.
    occurence = lexicon_total / total_words if total_words else 0.0

    # Red channel grows with the share of love words; the +1 in the
    # denominator keeps the division defined when both counts are 0.
    red = int(255 * love_count / (1 + lexicon_total))
    color = '#%02x%02x%02x' % (red, 130, 130)

    top_occurence_website['website'].append(website)
    top_occurence_website['occurence'].append(occurence)
    top_occurence_website['color'].append(color)

    if lexicon_total == 0:
        # Nothing to draw for this website in the bubble chart.
        continue

    top_occurence_bubble['website'].append(website)
    top_occurence_bubble['occurence'].append(occurence)
    top_occurence_bubble['color'].append(color)

In [None]:
def normalizer_hate_love(dic):
    """Map every key of ``dic`` to a ``(score, total)`` tuple.

    ``dic[key]["0"]`` and ``dic[key]["1"]`` are the two counts. ``total`` is
    their sum and ``score`` is the share of ``"0"`` in that sum. When the sum
    is 0 the result is ``(0.5, 0)``.
    """
    def _score(counts):
        first, second = counts["0"], counts["1"]
        total = first + second
        if total == 0:
            return (0.5, 0)
        return (first / total, total)

    return {key: _score(counts) for key, counts in dic.items()}

In [None]:
#Création du dictionnaire pour le premier graphe

dic_norm = normalizer_hate_love(dic_website_feeling)

In [None]:
#Création des listes pour réaliser le deuxième graphe

# Scatter-plot coordinates: dic_norm values are (score, total) tuples.
y_nhl = [score for score, _total in dic_norm.values()]
x_nhl = [total for _score, total in dic_norm.values()]

# Five most frequent words of each lexical field, with their counts.
sorted_love = sorted(
    dic_nb_love_word, key=lambda word: dic_nb_love_word[word], reverse=True
)[:5]
sorted_love_values = [dic_nb_love_word[word] for word in sorted_love]

sorted_hate = sorted(
    dic_nb_hate_word, key=lambda word: dic_nb_hate_word[word], reverse=True
)[:5]
sorted_hate_values = [dic_nb_hate_word[word] for word in sorted_hate]


In [None]:
#Création d'une class pour réaliser notre bubble chart

class BubbleChart:
    """Packed-bubble layout.

    Bubbles start on a square grid and are moved step by step towards their
    common centre of mass by :meth:`collapse`, without ever overlapping.

    ``self.bubbles`` is an array with one row per bubble and four columns:
    x, y, radius, area.
    """

    def __init__(self, area, bubble_spacing=0):
        """
        Setup for bubble collapse.

        Parameters
        ----------
        area : array-like
            Area of the bubbles. May be empty, in which case the chart
            simply contains no bubble.
        bubble_spacing : float, default: 0
            Minimal spacing between bubbles after collapsing.

        Notes
        -----
        If "area" is sorted, the results might look weird.
        """
        area = np.asarray(area)
        r = np.sqrt(area / np.pi)

        self.bubble_spacing = bubble_spacing
        self.bubbles = np.ones((len(area), 4))
        self.bubbles[:, 2] = r
        self.bubbles[:, 3] = area

        if len(self.bubbles) == 0:
            # No bubble: max() and the weighted average below are undefined,
            # so fall back to neutral values.
            self.maxstep = self.bubble_spacing
            self.step_dist = self.maxstep / 2
            self.com = np.zeros(2)
            return

        # Grid pitch: large enough for two bubbles of maximal radius.
        self.maxstep = 2 * self.bubbles[:, 2].max() + self.bubble_spacing
        self.step_dist = self.maxstep / 2

        # calculate initial grid layout for bubbles
        length = np.ceil(np.sqrt(len(self.bubbles)))
        grid = np.arange(length) * self.maxstep
        gx, gy = np.meshgrid(grid, grid)
        self.bubbles[:, 0] = gx.flatten()[:len(self.bubbles)]
        self.bubbles[:, 1] = gy.flatten()[:len(self.bubbles)]

        self.com = self.center_of_mass()

    def center_of_mass(self):
        """Return the mean bubble position, weighted by bubble area."""
        return np.average(
            self.bubbles[:, :2], axis=0, weights=self.bubbles[:, 3]
        )

    def center_distance(self, bubble, bubbles):
        """Return the distances from ``bubble``'s centre to each centre in ``bubbles``."""
        return np.hypot(bubble[0] - bubbles[:, 0],
                        bubble[1] - bubbles[:, 1])

    def outline_distance(self, bubble, bubbles):
        """Return the gaps between outlines, minus ``bubble_spacing``.

        A negative value means the two bubbles are closer than allowed.
        """
        center_distance = self.center_distance(bubble, bubbles)
        return center_distance - bubble[2] - \
            bubbles[:, 2] - self.bubble_spacing

    def check_collisions(self, bubble, bubbles):
        """Return how many of ``bubbles`` collide with ``bubble``."""
        distance = self.outline_distance(bubble, bubbles)
        return len(distance[distance < 0])

    def collides_with(self, bubble, bubbles):
        """Return the index of the bubble closest to ``bubble``, as an iterable."""
        distance = self.outline_distance(bubble, bubbles)
        idx_min = np.argmin(distance)
        return idx_min if isinstance(idx_min, np.ndarray) else [idx_min]

    def collapse(self, n_iterations=50):
        """
        Move bubbles to the center of mass.

        Parameters
        ----------
        n_iterations : int, default: 50
            Number of moves to perform.
        """
        if len(self.bubbles) == 0:
            return

        for _i in range(n_iterations):
            moves = 0
            for i in range(len(self.bubbles)):
                rest_bub = np.delete(self.bubbles, i, 0)
                # try to move directly towards the center of mass
                # direction vector from bubble to the center of mass
                dir_vec = self.com - self.bubbles[i, :2]
                norm = np.sqrt(dir_vec.dot(dir_vec))
                if norm == 0:
                    # Already on the centre of mass (always true for a single
                    # bubble): normalising would produce NaN coordinates.
                    continue

                # shorten direction vector to have length of 1
                dir_vec = dir_vec / norm

                # calculate new bubble position
                new_point = self.bubbles[i, :2] + dir_vec * self.step_dist
                new_bubble = np.append(new_point, self.bubbles[i, 2:4])

                # check whether new bubble collides with other bubbles
                if not self.check_collisions(new_bubble, rest_bub):
                    self.bubbles[i, :] = new_bubble
                    self.com = self.center_of_mass()
                    moves += 1
                else:
                    # try to move around a bubble that you collide with
                    # find colliding bubble
                    for colliding in self.collides_with(new_bubble, rest_bub):
                        # calculate direction vector
                        dir_vec = rest_bub[colliding, :2] - self.bubbles[i, :2]
                        norm = np.sqrt(dir_vec.dot(dir_vec))
                        if norm == 0:
                            # Both centres coincide: no direction to go around.
                            continue
                        dir_vec = dir_vec / norm
                        # calculate orthogonal vector
                        orth = np.array([dir_vec[1], -dir_vec[0]])
                        # test which direction to go
                        new_point1 = (self.bubbles[i, :2] + orth *
                                      self.step_dist)
                        new_point2 = (self.bubbles[i, :2] - orth *
                                      self.step_dist)
                        dist1 = self.center_distance(
                            self.com, np.array([new_point1]))
                        dist2 = self.center_distance(
                            self.com, np.array([new_point2]))
                        new_point = new_point1 if dist1 < dist2 else new_point2
                        new_bubble = np.append(new_point, self.bubbles[i, 2:4])
                        if not self.check_collisions(new_bubble, rest_bub):
                            self.bubbles[i, :] = new_bubble
                            self.com = self.center_of_mass()

            # Fewer than 10% of the bubbles moved directly: use smaller steps.
            if moves / len(self.bubbles) < 0.1:
                self.step_dist = self.step_dist / 2

    def plot(self, ax, labels, colors):
        """
        Draw the bubble plot.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
        labels : list
            Labels of the bubbles.
        colors : list
            Colors of the bubbles.
        """
        for i in range(len(self.bubbles)):
            circ = plt.Circle(
                self.bubbles[i, :2], self.bubbles[i, 2], color=colors[i])
            ax.add_patch(circ)
            ax.text(*self.bubbles[i, :2], labels[i],
                    horizontalalignment='center', verticalalignment='center')

## Visualisation

In [None]:
# Scale the x axis to the data (with a 5% margin) instead of a hard-coded
# upper bound; fall back to 1 when there is no point or every total is 0.
x_upper = max(x_nhl, default=0) * 1.05 or 1

plt.plot(x_nhl, y_nhl, 'ro')
plt.axis([0, x_upper, 0, 1])
plt.title("Représentation de la tendance des sites à parler d'amour ou de haine")
# x_nhl holds the absolute number of love + hate words per website, not a
# ratio, so the label says so.
plt.xlabel("Nombre de mots des champs lexicaux amour et haine")
plt.ylabel("Score")
plt.show()

*   score = 0.5: amour et haine en quantité égale
*   score < 0.5 : prédominance de haine
*   score > 0.5: prédominance d'amour



In [None]:
plt.xticks(rotation='vertical')

# One bar series per lexical field: love first (default colour), hate in red.
bar_series = (
    (sorted_love, sorted_love_values, {}),
    (sorted_hate, sorted_hate_values, {"color": "red"}),
)
for words, values, extra_style in bar_series:
    plt.bar(words, values, width=0.8, bottom=None, align='center', **extra_style)

plt.ylabel("Nombre d'occurence")
plt.title("Les mots les plus représentés des champs lexicaux")
plt.show()

In [None]:
# Lay out one bubble per website that uses love/hate words, sized by the
# share of such words in the website's text.
bubble_chart = BubbleChart(area=top_occurence_bubble['occurence'],
                           bubble_spacing=0.1)

bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"))
bubble_chart.plot(
    ax, top_occurence_bubble['website'], top_occurence_bubble['color'])
ax.axis("off")
# Patches added with add_patch do not update the data limits by themselves.
ax.relim()
ax.autoscale_view()
ax.set_title('Top occurence website')

# Save before showing, and through the figure object: once plt.show() has
# returned, pyplot's current figure is no longer this one and plt.savefig()
# would write an empty image.
fig.savefig("test.svg")
plt.show()

Ce graphe représente les sites les plus évocateurs des champs lexicaux amour/haine (plus grosses bulles) ainsi que leur penchant pour l'un des deux champs lexicaux (de bleu = haine à rose = amour, en passant par gris = pas de prédominance).


## Conclusion

In conclusion, we can see that websites about French literature are more likely to use vocabulary related to love than to hate, and that websites using a hateful lexical field remain less numerous than those talking about love.