### Tanagra (new version)
## By Motasem ALRAHABI, ObTIC, Sorbonne Université, June 2023
## Link: https://github.com/obtic-sorbonne/Tanagra2

This script does the following:
1.   Each text is segmented, sentence by sentence (with spacy).
2.   Each sentence is read and searched for all LOC named entities (with spacy).
3.   For each entity it finds the geographical coordinates (with geopy / geonames).
4.   If at least one named entity is found in the sentence, a function analyzes the positive, negative and neutral sentiments at the sentence level (using a Hugging Face transformers model).
5.   It displays the named entities found on a map (with folium).
6.   The color of each icon is calculated according to the average sentiment of this entity in the whole corpus: green for positive, red for negative and gray for neutral.
7.   The size of each icon on the map is proportional to the number of occurrences of this entity in the whole corpus.
8.   For each icon, a popup shows the place name, the number of occurrences, and the number of positive, negative and neutral sentiments in the whole corpus.
9.   It saves each entity in an output CSV file: file, sentence, entity, label, score, lat, long.

In [None]:
! pip install spacy geopy folium textblob transformers
! pip install geonamescache

import os
import csv
import spacy
from spacy import displacy
from textblob import TextBlob
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import folium
from folium.plugins import MarkerCluster
import geonamescache
import requests

In [None]:
# Download and load spaCy's French model
! python -m spacy download fr_core_news_sm
# `nlp` is the shared spaCy pipeline used below for sentence segmentation
# (segment_sentences) and for LOC entity recognition (analyze_sentence).
nlp = spacy.load('fr_core_news_sm')

# Load the geonamescache database; `gc` is queried in analyze_sentence
# to look up city coordinates by name.
gc = geonamescache.GeonamesCache()

In [None]:
# Sentence-level sentiment classifier loaded through the transformers pipeline API:
# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
# The aggregation code further down matches on its labels '1 star' ... '5 stars'.
from transformers import pipeline
# TensorFlow variant, kept for reference:
#sentiment_analysis_model = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", framework="tf")
sentiment_analysis_model = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", framework="pt")


# Also try https://huggingface.co/philschmid/pt-tblard-tf-allocine
# Also try https://huggingface.co/moussaKam/barthez
# Also try https://pypi.org/project/aspect-based-sentiment-analysis/

In [None]:
# Segment each text into sentences (in the whole corpus)
def segment_sentences(directory_path):
    """Split every ``.txt`` file of a directory into sentences.

    Each matching file is read as UTF-8 and passed through the module-level
    spaCy pipeline ``nlp``; the text of every detected sentence is collected.
    The full list is printed before being returned.

    Args:
        directory_path: Directory whose ``.txt`` files are processed
            (non-recursive; other files are ignored).

    Returns:
        A flat list of sentence strings, in directory-listing order.
    """
    collected = []
    for entry in os.listdir(directory_path):
        # Only plain-text files belong to the corpus.
        if not entry.endswith(".txt"):
            continue
        full_path = os.path.join(directory_path, entry)
        with open(full_path, 'r', encoding='utf-8') as handle:
            parsed = nlp(handle.read())
            collected += [span.text for span in parsed.sents]
    print(collected)
    return collected

In [None]:
# Analyzing each sentence:

# Strings that analyze_sentence refuses to treat as a LOC entity: French elided
# articles/pronouns written with a typographic apostrophe.
# NOTE(review): presumably these are recurring false positives of the NER model — confirm.
blacklist = ["l’", "qu’", "m’", "t’", "s’", "L’"]

# Alternative: read the blacklist from a file, one entry per line.
#with open("blacklist.txt", "r", encoding='utf-8') as file:
#  blacklist = [word.strip() for word in file.readlines()]

def analyze_sentence(sentence):
    """Geocode the LOC entities of one sentence and attach the sentence sentiment.

    Every entity labelled ``LOC`` by the spaCy pipeline ``nlp`` (and not listed
    in ``blacklist``) is looked up first in geonamescache (``gc``); when no city
    matches, the Nominatim geocoder is used as a fallback. The sentiment model
    is run at most once per sentence, and only when at least one entity could
    be located.

    Args:
        sentence: Text of a single sentence.

    Returns:
        A list with one dict per retained entity, holding the keys ``entity``,
        ``sentiment_label_sentence``, ``sentiment_score_sentence``,
        ``latitude`` and ``longitude``. An entity that could not be geocoded
        has ``None`` coordinates, an empty label and a score of 0.
    """
    geolocator = Nominatim(user_agent='my-app', timeout=10)
    doc = nlp(sentence)
    analyzed_sentence = []
    # Sentence-level prediction, computed lazily and shared by all entities.
    sentence_sentiment = None

    for ent in doc.ents:
        if ent.label_ != "LOC" or ent.text in blacklist:
            continue

        # Reset for each entity: a failed lookup must not inherit the
        # coordinates found for a previous entity of the same sentence.
        latitude = None
        longitude = None

        # Nominatim requests can time out, so the local geonamescache
        # database is tried first and the online geocoder is only a fallback.
        matching_cities = gc.get_cities_by_name(ent.text)
        if matching_cities:
            # Each match is a single-item dict {geonameid: city record}.
            first_matching_city = next(iter(matching_cities[0].values()))
            latitude = first_matching_city['latitude']
            longitude = first_matching_city['longitude']
        else:
            try:
                location = geolocator.geocode(ent.text)
            except GeocoderTimedOut:
                location = None
            if location:
                latitude = location.latitude
                longitude = location.longitude

        sentiment_label_sentence = ""
        sentiment_score_sentence = 0
        # Compare against None so that a legitimate 0.0 coordinate is kept.
        if latitude is not None and longitude is not None:
            if sentence_sentiment is None:
                sentence_sentiment = sentiment_analysis_model(sentence)[0]
            sentiment_label_sentence = sentence_sentiment['label']
            sentiment_score_sentence = sentence_sentiment["score"]

        analyzed_sentence.append({
            "entity": ent.text,
            "sentiment_label_sentence": sentiment_label_sentence,
            "sentiment_score_sentence": sentiment_score_sentence,
            "latitude": latitude,
            "longitude": longitude
        })

    return analyzed_sentence

In [None]:
# Texts

# Caution: the sentiment is that of the sentence as a whole, not of the clauses surrounding each place!
# For now the sentiment is aggregated over all the input data:
# - a single text --> sentiment at the level of that text.
# - a folder --> sentiment at the level of the folder.

# This could be refined later to handle the sentiment of each file of a folder separately.

# Mount Google Drive (Colab only) and segment every .txt file of the input folder.
from google.colab import drive
drive.mount('/content/drive')
all_sentences = segment_sentences("/content/drive/MyDrive/Colab_Notebooks/input/")

# Alternative inputs (uploaded file, inline sample, local folder), kept for reference:
#uploaded = files.upload()
#filename = next(iter(uploaded.keys()))
#all_sentences = uploaded[filename].decode('utf-8')
#all_sentences = ["J'adore Marseille car Marseille est une belle ville. Je déteste Lyon. J'aime Lille"]
#all_sentences = segment_sentences("./input/")

In [None]:
# Processing:
# Analyze the sentences one by one, write every located entity as a
# tab-separated row of output.csv and keep the same data in memory
# (analyzed_sentences) for the aggregation step.
analyzed_sentences = []
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter="\t")
    writer.writerow(['file', 'sentence', 'entity', 'label', 'score', 'lat', 'long'])
    counter = 0  # number of rows written so far
    for sentence in all_sentences:
        # Stop once 1000 rows were written (limit quoted for geonames).
        if counter >= 1000:
            break
        for result in analyze_sentence(sentence):
            latitude, longitude = result["latitude"], result["longitude"]

            # An entity without coordinates cannot be mapped: report and skip it.
            if latitude is None or longitude is None:
                print(f"Skipping sentence due to missing location data: {sentence}")
                continue

            entity = result["entity"]
            sentiment_label = result["sentiment_label_sentence"]
            sentiment_score = result["sentiment_score_sentence"]
            rounded_score = round(sentiment_score, 2)

            # The 'file' column currently only holds a "-->" placeholder.
            row = ["-->", sentence, entity, sentiment_label, rounded_score, latitude, longitude]
            writer.writerow(row)
            counter += 1
            print(f"Sentiment de la phrase {counter}: {sentence[:20]}... ; entité:[{entity}] ; polarity: {sentiment_label} ; score: {round(sentiment_score, 2)}")
            analyzed_sentences.append({
                'sentence': sentence,
                'entity': entity,
                'sentiment_label': sentiment_label,
                'sentiment_score': rounded_score,
                'latitude': latitude,
                'longitude': longitude
            })

# Aggregate the per-sentence results into one record per entity:
# coordinates (taken from the first occurrence), number of occurrences,
# and how many sentences were positive / negative / neutral.
entities_dict = {}
for sentence_info in analyzed_sentences:
    entity = sentence_info['entity']
    sentiment_label = sentence_info['sentiment_label']

    # Create the record on first sight of the entity.
    if entity not in entities_dict:
        entities_dict[entity] = {
            'latitude': sentence_info['latitude'],
            'longitude': sentence_info['longitude'],
            'occurrences': 0,
            'positive_labels': 0,
            'negative_labels': 0,
            'neutral_labels': 0,
            # Placeholder, replaced by the classification pass below.
            'overall_sentiment': sentiment_label
        }
    entity_info = entities_dict[entity]
    entity_info['occurrences'] += 1

    # Map the model's star ratings onto three polarity buckets.
    if sentiment_label in ('4 stars', '5 stars'):
        entity_info['positive_labels'] += 1
    elif sentiment_label in ('1 star', '2 stars'):
        entity_info['negative_labels'] += 1
    elif sentiment_label == '3 stars':
        entity_info['neutral_labels'] += 1

# Classify each entity once, after all counts are final (doing this inside
# the loop above recomputed every entity for every sentence).
for entity_info in entities_dict.values():
    positive_count = entity_info['positive_labels']
    negative_count = entity_info['negative_labels']
    neutral_count = entity_info['neutral_labels']

    # A polarity wins only with a strict majority over both others;
    # any tie is reported as 'Mixed'.
    if positive_count > negative_count and positive_count > neutral_count:
        entity_info['overall_sentiment'] = 'Positive'
    elif negative_count > positive_count and negative_count > neutral_count:
        entity_info['overall_sentiment'] = 'Negative'
    elif neutral_count > positive_count and neutral_count > negative_count:
        entity_info['overall_sentiment'] = 'Neutral'
    else:
        entity_info['overall_sentiment'] = 'Mixed'

print(entities_dict)


In [None]:
# Visualisation
# Draw one circle per entity on a world map: the colour encodes the overall
# sentiment, the radius the number of occurrences, and the popup lists the counts.

map_center = [0, 0]  #[48.8566, 2.3522]
map_zoom = 2.5  # Adjust the zoom level as needed
map = folium.Map(location=map_center, zoom_start=map_zoom)

# Colour per overall sentiment; any other value falls back to 'blue'.
sentiment_colors = {
    'Positive': 'green',
    'Negative': 'red',
    'Neutral': 'gray',
    'Mixed': 'blueviolet',
}
scaling_factor = 1  # Adjust this value to control the marker size scaling

for entity, info in entities_dict.items():
    latitude, longitude = info['latitude'], info['longitude']
    occurrences = info['occurrences']
    overall_sentiment = info['overall_sentiment']
    positive_labels = info['positive_labels']
    negative_labels = info['negative_labels']
    neutral_labels = info['neutral_labels']

    color = sentiment_colors.get(overall_sentiment, 'blue')
    size = int(occurrences) * scaling_factor

    # HTML shown when the marker is clicked.
    popup = "".join([
        f"<b>Entity:</b> {entity}<br>",
        f"<b>Occurrences:</b> {occurrences}<br>",
        f"<b>Sentiment:</b> {overall_sentiment}<br>",
        f"<b>Positive:</b> {positive_labels}<br>",
        f"<b>Negative:</b> {negative_labels}<br>",
        f"<b>Neutral:</b> {neutral_labels}<br>",
    ])

    marker = folium.CircleMarker(
        location=[latitude, longitude],
        radius=size,
        popup=popup,
        color=color,
        fill=True,
        fill_color=color,
        tooltip=entity
    )
    marker.add_to(map)

#map.save("map.html")
map