# Sentiment analysis 

## 1. Textblob-FR

Documentation: https://textblob.readthedocs.io/en/dev/

### Imports

In [None]:
import sys
import os
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

### Création d'une fonction `get_sentiment`

In [None]:
# Shared blobber configured with the tagger and sentiment analyzer imported from textblob_fr
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())

def get_sentiment(input_text):
    """Print a one-line polarity/subjectivity summary of ``input_text``.

    The text is analysed with the module-level ``tb`` blobber. Its
    ``sentiment`` is unpacked as ``(polarity, subjectivity)`` and both scores
    are reported as percentages rounded to the nearest integer.

    Args:
        input_text: Text to analyse.

    Returns:
        None. The summary is written to standard output.
    """
    polarity, subjectivity = tb(input_text).sentiment

    # Scores are scaled by 100 and rounded; the sign of the polarity is
    # conveyed by the "positive"/"negative" wording, hence abs().
    polarity_perc = f"{100*abs(polarity):.0f}"
    subjectivity_perc = f"{100*subjectivity:.0f}"

    if polarity > 0:
        polarity_str = f"{polarity_perc}% positive"
    elif polarity < 0:
        polarity_str = f"{polarity_perc}% negative"
    else:
        polarity_str = "neutral"

    if subjectivity > 0:
        # Use the formatted percentage, not the raw score, so the figure
        # matches the "%" sign that follows it.
        subjectivity_str = f"{subjectivity_perc}% subjective"
    else:
        subjectivity_str = "perfectly objective"

    print(f"This text is {polarity_str} and {subjectivity_str}.")

### Analyser le sentiment d'une phrase

In [None]:
# Pick the year to analyse
year = 1900

In [None]:
# List the files for the chosen year.
# os.listdir() returns entries in arbitrary order; sort them so that the
# "first 10 documents" taken further down are the same on every run/machine.
# NOTE(review): `str(year) in f` matches the year anywhere in the file name —
# confirm this fits the corpus naming scheme.
data_path = '../data'
txt_path = '../data/txt'
txts = sorted(
    f
    for f in os.listdir(txt_path)
    if os.path.isfile(os.path.join(txt_path, f)) and str(year) in f
)
len(txts)

In [None]:
# Load the text of every selected file into a list (one string per file)
content_list = []
for txt in txts:
    file_path = os.path.join(txt_path, txt)
    with open(file_path, encoding='utf-8') as handle:
        content_list.append(handle.read())

In [None]:
# Count the number of elements (= files) in the list
len(content_list)

In [None]:
def _split_phrases(segment, min_length):
    """Cut ``segment`` into phrases ending on '.', '!' or '?'.

    A terminator only closes the current phrase when at least ``min_length``
    characters (terminator included) have accumulated since the previous cut;
    otherwise the phrase keeps growing until a later terminator. Any text left
    after the last cut is dropped.
    """
    phrases = []
    start = 0
    for index, char in enumerate(segment):
        if char in ('.', '!', '?') and index + 1 - start >= min_length:
            phrases.append(segment[start:index + 1])
            start = index + 1
    return phrases


def _select_phrases(documents, min_length, quotas=(3, 3, 4), target=10):
    """Sample phrases from the start, middle and end of each document.

    Each document is cut into three parts of ``len(document) // 3`` characters
    (the last part also takes the remainder). The first ``quotas[i]`` phrases
    of part ``i`` are kept. Iteration stops after the first document that
    brings the running total to ``target`` phrases or more.

    Returns:
        A ``(phrases, documents_read)`` tuple.
    """
    selected = []
    documents_read = 0
    for document in documents:
        part_length = len(document) // 3
        parts = (
            document[:part_length],
            document[part_length:2 * part_length],
            document[2 * part_length:],
        )
        for part, quota in zip(parts, quotas):
            selected.extend(_split_phrases(part, min_length)[:quota])
        documents_read += 1
        if len(selected) >= target:
            break
    return selected, documents_read


# Minimum length required for a phrase
min_phrase_length = 30

# Go through the first 10 files of the list, keeping 3 phrases from the
# beginning, 3 from the middle and 4 from the end of each one
selected_phrases, document_count = _select_phrases(content_list[:10], min_phrase_length)

# Number of phrases collected
phrase_count = len(selected_phrases)


In [None]:
 # Display the selected phrases
 selected_phrases

In [None]:
# Join the first 10 selected phrases into a single newline-separated text
text = "\n".join(selected_phrases[:10])

In [None]:
# Score a single hard-coded sentence
get_sentiment("Les journaux publient la dépêche suivante du camp de Frère, 31 : Les Boers ont établi un nouveau camp formé do 63 wagons.")

In [None]:
# Score the text assembled from the selected phrases
get_sentiment(text)

## 2. Utilisation de transformers

Documentation: https://github.com/TheophileBlard/french-sentiment-analysis-with-bert

**!!** Si le code ne tourne pas sur votre machine, vous pouvez le tester directement sur Google Colab en utilisant [ce lien](https://colab.research.google.com/github/TheophileBlard/french-sentiment-analysis-with-bert/blob/master/colab/french_sentiment_analysis_with_bert.ipynb) **!!**

Le modèle peut également être testé en ligne sur [HuggingFace](https://huggingface.co/tblard/tf-allocine)

### Installation des bibliothèques et imports

In [None]:
!pip install tensorflow
!pip install sentencepiece
!pip install transformers
!pip install spacy-transformer

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting tensorflow-intel==2.15.0 (from tensorflow)
  Downloading tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow-intel==2.15.0->tensorflow)
  Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow-intel==2.15.0->tensorflow)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras<2.16,>=2.15.0 (from tensorflow-intel==2.15.0->tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow-intel==2.15.0->tensorflow)
  Downloading protobuf-4.23.4-cp310-abi3-win_amd64.whl.metadata (540 bytes)
Downloading tensorflow-2.15.0-cp311-cp311-win_amd64.whl (2.1 kB)
Downloading tensorflow_intel-2.15.

ERROR: Exception:
Traceback (most recent call last):
  File "D:\Cours ULB\Stic-2\STIC-B545 - Traitement automatique de corpus\Tp_traite_corpus2\tac\tac_venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "D:\Cours ULB\Stic-2\STIC-B545 - Traitement automatique de corpus\Tp_traite_corpus2\tac\tac_venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "D:\Cours ULB\Stic-2\STIC-B545 - Traitement automatique de corpus\Tp_traite_corpus2\tac\tac_venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "D:\Cours ULB\Stic-2\STIC-B545 - Traitement automatique de corpus\Tp_traite_corpus2\tac\tac_venv\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 102, in read
    self.__buf.write(data)
  File

### Chargement du modèle

In [None]:
# Model identifier shared by the tokenizer and the classifier
MODEL_NAME = "tblard/tf-allocine"

# NOTE(review): `use_pt=True` is forwarded unchanged — confirm the tokenizer actually uses it
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_pt=True)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Bundle the model and tokenizer into a ready-to-call sentiment pipeline
sentiment_analyser = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

### Analyser le sentiment d'une phrase

In [None]:
# Try the pipeline on a positively worded sentence
sentiment_analyser("Ce journal est vraiment super intéressant.")

In [None]:
# Try the pipeline on a negatively worded sentence
sentiment_analyser("Cette phrase est négative et je ne suis pas content !")