In [35]:
!mkdir /content/drive/MyDrive/AI/QAG/enrichment/qag_json/
!unzip /content/drive/MyDrive/AI/QAG/enrichment/qag_json.zip -d /content/drive/MyDrive/AI/QAG/enrichment/qag_json

mkdir: cannot create directory ‘/content/drive/MyDrive/AI/QAG/enrichment/qag_json/’: File exists
Archive:  /content/drive/MyDrive/AI/QAG/enrichment/qag_json.zip
replace /content/drive/MyDrive/AI/QAG/enrichment/qag_json/qag_json/QANR5L16QG1.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [36]:
import pandas as pd
from tqdm.auto import tqdm

In [37]:
# Download the small French spaCy pipeline; it is loaded further down with spacy.load("fr_core_news_sm").
!python -m spacy download fr_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [38]:
import glob
import json
import spacy

# Load the French pipeline once at module level; the processing functions defined
# in later cells all reuse this shared `nlp` object to tokenize and segment text.
nlp = spacy.load("fr_core_news_sm")


In [45]:
# Word forms counted as masculine only.
_MALE_PRONOUNS = frozenset({"il", "le", "ce"})
# Word forms counted as feminine only.
_FEMALE_PRONOUNS = frozenset({"elle", "la", "cette"})
# Forms that were listed for BOTH genders. The word alone cannot be attributed to
# one gender, so these are reported in the positions list but not counted for either.
_AMBIGUOUS_PRONOUNS = frozenset({"lui", "son", "sa", "ses"})


def count_pronouns(doc):
    """Count gendered word forms in a tokenized document.

    Matching is purely lexical and case-insensitive: each token's text is
    lower-cased and looked up in fixed word lists.

    Previously the forms "lui", "son", "sa" and "ses" appeared in both the male
    and the female list; because the male list was tested first, they were always
    counted as male and the female entries were unreachable. They are now treated
    as gender-ambiguous: still recorded in the positions list, but excluded from
    both counts so the male/female ratio is not skewed.

    NOTE(review): "le", "la", "ce" and "cette" are matched on surface form only,
    so they are counted whether or not they are used as pronouns — confirm this
    is acceptable for the analysis.

    Args:
        doc: Iterable of tokens exposing ``.text`` (str) and ``.idx``
            (the value stored as "position"), e.g. a spaCy ``Doc``.

    Returns:
        A tuple ``(male_count, female_count, pronoun_positions)`` where
        ``pronoun_positions`` is a list of ``{"pronoun": <token text>,
        "position": <token.idx>}`` dicts, in document order, for every matched
        token (male, female, or ambiguous).
    """
    male_count = 0
    female_count = 0
    pronoun_positions = []

    for token in doc:
        word = token.text.lower()
        if word in _MALE_PRONOUNS:
            male_count += 1
        elif word in _FEMALE_PRONOUNS:
            female_count += 1
        elif word not in _AMBIGUOUS_PRONOUNS:
            # Not one of the tracked forms at all: nothing to record.
            continue
        pronoun_positions.append({"pronoun": token.text, "position": token.idx})

    return male_count, female_count, pronoun_positions


In [46]:
def process_json_file(input_file_path, output_file_path):
    """Add pronoun statistics to one JSON document and write the result.

    Reads the JSON file at ``input_file_path``, runs the shared ``nlp`` pipeline
    on the text stored under question -> textesReponse -> texteReponse -> texte,
    and stores five new keys next to that text: the male and female counts,
    their percentages of the combined count (0 when nothing was counted), and
    the list of pronoun positions. The enriched document is written to
    ``output_file_path`` as UTF-8 JSON with a two-space indent.

    Args:
        input_file_path: Path of the JSON file to read.
        output_file_path: Path of the JSON file to write.
    """
    with open(input_file_path, "r", encoding="utf-8") as source:
        data = json.load(source)

    # All reads and writes target the same nested answer object.
    answer = data["question"]["textesReponse"]["texteReponse"]

    n_male, n_female, positions = count_pronouns(nlp(answer["texte"]))
    n_total = n_male + n_female

    if n_total > 0:
        male_share = (n_male / n_total) * 100
        female_share = (n_female / n_total) * 100
    else:
        # No tracked word found: avoid dividing by zero.
        male_share = 0
        female_share = 0

    answer.update({
        "male_pronouns_count": n_male,
        "female_pronouns_count": n_female,
        "male_pronouns_percentage": male_share,
        "female_pronouns_percentage": female_share,
        "pronoun_positions": positions,
    })

    with open(output_file_path, "w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)


In [47]:
def process_files(input_file_paths, output_folder):
    """Run ``process_json_file`` on every input path, writing into ``output_folder``.

    Each output file keeps the base name of its input file. The output folder is
    created if it does not exist yet, so the function also works on a fresh
    environment (consistent with the later enrichment cells, which call
    ``os.makedirs`` before writing).

    Args:
        input_file_paths: Iterable of paths to the JSON files to process.
        output_folder: Directory that receives the processed files.
    """
    # Local import: `os` is only imported at notebook level in a later cell.
    import os

    os.makedirs(output_folder, exist_ok=True)

    for input_file_path in tqdm(input_file_paths):
        # basename/join instead of splitting on "/" so separators are handled by the OS layer.
        output_file_path = os.path.join(output_folder, os.path.basename(input_file_path))
        process_json_file(input_file_path, output_file_path)


In [48]:
# Empty the `processed` output folder (the folder itself is kept) before the next cell writes into it.
!rm -rf /content/drive/MyDrive/AI/QAG/enrichment/processed/*

In [49]:
# Enrichment step 1: add pronoun statistics to every extracted JSON file.
# The archive unpacks into a nested qag_json/qag_json directory, hence the doubled folder name.
input_json_files = glob.glob("/content/drive/MyDrive/AI/QAG/enrichment/qag_json/qag_json/*.json")
output_folder = "/content/drive/MyDrive/AI/QAG/enrichment/processed"
process_files(input_json_files, output_folder)


  0%|          | 0/639 [00:00<?, ?it/s]

# Enrichment 2: sentence-level sentiment analysis

In [50]:
# Install Hugging Face `transformers` and PyTorch, required by the sentiment pipeline built in the next cell.
# The recorded run below installed transformers 4.28.1.
!pip install transformers torch


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [51]:
from transformers import pipeline

# Build the sentiment classifier once at module level so every file reuses the same loaded model.
# NOTE(review): the model name suggests it targets Twitter text, while the documents here are
# processed with a French spaCy pipeline — confirm the model is suitable for French input.
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")


def perform_sentiment_analysis(input_file_path, output_file_path):
    """Add per-sentence sentiment scores to one JSON document and write the result.

    Reads the JSON file at ``input_file_path``, splits the text stored under
    question -> textesReponse -> texteReponse -> texte into sentences with the
    shared ``nlp`` pipeline, classifies each sentence with ``sentiment_pipeline``
    and stores the results under the ``sentiment_data`` key next to the text.
    The enriched document is written to ``output_file_path`` as UTF-8 JSON.

    Each ``sentiment_data`` entry holds the sentence's character span
    (``begin_char``, ``end_char``), a star rating (``sentiment``) and the
    classifier confidence (``score``).

    Args:
        input_file_path: Path of the JSON file to read.
        output_file_path: Path of the JSON file to write.
    """
    # Classifier label -> star rating; any other label maps to 0, as before.
    # Presumably LABEL_0/1/2 mean negative/neutral/positive — confirm on the model card.
    stars_by_label = {"LABEL_0": 1, "LABEL_1": 3, "LABEL_2": 5}

    with open(input_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    answer = data["question"]["textesReponse"]["texteReponse"]
    doc = nlp(answer["texte"])

    sentiment_data = []

    for sent in doc.sents:
        # Truncate over-long sentences: without this, input beyond the model's
        # 512-token limit makes the pipeline raise and aborts the whole run.
        # max_length is given explicitly because truncation=True alone relies on the
        # tokenizer declaring a maximum length — TODO confirm this checkpoint's does.
        sentiment = sentiment_pipeline(sent.text, truncation=True, max_length=512)[0]

        sentiment_data.append({
            "begin_char": sent.start_char,
            "end_char": sent.end_char,
            "sentiment": stars_by_label.get(sentiment["label"], 0),
            "score": sentiment["score"]
        })

    answer["sentiment_data"] = sentiment_data

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [52]:
from tqdm.auto import tqdm
import os
import glob

# Enrichment step 2: read the pronoun-enriched files from `processed` and write
# sentiment-enriched copies, under the same file names, into `processed_2`.
input_folder = "/content/drive/MyDrive/AI/QAG/enrichment/processed"
output_folder = "/content/drive/MyDrive/AI/QAG/enrichment/processed_2"

# Create the destination on first run; exist_ok keeps re-runs from failing.
os.makedirs(output_folder, exist_ok=True)

input_files = glob.glob(os.path.join(input_folder, "*.json"))

for input_file in tqdm(input_files):
    file_name = os.path.basename(input_file)
    output_file = os.path.join(output_folder, file_name)
    perform_sentiment_analysis(input_file, output_file)


  0%|          | 0/639 [00:00<?, ?it/s]

# Enrichment 3: named entities, noun phrases and sentiment polarity

In [None]:
# NOTE: the code in this cell has not been tested yet.
from spacy import displacy

def enrich_with_relevant_metrics(input_file_path, output_file_path):
    """Add entities, noun phrases and sentence polarity to one JSON document.

    Reads the JSON file at ``input_file_path``, runs the shared ``nlp`` pipeline
    on the text stored under question -> textesReponse -> texteReponse -> texte
    and stores three keys next to that text:

    * ``entities``: one dict per ``doc.ents`` item with its character span,
      label and text.
    * ``noun_phrases``: one dict per ``doc.noun_chunks`` item with its character
      span and text.
    * ``sentiment_data``: one dict per sentence with its character span, a
      polarity in {-1, 0, 1} and the classifier confidence.

    The enriched document is written to ``output_file_path`` as UTF-8 JSON.

    NOTE(review): ``sentiment_data`` is assigned unconditionally, so a value
    already present in the input under that key (the star-rating step writes
    the same key) is replaced — confirm this is intended.

    Args:
        input_file_path: Path of the JSON file to read.
        output_file_path: Path of the JSON file to write.
    """
    # Classifier label -> polarity; any other label maps to 0, as before.
    # Presumably LABEL_0/1/2 mean negative/neutral/positive — confirm on the model card.
    polarity_by_label = {"LABEL_0": -1, "LABEL_1": 0, "LABEL_2": 1}

    with open(input_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    answer = data["question"]["textesReponse"]["texteReponse"]
    doc = nlp(answer["texte"])

    # Named Entity Recognition
    answer["entities"] = [
        {"start_char": ent.start_char, "end_char": ent.end_char, "label": ent.label_, "text": ent.text}
        for ent in doc.ents
    ]

    # Key Phrases Extraction
    answer["noun_phrases"] = [
        {"start_char": chunk.start_char, "end_char": chunk.end_char, "text": chunk.text}
        for chunk in doc.noun_chunks
    ]

    # Sentiment Polarity
    sentiment_data = []
    for sent in doc.sents:
        # Truncate over-long sentences: without this, input beyond the model's
        # 512-token limit makes the pipeline raise and aborts the whole run.
        # max_length is given explicitly because truncation=True alone relies on the
        # tokenizer declaring a maximum length — TODO confirm this checkpoint's does.
        sentiment = sentiment_pipeline(sent.text, truncation=True, max_length=512)[0]

        sentiment_data.append({
            "begin_char": sent.start_char,
            "end_char": sent.end_char,
            "polarity": polarity_by_label.get(sentiment["label"], 0),
            "score": sentiment["score"],
        })

    answer["sentiment_data"] = sentiment_data

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


  0%|          | 0/639 [00:00<?, ?it/s]

NameError: ignored

In [None]:
from tqdm.auto import tqdm
import os
import glob

# Enrichment step 3: read the sentiment-enriched files from `processed_2` and write
# copies with entities, noun phrases and polarity, under the same file names, into `processed_3`.
input_folder = "/content/drive/MyDrive/AI/QAG/enrichment/processed_2"
output_folder = "/content/drive/MyDrive/AI/QAG/enrichment/processed_3"

# Create the destination on first run; exist_ok keeps re-runs from failing.
os.makedirs(output_folder, exist_ok=True)

input_files = glob.glob(os.path.join(input_folder, "*.json"))

for input_file in tqdm(input_files):
    file_name = os.path.basename(input_file)
    output_file = os.path.join(output_folder, file_name)
    enrich_with_relevant_metrics(input_file, output_file)
