In [1]:
from opsci_toolbox.helpers.common import load_pickle, write_pickle, write_json
from opsci_toolbox.helpers.dataviz import generate_hexadecimal_color_palette
from tqdm import tqdm
%load_ext cudf.pandas
import pandas as pd
import pytz
from datetime import datetime
import re
from opsci_toolbox.helpers.nlp import *

## Helper that builds the join keys used to match posts with daily averages
def generate_index(df, col_author_id='author_id', col_date='created_time'):
    """
    Add an "index" column holding a per-author, per-day join key.

    The key has the form "<author>.<year>.<month>.<day>" (no zero padding),
    for example "42.2024.1.5".

    Args:
        df: DataFrame to annotate. It is modified in place.
        col_author_id: name of the column holding the author identifier.
        col_date: name of the column holding datetime-like values
            (each value must expose .year, .month and .day).

    Returns:
        The same DataFrame object, with the "index" column added or overwritten.
    """
    # Iterate over the two required columns only. df.iterrows() would build a
    # full Series for every row, which is needlessly slow on wide frames.
    df["index"] = [
        ".".join(map(str, (author, date.year, date.month, date.day)))
        for author, date in zip(df[col_author_id], df[col_date])
    ]

    return df
                     
def avg_performance(df, 
                    col_date='created_time', 
                    col_author_id='author_id', 
                    col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha', 
                                    'sad', 'angry','total_engagement', 'replies', 'percentage_replies'], 
                    rolling_period='7D'):
    """
    Attach to every row the rolling average of its author's engagement metrics.

    For each author, a time-based rolling mean of `col_engagement` is computed
    over `rolling_period`, then reduced to one value per calendar day (the last
    one of that day). These daily values are merged back onto the rows through
    the "<author>.<year>.<month>.<day>" key produced by generate_index().

    Args:
        df: input DataFrame. It is not modified; the function works on a copy.
        col_date: name of the date column (parsed with pd.to_datetime).
        col_author_id: name of the column identifying the author.
        col_engagement: names of the metric columns to average.
        rolling_period: pandas offset string giving the rolling window length.

    Returns:
        A new DataFrame sorted by author and date, with one extra
        "<metric>_avg" column per metric and the "index" join-key column.
    """
    # The merge below concatenates lists, so accept any sequence of names
    col_engagement = list(col_engagement)

    # Work on a copy: assigning the parsed dates straight into `df` would
    # modify the caller's frame and raise SettingWithCopyWarning on a slice.
    df = df.copy()
    df[col_date] = pd.to_datetime(df[col_date])
    df = df.sort_values([col_author_id, col_date])

    # The date column is the pivot: it becomes the index, rows are grouped by
    # author and the metrics are averaged over a `rolling_period` time window.
    # The result is then flattened back into regular columns.
    average = df.set_index(col_date).groupby(col_author_id)[col_engagement].rolling(rolling_period).mean(numeric_only=True).reset_index()

    # Reduce the previous result to one row per (author, day), keeping the last value of each day
    average = average.set_index(col_date).groupby([col_author_id]).resample('1D').last(numeric_only=True).reset_index()

    # Build the join keys on both sides
    df = generate_index(df, col_author_id=col_author_id, col_date=col_date)

    average = generate_index(average, col_author_id=col_author_id, col_date=col_date)

    # Merge: the averaged metrics collide with the raw ones and receive the "_avg" suffix
    df = pd.merge(df, average[['index'] + col_engagement], how='left', on=['index'], suffixes=('', '_avg'))

    return df

def kpi_reaction(df, cols):
    """
    Compute an over-reaction rate for every metric listed in `cols`.

    For each metric `m`, a column "tx_<m>" is written to `df`, defined as
    (m - m_avg) / (m + m_avg), where `m_avg` is the column named "<m>_avg".
    The DataFrame is modified in place and returned.
    """
    for metric in cols:
        observed = df[metric]
        baseline = df[metric + '_avg']
        gap = observed - baseline
        total = observed + baseline
        df['tx_' + metric] = gap / total
    return df

def get_reactions_type(df, cols, col_dest):
    """
    Label every row according to which rate columns are strictly positive.

    Args:
        df: DataFrame holding the rate columns. It is modified in place.
        cols: names of the rate columns to inspect.
        col_dest: name of the column receiving the label.

    Returns:
        The same DataFrame, with `col_dest` set to:
          - "sur reaction totale" when every column of `cols` is > 0,
          - "sous reaction" when none is,
          - otherwise the space-separated names of the positive columns,
            with "tx_" replaced by "sur-".
    """
    cols = list(cols)
    # Label fragment for each rate column, e.g. "tx_views" -> "sur-views"
    tags = [col.replace('tx_', 'sur-') for col in cols]

    # One boolean row per record, computed in a single vectorised comparison
    # instead of df.iterrows(). NaN compares as not positive.
    positive = (df[cols] > 0).to_numpy()

    all_val = []
    for flags in positive:
        count = int(flags.sum())
        if count == len(cols):
            label = "sur reaction totale"
        elif count == 0:
            label = "sous reaction"
        else:
            label = " ".join(tag for tag, hit in zip(tags, flags) if hit)
        all_val.append(label.strip())

    df[col_dest] = all_val
    return df

def remove_brackets(text, replacement=""):
    """
    Replace every square-bracketed span of `text` (brackets included) with `replacement`.

    Matching is non-greedy, so each "[...]" group is handled on its own.
    Because "." does not match a newline here, a span is only replaced when
    it opens and closes on the same line.
    """
    bracketed = re.compile(r'\[.*?\]', flags=re.IGNORECASE)
    return bracketed.sub(replacement, text)


2024-05-15 09:27:35.738063: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 09:27:36.190977: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
import pandas as pd 

# Load the production DataFrame from its pickle file.
# NOTE(review): absolute, machine-specific path; only unpickle files from a trusted source.
df = pd.read_pickle("/home/erwan/scripts/st_pr_v2/data/df_prod_v2.pickle")

In [4]:
from opsci_toolbox.helpers.common import load_pickle

# Load the same pickle file through the opsci_toolbox load_pickle helper; the result is kept in `d`.
# NOTE(review): `d` is not used by any other visible cell — confirm whether this cell is still needed.
d = load_pickle("/home/erwan/scripts/st_pr_v2/data/df_prod_v2.pickle")

In [3]:
from opsci_toolbox.helpers.common import write_pickle


# Save `df` as "df_prod_v2" in the given directory; the cell output shows the path of the written .pickle file.
# NOTE(review): absolute, machine-specific path.
write_pickle(df, "/home/erwan/scripts/st_pr_v2/data", "df_prod_v2")

'/home/erwan/scripts/st_pr_v2/data/df_prod_v2.pickle'

In [2]:
# --- Notebook configuration ---

# Column names that are currently unused (kept for reference)
# col_user_id = "origin_id1"
# col_post_id = "uniq_id"
# Date column handed to avg_performance()
col_date = "message_date"


# Metrics for which a rolling average and an over-reaction rate ("tx_<metric>") are computed
cols_sureaction_metrics = ['views', 'engagements', 'total_reactions', 'replies_count', 'forwards']

# Subset of metrics used to label the reaction type of each post
cols_typologie_sureaction=['views', "total_reactions", "forwards"]

# Length of the rolling window used for the average (pandas offset string)
rolling_period_sureaction = '7D'

# Bounds (UTC) of the period kept when filtering the data on its "datetime" column
start_date = datetime(2024, 1, 1, tzinfo=pytz.UTC) 
end_date = datetime(2024, 4, 1, tzinfo=pytz.UTC)

# NLP settings
spacy_lang = "en"                                       # language of the stopwords
spacy_model = "en_core_web_lg"                         # spaCy model to load: ru_core_news_lg, en_core_web_lg, fr_core_news_lg
# Part-of-speech tags handed to TM_nlp_process()
pos_to_keep = ["VERB","NOUN","ADJ", "ADV", "PROPN"] 

In [2]:

# Load the input DataFrame (relative path).
# NOTE(review): only unpickle files from a trusted source.
df = pd.read_pickle("data/df.pickle")


In [14]:
# Keep only the rows whose "datetime" lies between start_date and end_date (both bounds included).
# .copy() makes the result an independent frame, so the column assignments done
# in later cells do not raise SettingWithCopyWarning.
df = df[(df['datetime'] >= start_date) & (df['datetime'] <= end_date)].copy()

In [None]:
# Calendar day of each post, derived from its timestamp
df["date"] = df["datetime"].dt.date

# Attach, per channel, the rolling average of every metric (columns suffixed "_avg")
df = avg_performance(
    df,
    col_date=col_date,
    col_author_id="channel",
    col_engagement=cols_sureaction_metrics,
    rolling_period=rolling_period_sureaction,
)

# Compute the over-reaction rate of every metric in our list
df = kpi_reaction(df, cols_sureaction_metrics)
cols_tx_engagement = [f"tx_{metric}" for metric in cols_sureaction_metrics]
# Missing rates are replaced by -1
df[cols_tx_engagement] = df[cols_tx_engagement].fillna(-1)


# Drop the columns holding the average performance (they should no longer be needed)
cols_to_drop = [name for name in df.columns if name.lower().endswith('_avg')]
df = df.drop(columns=cols_to_drop)

# Categorise the kind of reaction of each post
cols_typologie = [f"tx_{metric}" for metric in cols_typologie_sureaction]
df = get_reactions_type(df, cols_typologie, 'type_engagement')


In [16]:
# Stopword list for the configured language.
# NOTE(review): load_stopwords_df, load_spacy_model, TM_clean_text, remove_extra_spaces and
# TM_nlp_process presumably come from the `opsci_toolbox.helpers.nlp` star import — confirm.
df_stopwords = load_stopwords_df(spacy_lang)
stopwords = df_stopwords['word'].to_list()

# Load the spaCy pipeline, with the listed components disabled
nlp = load_spacy_model(spacy_model,  disable_components=["transformer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect=False, emoji=True)

# basic precleaning of text: reads "translated_text", writes "clean_text"
print("TEXT PRECLEANING")
df = TM_clean_text(df, "translated_text", "clean_text")

# Strip [bracketed] spans, then apply remove_extra_spaces to the result
df["clean_text"] = df["clean_text"].apply(remove_brackets)
df["clean_text"] = df["clean_text"].apply(remove_extra_spaces)

# lemmatize text, remove stop words and keep only some PoS: reads "clean_text", writes "lemmatized_text"
print("NLP PROCESS")
df = TM_nlp_process(nlp, df, "clean_text", "lemmatized_text", pos_to_keep, stopwords, batch_size=100, n_process=1, stats=False, join_list = True) 

TEXT PRECLEANING
NLP PROCESS


NLP Process: 100%|██████████| 49641/49641 [07:04<00:00, 117.02it/s]


In [17]:
# Build one colour palette per reaction type and one per channel.
# The palettes are used below as value -> colour mappings (presumably dicts of hexadecimal colour codes — confirm).
reaction_color_palette = generate_hexadecimal_color_palette(df["type_engagement"].unique())
channel_color_palette = generate_hexadecimal_color_palette(df["channel"].unique())

# Attach the colour of its channel and of its reaction type to every row
df["channel_color"]=df["channel"].map(channel_color_palette)
df["surreaction_color"]=df["type_engagement"].map(reaction_color_palette)
# Persist both palettes as JSON files in the "data" directory
write_json(channel_color_palette, "data", "channel_color_palette")
write_json(reaction_color_palette, "data", "reaction_color_palette")

'data/reaction_color_palette.json'

In [18]:
# Embedding model settings
embedding_model = "BAAI/bge-m3"                 # alternatives: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3",  "DeepPavlov/rubert-base-cased"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {"batch_size":2}

# Presumably instantiates a Hugging Face encoder from the settings above — confirm against opsci_toolbox
HF_encoder = load_HF_embeddings(embedding_model, encode_kwargs, model_kwargs)

# Vectorise the "translated_text" column (not the raw "text" column)
embeddings = HF_vectorize(HF_encoder, df["translated_text"])

# Store the vectors next to the rows they were computed from
df['embeddings'] = embeddings

# Columns kept in the exported dataset
cols_to_keep = ['origin_id1', 'channel_id', "channel", "channel_description", 'message_id', "uniq_id", "message_date", "date", "datetime", "text", "translated_text", 
                "is_reply", 'views', 'forwards', 'replies_count', 'total_reactions', "engagements", "sentiment", 'tx_views',
                'tx_engagements', 'tx_total_reactions', 'tx_replies_count',
                'tx_forwards', 'type_engagement', "channel_color", "surreaction_color", "lemmatized_text", "embeddings"]


df=df[cols_to_keep]

# Save the reduced frame as "df_prod_chroma" in the "data" directory
write_pickle(df, "data", "df_prod_chroma")

'data/df_prod_chroma.pickle'

In [3]:
from opsci_toolbox.helpers.nlp import spacy_NER
# NOTE(review): these assignments override the English settings of the configuration cell with Russian ones.
spacy_lang = "ru"                                       # language of the stopwords
spacy_model = "ru_core_news_lg"                         # spaCy model to load: ru_core_news_lg, en_core_web_lg, fr_core_news_lg
pos_to_keep = ["VERB","NOUN","ADJ", "ADV", "PROPN"] 

# Load the spaCy pipeline used for entity recognition, with the listed components disabled
nlp = load_spacy_model(spacy_model,  disable_components=["transformer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect=False, emoji=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_type'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_text'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_start_char'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [5]:

# Run entity recognition on the raw "text" column, keeping PER, ORG and LOC entities.
# NOTE(review): explode=True presumably yields one row per detected entity — confirm against opsci_toolbox.
df_ner = spacy_NER(nlp, df, "text", entities_to_keep=['PER','ORG', 'LOC'], explode= True)

: 

In [None]:
# Display the NER result
df_ner

In [5]:
import chromadb

# Persistent (on-disk) Chroma client.
# NOTE(review): absolute, machine-specific path.
chroma_client = chromadb.PersistentClient(path="/home/erwan/scripts/bertopic/chroma")



# Embedding model settings (not used by the ingestion below, which reuses stored vectors)
embedding_model = "BAAI/bge-m3"                 # alternatives: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3",  "DeepPavlov/rubert-base-cased"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {'batch_size':32}

# Store the date as a plain "YYYY-MM-DD" string so it can be used as a metadata value
df["date"] = df["datetime"].dt.strftime("%Y-%m-%d")

cols_metadata = ['origin_id1', 'channel_id', "channel", 'message_id', "date", "is_reply", 'views', 'forwards', 'replies_count', 'total_reactions', "engagements", "sentiment", 'tx_views', 'tx_engagements', 'tx_total_reactions', 'tx_replies_count', 'tx_forwards', 'type_engagement', "translated_text"]
col_text = "text"
col_id = "uniq_id"

# get_or_create_collection keeps the cell re-runnable: create_collection fails
# when the collection already exists in the persistent store.
collection = chroma_client.get_or_create_collection(name="my_collection")

# Small random sample used to try out the ingestion
df_filter = df.sample(10)
lst_text = list(df_filter[col_text])
# Chroma ids must be strings
lst_ids = df_filter[col_id].astype(str).tolist()
metadatas = df_filter[cols_metadata].to_dict(orient="records")
# The vectors must be those of the sampled rows: passing the global `embeddings`
# (one vector per row of the full frame) does not match the 10 documents and ids.
# Assumes `df` carries its vectors in the "embeddings" column, as the frame saved
# to "df_prod_chroma" does — TODO confirm for the frame loaded in this session.
lst_embeddings = [[float(value) for value in vector] for vector in df_filter["embeddings"]]


collection.add(
    embeddings=lst_embeddings,
    documents=lst_text,
    metadatas=metadatas,
    ids=lst_ids
)

: 