In [1]:
import pandas as pd
from opsci_toolbox.helpers.common import load_pickle, write_pickle, load_csv, create_dir, read_json, write_json
from opsci_toolbox.helpers.nlp import load_stopwords_df, load_spacy_model, TM_clean_text, PRarmy_nlp_process, load_HF_embeddings, HF_vectorize
from opsci_toolbox.apis.webscraping import url_get_domain, parallel_twitter_oembed
from opsci_toolbox.helpers.surreaction import *
import os
from datetime import datetime
import pytz

# --- Project identity and output location ---
# NOTE(review): these are absolute, machine-specific paths; consider making them
# relative to a configurable base directory.
project = "TWITTER_LISTS_USERS_COMMUNS"
path = "/home/erwan/scripts/st_pr_v3/notebooks/OUTPUTS"

# JSON configuration file describing the corpus to process.
config_file = "/home/erwan/scripts/st_pr_v3/notebooks/configs/config_twitter_lists_users_communs.json"

# --- Time window used to filter the corpus (timezone-aware, UTC) ---
start_date = datetime(year=2024, month=6, day=1, tzinfo=pytz.UTC)
end_date = datetime(year=2024, month=7, day=15, tzinfo=pytz.UTC)

# --- spaCy configuration ---
# Language of the stopwords.
spacy_lang = "en"
# spaCy model to import: ru_core_news_lg, en_core_web_lg, fr_core_news_lg
spacy_model = "en_core_web_lg"
# Part-of-speech tags and entity labels handed to the NLP processing step.
pos_to_keep = [
    "VERB",
    "NOUN",
    "ADJ",
    "ADV",
    "PROPN",
]
entities_to_keep = [
    'PERSON',
    'ORG',
    'LOC',
]

# --- Vectorisation ---
embedding_model = "all-MiniLM-L6-v2"
# NOTE(review): the device is fixed to the first CUDA GPU; presumably this
# fails on a CPU-only machine — confirm against load_HF_embeddings.
model_kwargs = dict(device='cuda:0')
encode_kwargs = dict(batch_size=8)


# --- Corpus paths (commented out) ---
# corpus_name = "PATRIOTES"
# corpus_file = "/home/erwan/scripts/bertopic/OUTPUTS/PR_army_v8/df_with_translatdion.pickle"
# twitter_corpus = "/home/erwan/scripts/bertopic/OUTPUTS/PR_army_v8_twitter_listes_users_communs/df_with_translation.pickle"
# df_patriots = load_pickle("/home/erwan/scripts/bertopic/OUTPUTS/PR_army_v8_twitter/df_with_translation.pickle")

2024-07-18 19:20:25.464182: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-18 19:20:25.495191: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Environnement

In [2]:
# Read the JSON configuration of the corpus.
# NOTE: os.path.join discards `path` whenever `config_file` is an absolute
# path, in which case the file is read from `config_file` directly.
config_path = os.path.join(path, config_file)
config = read_json(config_path)

# Per-project output directory; the value returned by create_dir is used as
# the directory path by the following cells.
project_dir = os.path.join(path, project)
path_project = create_dir(project_dir)

# Nettoyage

In [3]:
# --- Load and clean the corpus, compute over-reaction metrics, persist to parquet ---
# NOTE(review): load_pickle presumably unpickles the file; only point
# config["corpus_file"] at trusted files.
df = load_pickle(config["corpus_file"])

# Remove duplicated posts (same identifier).
df = df.drop_duplicates(subset=config['col_id'])

# Keep only the posts dated inside [start_date, end_date] (both bounds included).
# An explicit copy is taken so that the column assignments below work on an
# independent frame rather than on a slice of the loaded corpus.
date_col = config['col_date']
in_period = (df[date_col] >= start_date) & (df[date_col] <= end_date)
df = df[in_period].copy()

# Store the date as a formatted string in 'date', then drop the source column.
# Guard: when the source column is itself named 'date' it has just been
# overwritten in place, and dropping it would delete the column we need below.
df['date'] = df[date_col].dt.strftime('%Y-%m-%d %H:%M:%S')
if date_col != 'date':
    df = df.drop(columns=[date_col])

# Add constant descriptive columns.
df['plateforme'] = config['plateforme']
df['corpus_name'] = config['corpus_name']

# Total engagement = row-wise sum of the configured engagement columns.
df['engagements'] = df[config['cols_engagements']].fillna(0).sum(axis=1)

# Keep only the columns listed in the mapping, plus the ones created above.
cols_to_keep = list(config['col_mapping'].keys()) + ['plateforme', 'corpus_name', 'engagements', 'date']
df = df[cols_to_keep].copy()

# Each mapping entry is indexed as [target dtype, fill value, new column name]:
# fill missing values, then cast.
for col, values in config['col_mapping'].items():
    df[col] = df[col].fillna(values[1]).astype(values[0])

# Rename the columns to their mapped names.
new_columns_names = {col: values[2] for col, values in config['col_mapping'].items()}
df = df.rename(columns=new_columns_names)

# Keep only the rows that actually contain text.
df = df[df['text'].str.len() > 0].copy()

# Extract the domain name of each URL.
df["domain"] = df["url"].apply(url_get_domain)
df = df.reset_index(drop=True)

######################
# OVER-REACTION
######################

df[config["cols_sureaction_metrics"]] = df[config["cols_sureaction_metrics"]].astype(int)

# NOTE(review): presumably computes a rolling average performance per author
# (it adds the '*_avg' columns dropped below) — confirm in opsci_toolbox.
df = avg_performance(
    df,
    col_date="date",
    col_author_id="user_id",
    col_engagement=config["cols_sureaction_metrics"],
    rolling_period=config["rolling_period_sureaction"],
)

# Compute the over-reaction rates for the configured metrics; missing rates
# are replaced by -1.
df = kpi_reaction(df, config["cols_sureaction_metrics"])
cols_tx_engagement = ['tx_' + c for c in config["cols_sureaction_metrics"]]
df[cols_tx_engagement] = df[cols_tx_engagement].fillna(-1)

# Drop the average-performance columns (no longer needed) and 'index'.
# NOTE(review): 'index' is presumably a helper column left by the functions
# above — confirm; the drop raises KeyError if it is absent.
cols_to_drop = [c for c in df.columns if c.lower().endswith('_avg')]
cols_to_drop += ['index']
df = df.drop(columns=cols_to_drop)

# Categorise the kind of reaction into 'type_engagement', then drop the rate
# columns that were used for the typology.
cols_typologie = ["tx_" + col for col in config["cols_typologie_sureaction"]]
df = get_reactions_type(df, cols_typologie, 'type_engagement')
df = df.drop(columns=cols_typologie)

# Persist the cleaned corpus: <start>_<end>_<plateforme>_<corpus_name>.parquet
parquet_path = os.path.join(path_project, f'{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}_{config["plateforme"]}_{config["corpus_name"]}.parquet')
df.to_parquet(parquet_path)

  df['engagements'] = df[config['cols_engagements']].fillna(0).sum(axis=1)
  df[col] = df[col].fillna(values[1]).astype(values[0])
  df[col] = df[col].fillna(values[1]).astype(values[0])
  df[col] = df[col].fillna(values[1]).astype(values[0])
generation des index: 100%|██████████| 199562/199562 [00:04<00:00, 45081.40it/s]
generation des index: 100%|██████████| 13957/13957 [00:00<00:00, 43209.19it/s]
qualification des posts: 100%|██████████| 199562/199562 [00:04<00:00, 47709.38it/s]


# Récupération des Embed Tweets

In [None]:
# Parquet file shared by the pipeline stages for this period / platform / corpus.
parquet_path = os.path.join(path_project, f'{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}_{config["plateforme"]}_{config["corpus_name"]}.parquet')

# Reload the corpus from disk when the cell is run without `df` in scope.
if 'df' not in locals():
    df = pd.read_parquet(parquet_path)

if config['plateforme'] != "Twitter":
    # Other platforms have no embed to fetch: add an empty 'tweet_html' column.
    df["tweet_html"] = None
else:
    # Fetch the embeds for every (user_name, message_id) pair and attach the
    # result to the corpus with a left join on those two keys.
    usernames = df['user_name'].tolist()
    tweet_ids = df['message_id'].tolist()
    df_embed = parallel_twitter_oembed(usernames, tweet_ids, omit_script=True)
    df = df.merge(df_embed, how='left', on=['user_name', "message_id"])

df.to_parquet(parquet_path)

# Lemmatisation & NER

In [5]:
# Parquet file shared by the pipeline stages for this period / platform / corpus.
parquet_path = os.path.join(path_project, f'{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}_{config["plateforme"]}_{config["corpus_name"]}.parquet')

# Reload the corpus from disk when the cell is run without `df` in scope.
if 'df' not in locals():
    df = pd.read_parquet(parquet_path)

# Stopword list for the configured language.
df_stopwords = load_stopwords_df(spacy_lang)
stopwords = df_stopwords['word'].to_list()

# spaCy pipeline, loaded with the listed components disabled.
disabled_components = [
    "transformer",
    "trainable_lemmatizer",
    "textcat_multilabel",
    "textcat",
    "entity_ruler",
    "entity_linker",
]
nlp = load_spacy_model(spacy_model, disable_components=disabled_components, lang_detect=False, emoji=True)

# Basic pre-cleaning: "translated_text" -> "clean_text".
print("TEXT PRECLEANING")
df = TM_clean_text(df, "translated_text", "clean_text")

# NLP processing: "clean_text" -> "lemmatized_text", given the PoS tags,
# entity labels and stopwords configured above.
print("NLP PROCESS")
df = PRarmy_nlp_process(
    nlp,
    df,
    "clean_text",
    "lemmatized_text",
    pos_to_keep,
    entities_to_keep,
    stopwords,
    batch_size=100,
    n_process=1,
)

# The intermediate cleaned text is not persisted.
df = df.drop(columns=['clean_text'])

df.to_parquet(parquet_path)

TEXT PRECLEANING
NLP PROCESS


NLP Process: 100%|██████████| 199562/199562 [04:12<00:00, 789.76it/s] 


# Vectorization

In [None]:
# Parquet file shared by the pipeline stages for this period / platform / corpus.
parquet_path = os.path.join(path_project, f'{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}_{config["plateforme"]}_{config["corpus_name"]}.parquet')

# Reload the corpus from disk when the cell is run without `df` in scope.
if 'df' not in locals():
    df = pd.read_parquet(parquet_path)

# Build the encoder from the model name and kwargs set in the config cell.
HF_encoder = load_HF_embeddings(embedding_model, encode_kwargs, model_kwargs)

# Encode every translated text and store the result in an "embeddings" column.
# NOTE(review): assumes HF_vectorize returns one vector per input text, in
# input order — confirm.
texts = list(df["translated_text"])
embeddings = HF_vectorize(HF_encoder, texts)
df["embeddings"] = embeddings

df.to_parquet(parquet_path)

In [None]:
# Disabled alternative to the vectorisation cell: encode "translated_text" with
# the BAAI/bge-m3 model through FlagEmbedding, store the dense vectors in
# "embeddings" and write the corpus back to the same parquet file.
# from FlagEmbedding import BGEM3FlagModel

# model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False) 

# embeddings = model.encode(list(df["translated_text"]), 
#                             batch_size=8, 
#                             max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
#                             )['dense_vecs']

# df["embeddings"]=embeddings.tolist()
# df.to_parquet(os.path.join(path_project, f'{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}_{config["plateforme"]}_{config["corpus_name"]}.parquet'))  