# Mots

In [1]:
!pip install pandas
!pip install spacy
!pip install Counter

Collecting spacy
  Downloading spacy-3.8.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp312-cp312-manylinux_2_17_x86

In [None]:
import pandas as pd
import spacy
from collections import Counter

# Load the merged tweets workbook.
df = pd.read_excel('tweets_fusionnes.xlsx')

# Add a 'YYYY-MM' column used for monthly grouping, unless it already exists.
if 'YearMonth' not in df.columns:
    # pd.to_datetime leaves a datetime column unchanged but also accepts dates
    # that were read as text/objects, on which `.dt` alone raises AttributeError.
    df['YearMonth'] = pd.to_datetime(df['ConvertedDate']).dt.strftime('%Y-%m')

# Load the spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")

# Custom stopwords (add here any extra words you want to exclude from the counts).
custom_stopwords = {"https", "twitter", "elon", "musk", "people","quit","leave","stay", "like", "amp", "com", "user", "retweet", "x"}

def clean_and_tokenize_spacy(text):
    """
    Tokenise and clean a text with spaCy, returning the lemmas worth counting.

    The text is lower-cased and passed through the module-level ``nlp``
    pipeline. A token's lemma is kept unless the token is flagged as a spaCy
    stopword, is not purely alphabetic, or its lemma is listed in the
    module-level ``custom_stopwords`` set.

    Args:
        text: The raw text to process. Any value that is not a ``str``
            yields an empty list.

    Returns:
        list: Lemmas of the retained tokens, in the order they appear.
    """
    if not isinstance(text, str):
        return []

    kept_lemmas = []
    for tok in nlp(text.lower()):
        # Drop spaCy stopwords and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemma = tok.lemma_  # base (dictionary) form of the word
        # Drop the words the user explicitly asked to exclude.
        if lemma in custom_stopwords:
            continue
        kept_lemmas.append(lemma)
    return kept_lemmas

# Clean and tokenise every tweet with spaCy (one list of lemmas per row).
df['CleanedText'] = df['Content'].apply(clean_and_tokenize_spacy)

# Number of top words kept per month; also the number of Word_* output columns.
TOP_N = 10

# Group by month and count the words.
monthly_top_words = {}

for month, group in df.groupby('YearMonth'):
    all_words = [word for text in group['CleanedText'] for word in text]
    word_counts = Counter(all_words)
    top_words = word_counts.most_common(TOP_N)  # list of (word, count) tuples
    # Pad short months so every row has exactly TOP_N entries. Without this,
    # building the frame raises a column-count ValueError when no month
    # contains TOP_N distinct words.
    top_words += [None] * (TOP_N - len(top_words))
    monthly_top_words[month] = top_words

# Convert the results to a DataFrame: one row per month, one column per rank.
result_df = pd.DataFrame.from_dict(
    monthly_top_words,
    orient='index',
    columns=[f'Word_{rank}' for rank in range(1, TOP_N + 1)],
)
# Move the month labels out of the index into a regular 'YearMonth' column.
result_df = result_df.rename_axis('YearMonth').reset_index()

# Save to an Excel file.
result_df.to_excel('top_words_per_month_spacy_filtered.xlsx', index=False)

print("Les 10 mots les plus récurrents par mois ont été calculés et sauvegardés dans 'top_words_per_month_spacy_filtered.xlsx'.")
