- Remove punctuation and stop words
- Tokenize the cleaned tweets
- Compute token count, type count, and the token-to-type ratio (labelled TTR below; computed as tokens ÷ types)
- Lemmatize each token
- Count lemma frequency
- Show the 20 most common lemmas

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q spacy
!python -m spacy download fr_core_news_sm
import pandas as pd

import spacy
from collections import Counter

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from collections import Counter
import pandas as pd
import spacy

# Load the small French spaCy pipeline
nlp = spacy.load("fr_core_news_sm")

# Load the cleaned French tweets CSV from Drive
df = pd.read_csv('/content/drive/My Drive/_thesis_dogwhistles/Analyze-tweets/left-cleaned-french-tweets.csv')

# Non-null values of the 'content' column, coerced to str, as a plain list for nlp.pipe
texts = df['content'].dropna().astype(str).tolist()

# Lower-cased surface forms and lemmas of every retained token (parallel lists)
all_tokens = []
all_lemmas = []

# NER and the dependency parser are switched off for this pass
for doc in nlp.pipe(texts, disable=["ner", "parser"]):
    for token in doc:
        # Drop stop words and every token that is not purely alphabetic
        # (punctuation, numbers, emojis, ...)
        if token.is_stop or not token.is_alpha:
            continue
        all_tokens.append(token.text.lower())
        all_lemmas.append(token.lemma_.lower())

# Corpus size (tokens) and vocabulary size (types)
total_tokens = len(all_tokens)
unique_tokens = len(set(all_tokens))

# NOTE(review): this is tokens / types (average occurrences per type). The
# conventional type-token ratio is types / tokens, i.e. the inverse of this
# value -- confirm which one the write-up intends before citing the number.
if unique_tokens > 0:
    ttr = total_tokens / unique_tokens
else:
    ttr = 0

# Frequency of each lemma, skipping lemmas that are empty or whitespace-only
lemma_freq = Counter(lemma for lemma in all_lemmas if lemma.strip())

# Summary report
report_lines = [
    "Tokenization and Lemmatization Report",
    f"Total tokens (excluding stopwords, punctuation, numbers): {total_tokens}",
    f"Unique tokens (types): {unique_tokens}",
    f"Token-to-Type Ratio (TTR): {ttr:.2f}",
]
print("\n".join(report_lines))

print("\nTop 20 Most Frequent Lemmas:")
for lemma, freq in lemma_freq.most_common(20):
    print(f"{lemma}: {freq}")


Tokenization and Lemmatization Report
Total tokens (excluding stopwords, punctuation, numbers): 1434864
Unique tokens (types): 92355
Token-to-Type Ratio (TTR): 15.54

Top 20 Most Frequent Lemmas:
faire: 10012
être: 9664
bien: 8731
france: 6359
falloir: 5919
non: 5753
bon: 5713
voir: 5687
rien: 5236
vouloir: 5043
contre: 4536
an: 4262
français: 4228
oui: 4186
monde: 3687
savoir: 3675
politique: 3614
pouvoir: 3609
avoir: 3594
prendre: 3582


In [None]:
# Persist the retained lemmas to Drive, one lemma per line (no trailing newline)
lemma_path = '/content/drive/My Drive/_thesis_dogwhistles/Analyze-tweets/left-lemmas.txt'
with open(lemma_path, mode='w', encoding='utf-8') as f:
    f.write('\n'.join(all_lemmas))

The 30 most frequent noun lemmas and the 30 most frequent adjective lemmas (stop words excluded)

In [None]:
import spacy
from collections import Counter

# Load French spaCy model
nlp = spacy.load("fr_core_news_sm")

# Non-null values of the 'content' column, coerced to str
texts = df['content'].dropna().astype(str).tolist()

# Lower-cased lemmas of the retained tokens, split by coarse part of speech
nouns = []
adjectives = []

for doc in nlp.pipe(texts, batch_size=500, disable=["ner", "parser"]):
    for token in doc:
        # Same filter as the tokenization cell: purely alphabetic, non-stop-word
        # tokens only. Checking is_punct alone let emojis and other symbols
        # through, and the tagger then labelled some of them NOUN/ADJ, so they
        # showed up in the rankings.
        if token.is_stop or not token.is_alpha:
            continue
        if token.pos_ == "NOUN":
            nouns.append(token.lemma_.lower())
        elif token.pos_ == "ADJ":
            adjectives.append(token.lemma_.lower())

# Count frequencies: 30 most common lemmas per part of speech, as (lemma, count) pairs
noun_freq = Counter(nouns).most_common(30)
adj_freq = Counter(adjectives).most_common(30)

# Display results
print("📚 Top 30 Most Frequent Nouns:")
for word, freq in noun_freq:
    print(f" - {word}: {freq}")

print("\n🎨 Top 30 Most Frequent Adjectives:")
for word, freq in adj_freq:
    print(f" - {word}: {freq}")

📚 Top 30 Most Frequent Nouns:
 - an: 4261
 - 🤣: 3956
 - monde: 3541
 - oui: 3467
 - jour: 3378
 - droite: 2944
 - pays: 2819
 - temps: 2745
 - gauche: 2645
 - vie: 2547
 - fois: 2516
 - femme: 2488
 - année: 2438
 - droit: 2400
 - enfant: 2342
 - c: 2280
 - question: 2249
 - ️: 2241
 - chose: 2207
 - histoire: 2192
 - place: 2138
 - travail: 1957
 - problème: 1813
 - personne: 1799
 - homme: 1789
 - état: 1786
 - français: 1762
 - moment: 1732
 - guerre: 1708
 - cas: 1672

🎨 Top 30 Most Frequent Adjectives:
 - bon: 5702
 - grand: 3481
 - petit: 2743
 - français: 2463
 - nouveau: 2077
 - dernier: 2062
 - politique: 1992
 - public: 1919
 - extrême: 1905
 - gros: 1683
 - social: 1328
 - bel: 1274
 - vrai: 1268
 - européen: 1024
 - 😭: 1002
 - premier: 1001
 - meilleur: 998
 - fait: 975
 - national: 971
 - prochain: 960
 - sûr: 948
 - faux: 896
 - cher: 867
 - international: 829
 - 👉: 813
 - jeune: 765
 - grave: 728
 - important: 722
 - 🤣: 710
 - simple: 701


In [None]:
import pandas as pd
import re # Import the regular expressions library

# Define the file paths
file_path_c1 = '/content/drive/MyDrive/_thesis_dogwhistles/Compare-embeddings/corpora/c1-left.csv'
file_path_c2 = '/content/drive/MyDrive/_thesis_dogwhistles/Compare-embeddings/corpora/c2-right.csv'

# Runs of letters delimited by word boundaries, matched against lower-cased text.
# The class covers a-z, the accented Latin-1 letters (ß-ö, ø-ÿ) and the ligature œ.
# Why not simply À-ÿ: that range contains the symbols × and ÷ and misses œ, and
# because of the \b anchors a word containing œ ("cœur", "sœur", "œuvre") was
# not counted at all.
_WORD_PATTERN = re.compile(r'\b[a-zß-öø-ÿœ]+\b')


def count_clean_words(text):
    """Count the alphabetic words in ``text``.

    A word is a maximal run of French/English letters delimited by word
    boundaries. Numbers, emojis and standalone punctuation are not counted,
    and neither is a letter run glued to a digit or underscore.

    Args:
        text: The value to count words in. Anything that is not a ``str``
            (for example a missing value) counts as zero words.

    Returns:
        The number of words found, as an ``int``.
    """
    if not isinstance(text, str):
        return 0
    return len(_WORD_PATTERN.findall(text.lower()))


try:
    df_c1 = pd.read_csv(file_path_c1)
    df_c2 = pd.read_csv(file_path_c2)

    # --- Calculate clean word count for both corpora ---
    total_words_c1 = df_c1['content'].apply(count_clean_words).sum()
    total_words_c2 = df_c2['content'].apply(count_clean_words).sum()

    # --- Print the results ---
    # NOTE(review): the labels below say "neutral" / "ideological", but the
    # files are named c1-left / c2-right -- confirm the labels are intended.
    print("Clean Corpus Word Count Results 💡")
    print("-" * 35)
    print(f"Total words in c1 (neutral corpus): {total_words_c1}")
    print(f"Total words in c2 (ideological corpus): {total_words_c2}")
    print("-" * 35)

except FileNotFoundError as e:
    print(f"❌ Error: A file was not found at {e.filename}")
except KeyError:
    print("❌ Error: A column named 'content' was not found in one of the CSV files.")

Clean Corpus Word Count Results 💡
-----------------------------------
Total words in c1 (neutral corpus): 3343187
Total words in c2 (ideological corpus): 3426165
-----------------------------------
