In [1]:
import pandas as pd
import numpy as np
import os
import re

###  **Expanded MAGA-Related Keywords**
We should include:
- Direct identifiers:  
  `"maga"`, `"ultra maga"`, `"super maga"`, `"mega maga"`, `"maga nation"`
- Trump campaign references:  
  `"trump 2024"`, `"trump won"`, `"trump train"`, `"trump supporter"`
- Hashtags:  
  `#maga`, `#ultramaga`, `#trumpwon`, `#trump2024`, `#americafirst`, `#kag`, `#wwg1wga`, `#savetrump`
- Movement slogans:  
  `"save america"`, `"america first"`, `"stop the steal"`, `"patriot party"`

In [2]:
df = pd.read_csv("data/English_10.csv")

# Define MAGA-related keywords with word boundaries ===
maga_keywords = [
    r"\bmaga\b", r"\bultra\s?maga\b", r"\bmega\s?maga\b", r"\bsuper\s?maga\b",
    r"\bmaga\s?nation\b", r"\btrump\s?2024\b", r"\btrump\s?supporter\b",
    r"\btrump\s?train\b", r"\btrump\s?won\b", r"\bamerica\s?first\b",
    r"\bsave\s?america\b", r"\bstop\s?the\s?steal\b", r"\bpatriot\s?party\b",
    r"#maga\b", r"#ultramaga\b", r"#trumpwon\b", r"#trump2024\b",
    r"#americafirst\b", r"#kag\b", r"#wwg1wga\b", r"#savetrump\b"
]
pattern = '|'.join(maga_keywords)

possible_profile_fields = ['author.name', 'author.username', 'author.description']
profile_fields = [col for col in possible_profile_fields if col in df.columns]

df['combined_profile'] = df[profile_fields].astype(str).agg(' '.join, axis=1)

def clean_text(text):
    text = re.sub(r'[\n\r\t]+', ' ', text)
    return text.encode('ascii', errors='ignore').decode().strip()

df['combined_profile'] = df['combined_profile'].apply(clean_text)

def is_mostly_english(text):
    ascii_ratio = sum(c.isascii() for c in text) / len(text) if text else 0
    return ascii_ratio > 0.8

df_english = df[df['combined_profile'].apply(is_mostly_english)]

df_maga_profiles = df_english[df_english['combined_profile'].str.contains(pattern, case=False, na=False)]

# df_maga_profiles.to_csv("identified_maga_profiles_cleaned.csv", index=False)

  df = pd.read_csv("data/English_10.csv")


In [3]:
df_maga_profiles

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,lang,author_id,reply_settings,edit_history_tweet_ids,in_reply_to_user_id,created_at,referenced_tweets,text,...,attachments.poll_ids,attachments.poll.id,attachments.poll.options,entities.cashtags,in_reply_to_user.withheld.country_codes,in_reply_to_user.entities.description.cashtags,withheld.copyright,withheld.country_codes,author.withheld.country_codes,combined_profile
4,386686,156012,en,2473039477,everyone,['1499084102787547146'],,2022-03-02T18:07:56.000Z,"[{'type': 'retweeted', 'id': '1499081634376826...",RT @marcorubio: Many potential long term conse...,...,,,,,,,,,,"David DavidPBacon1 work hard, small government..."
101,196218,196218,en,49547787,everyone,['1498306803376934914'],,2022-02-28T14:39:13.000Z,"[{'type': 'retweeted', 'id': '1498054158376751...",RT @gordoncorera: This is not the Cuban Missil...,...,,,,,,,,,,"Skooter Mom SkooterMom LOVES GOD, FAMILY & AME..."
155,546887,10412,en,28273116,everyone,['1560057850218569730'],,2022-08-18T00:15:50.000Z,"[{'type': 'quoted', 'id': '1559936705884463106...",And they say Trump is the friend of Russia. B...,...,,,,,,,,,,"Joyreaper joyreaper Wife, Mother, Bible Teache..."
189,847387,153086,en,1223609616656949251,everyone,['1577515499222622210'],,2022-10-05T04:26:18.000Z,"[{'type': 'retweeted', 'id': '1577016989901783...",RT @M_Millerman: Elon musk: I'm trying to avoi...,...,,,,,,,,,,ULTRA MAGA BILL will63004265 Trump voter #Trum...
244,51111,51111,en,1067873015105490944,everyone,['1496876643406786564'],,2022-02-24T15:56:17.000Z,"[{'type': 'retweeted', 'id': '1496875735616827...",RT @JackPosobiec: Russia has gone full Russia\...,...,,,,,,,,,,Patrik Rick_42_ Retweet doesn't mean endorseme...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93853,493922,18067,en,752703965528219648,everyone,['1530089906852270080'],,2022-05-27T07:33:56.000Z,,newsmax: Washington and the world are holding ...,...,,,,,,,,,,Repent JesusLoveToo Repent ~ God is Good ~ Co...
93905,661204,89270,en,1025151908,everyone,['1574062902297784320'],,2022-09-25T15:46:55.000Z,"[{'type': 'retweeted', 'id': '1574053377922711...","RT @aaronjmate: In his Sept. 21 speech, Putin ...",...,,,,,,,,,,agg al xomaggiebattles america first god l...
93932,799223,104922,en,1460063607601995776,everyone,['1577078572786757633'],,2022-10-03T23:30:07.000Z,"[{'type': 'retweeted', 'id': '1577008613167886...",RT @The_Real_Fly: Putin is set to demonstrate ...,...,,,,,,,,,,Louis De Frontenac Louis_De_Buade Governor of ...
94292,703150,8849,en,206414832,everyone,['1579474980886433794'],,2022-10-10T14:12:35.000Z,"[{'type': 'retweeted', 'id': '1579466285569961...",RT @AHMalcolm: Joe Biden admits a nuclear #Arm...,...,,,,,,,,,,Top Blog Sites TopInfoBlogs I try to keep a T...


In [4]:
print(f"MAGA-identified profiles: {len(df_maga_profiles)}")
print(f"Percentage of MAGA profiles: {len(df_maga_profiles)/len(df)*100:.2f}%\n")

# Let's look at the distribution of MAGA keywords in the profiles
maga_keyword_counts = {}
for keyword in maga_keywords:
    count = df_maga_profiles['combined_profile'].str.contains(keyword, case=False, na=False).sum()
    if count > 0:
        maga_keyword_counts[keyword] = count

# Sort by frequency
sorted_keywords = sorted(maga_keyword_counts.items(), key=lambda x: x[1], reverse=True)
print("Top 10 MAGA identifiers in profiles:")
for keyword, count in sorted_keywords[:10]:
    print(f"- {keyword}: {count} profiles")

MAGA-identified profiles: 1077
Percentage of MAGA profiles: 1.14%

Top 10 MAGA identifiers in profiles:
- \bmaga\b: 841 profiles
- #maga\b: 281 profiles
- \bamerica\s?first\b: 130 profiles
- \bultra\s?maga\b: 92 profiles
- \btrump\s?2024\b: 75 profiles
- \btrump\s?supporter\b: 72 profiles
- #kag\b: 51 profiles
- #americafirst\b: 47 profiles
- \bsave\s?america\b: 46 profiles
- \btrump\s?won\b: 42 profiles
