In [None]:
import pandas as pd
import ast
import re

# ---- Step 1: Load raw dataset ----
# Read the four chunk files in order, then stack them into a single frame.
# ignore_index=True discards each chunk's own row labels and renumbers the
# combined rows from 0.
df, df2, df3, df4 = (
    pd.read_csv(chunk_path)
    for chunk_path in (
        "may_july_chunk_1.csv",
        "may_july_chunk_15.csv",
        "may_july_chunk_287.csv",
        "may_july_chunk_392.csv",
    )
)

df = pd.concat([df, df2, df3, df4], ignore_index=True)

# ---- Step 2: Extract user_id from 'user' column ----
def extract_user_id(user_str):
    """Return the ``id_str`` value from a stringified user record.

    ``user_str`` is expected to hold the text of a dict literal. Every
    ``datetime.datetime(...)`` call in it is swapped for the placeholder
    string ``'DATE'`` before parsing, because ``ast.literal_eval`` only
    accepts literals.

    Returns ``None`` when the input is not a string, cannot be parsed, does
    not evaluate to a dict, or has no ``"id_str"`` key.
    """
    if not isinstance(user_str, str):
        # Non-string cells cannot be run through the regex substitution.
        return None

    try:
        # The pattern stops at the first ')', so it relies on the datetime
        # call containing no nested parentheses.
        literal_only = re.sub(r"datetime\.datetime\([^)]*\)", "'DATE'", user_str)
        record = ast.literal_eval(literal_only)
        user_id = record.get("id_str", None)
    except Exception:
        # Best-effort extraction: any parse failure yields None.
        return None
    return user_id

# Run extract_user_id on every cell of the "user" column; cells it cannot
# parse end up as None in the new "user_id" column.
df["user_id"] = df["user"].apply(extract_user_id)

# ---- Step 3: Extract mentioned user IDs ----
def extract_mentions(mention_str):
    """Return the ``id_str`` of each mentioned user in a stringified list.

    ``mention_str`` is parsed with ``ast.literal_eval``. When the result is a
    list, one value is produced per dict element (``None`` if that dict has
    no ``"id_str"`` key); non-dict elements are skipped.

    Returns an empty list when parsing fails or the parsed value is not a
    list.
    """
    try:
        parsed = ast.literal_eval(mention_str)
    except Exception:
        # Best-effort extraction: unparseable cells count as "no mentions".
        return []

    if not isinstance(parsed, list):
        return []

    mentioned_ids = []
    for entry in parsed:
        if isinstance(entry, dict):
            mentioned_ids.append(entry.get("id_str", None))
    return mentioned_ids

# Each row of "mention_ids" becomes a list built by extract_mentions from the
# "mentionedUsers" cell (an empty list when that cell cannot be parsed).
df["mention_ids"] = df["mentionedUsers"].apply(extract_mentions)

# ---- Step 4: Convert epoch -> proper datetime ----
# unit="s" makes pandas read the "epoch" values as seconds.
df["created_at"] = pd.to_datetime(df["epoch"], unit="s")

# ---- Step 5: Flatten viewCount ----
def parse_viewcount(vc_str):
    """Return the view count held in a raw ``viewCount`` cell as an ``int``.

    Accepted inputs:

    * a string holding a dict literal with a ``"count"`` key, e.g.
      ``"{'count': '123', 'state': 'EnabledWithCount'}"``;
    * an actual dict with a ``"count"`` key;
    * a plain number, or a string holding one (``1234``, ``1234.0``,
      ``"1234"``). These previously fell into the blanket ``except`` and were
      silently turned into 0.

    Returns 0 when the value is missing (``None``/NaN), cannot be parsed, has
    no ``"count"`` key, or the count is not convertible to ``int``.
    """
    value = vc_str
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal at all: treat as "no count available".
            return 0

    if isinstance(value, dict):
        value = value.get("count", 0)
    elif isinstance(value, bool):
        # bool is an int subclass; a bare True/False is not a view count.
        return 0

    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinity, lists, non-numeric strings, ...
        return 0

# "viewCount" is treated as optional: only when the column exists is each
# cell replaced by the integer parse_viewcount extracts from it.
if "viewCount" in df.columns:
    df["viewCount"] = df["viewCount"].apply(parse_viewcount)

# ---- Step 6: Select useful columns ----
# Build the list of columns to carry into the cleaned dataset. "viewCount" is
# appended only when the raw data actually has that column.
keep_cols = ["user_id", "text", "hashtags", "mention_ids"]
keep_cols += ["in_reply_to_user_id_str", "created_at"]
keep_cols += ["likeCount", "retweetCount", "replyCount"]
keep_cols += ["viewCount"] if "viewCount" in df.columns else []

# Selecting with a list raises KeyError if any listed column is missing.
filtered_df = df[keep_cols]

# ---- Step 7: Save cleaned dataset ----
# index=False leaves the DataFrame's row index out of the CSV file.
# NOTE(review): "mention_ids" holds Python lists, which a CSV can only store
# in string form -- presumably downstream readers parse them back; confirm.
filtered_df.to_csv("filtered_may_july.csv", index=False)

# ---- Step 8: Preview ----
# Show the first 10 rows, then a confirmation message.
print(filtered_df.head(10))
print("\n‚úÖ Final dataset saved as 'filtered_may_july.csv' and is graph-ready!")


               user_id                                               text  \
0   942869257108455424  @lukepbeasley I cant imagine anyone actually f...   
1  1461100431329796100  Voters can also sway me away from voting  for ...   
2  1655734665955737600  @PoodleHead57 @BobOnderMO Can you name that am...   
3  1771777682587713536  @Morning_Joe @JoeNBC The fact remains that Joe...   
4            874708668  @BidenHQ That's funny you're obviously trying ...   
5           1029117138  #Internacional \n\nLas modificaciones introduc...   
6            402161614  MAGA RAGES Over Hunter Biden Verdict [What!? ....   
7  1667014094627520512  @harryjsisson There is no doubt in my mind tha...   
8  1754711157875552256  @ShaunPMaca @MoneyTalkUS @byHeatherLong Hourly...   
9  1267130998178770944    @nepolabo_maga „Çä„ÇÜ„Åï„Çì‚ô™\n„Åä„ÅØ„ÅÇ„Çä„ÄúÔºÅ\‚Å†(‚Å†‡πë‚Å†‚ïπ‚Å†‚ó°‚Å†‚ïπ‚Å†‡πë‚Å†)‚Å†Ôæâ‚Å†‚ô¨   

                                            hashtags  \
0                                         

In [None]:
# Look at the raw "user" column to see actual structure.
# Printing a plain list (rather than the Series) shows the first five cells
# as unabbreviated Python values.
print(df["user"].head(5).tolist())


["{'id': 942869257108455424, 'id_str': '942869257108455424', 'url': 'https://twitter.com/B. G. Dalena', 'username': 'B. G. Dalena', 'rawDescription': 'Therapist. Philosophy, Epistemology, Politics, Neuroscience, Golf, Art, Sports, and a relentless pursuit of the truth.', 'created': datetime.datetime(2017, 12, 18, 21, 28, 43, tzinfo=datetime.timezone.utc), 'followersCount': 152, 'friendsCount': 246, 'statusesCount': 390, 'favouritesCount': 19, 'listedCount': 0, 'mediaCount': 1, 'location': 'San Diego, CA', 'profileImageUrl': 'https://pbs.twimg.com/profile_images/1613647617594130432/h3jM9GzR_normal.jpg', 'profileBannerUrl': 'PW', 'protected': 'PW', 'verified': False, 'blue': False, 'blueType': None, 'descriptionLinks': ['PW'], '_type': 'PW'}", "{'id': 1461100431329796100, 'id_str': '1461100431329796100', 'url': 'https://twitter.com/Brandon Holloway', 'username': 'Brandon Holloway', 'rawDescription': 'conservative.  principled. ideology  over any man or Politician.  2A advocate.   I suppo