# Data Extraction

In [1]:
# Mount Google Drive at /content/drive; the cells below read and write files under this path.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install zstandard pandas



In [3]:
import zstandard as zstd
import json, sys
import pandas as pd

input_path = "/content/drive/MyDrive/Conservative_comments.zst"
output_csv = "/content/drive/MyDrive/conservative_sept_nov_2024_sample.csv"

start_ts = 1725148800    # 2024-09-01 00:00:00 UTC (inclusive lower bound)
end_ts = 1733011199      # 2024-11-30 23:59:59 UTC (inclusive upper bound)
max_records = 1000000    # stop once this many matching comments are collected
chunk_size = 2**24       # decompressed bytes requested per read (16 MiB)
progress_every = 50000   # print a progress line every time this many comments are kept

# Explicit column order so the frame has these columns even when nothing matches.
record_columns = ["author", "link_id", "created_utc", "body", "score"]


def _iter_lines(reader, read_size):
    """Yield every non-empty newline-delimited line from a binary stream.

    Data is pulled from ``reader`` with ``reader.read(read_size)`` until an
    empty result signals the end of the stream. A line that straddles two
    reads is reassembled before being yielded, and a final line that is not
    terminated by a newline is still yielded once the stream is exhausted.
    """
    pending = b''
    while True:
        chunk = reader.read(read_size)
        if not chunk:
            break
        # Everything before the last b'\n' is complete; the tail is carried over.
        *complete, pending = (pending + chunk).split(b'\n')
        for raw_line in complete:
            if raw_line:
                yield raw_line
    # The stream may end without a trailing newline; do not drop that last line.
    if pending:
        yield pending


records = []
skipped = 0  # lines that could not be parsed into a comment with a usable timestamp
with open(input_path, 'rb') as fh:
    # A large max_window_size is needed for archives compressed with a long window.
    dctx = zstd.ZstdDecompressor(max_window_size=2**31)
    stream_reader = dctx.stream_reader(fh)
    for line in _iter_lines(stream_reader, chunk_size):
        try:
            obj = json.loads(line)
            ts_raw = obj.get("created_utc", 0)
            # created_utc may be an int, a float or a numeric string; missing/empty -> 0.
            ts = int(float(ts_raw)) if ts_raw else 0
        except Exception:
            # Best effort: a malformed line is counted and skipped, not fatal.
            skipped += 1
            continue
        if not (start_ts <= ts <= end_ts):
            continue
        records.append({
            "author": obj.get("author"),
            "link_id": obj.get("link_id"),
            "created_utc": ts,
            "body": obj.get("body"),
            "score": obj.get("score")
        })
        # Report progress only when a record was actually added, so the same
        # count is never printed repeatedly while out-of-range lines go by.
        if len(records) % progress_every == 0:
            print(f"{len(records)} comments collected so far...", flush=True)
        if len(records) >= max_records:
            print(f"Reached {max_records} comments. Stopping early.", flush=True)
            break

if skipped:
    print(f"Skipped {skipped} unparseable lines.", flush=True)

df = pd.DataFrame(records, columns=record_columns)
df.to_csv(output_csv, index=False)
print(f"Saved {len(df)} comments to {output_csv}")


50000 comments collected so far...
100000 comments collected so far...
150000 comments collected so far...
200000 comments collected so far...
250000 comments collected so far...
300000 comments collected so far...
350000 comments collected so far...
400000 comments collected so far...
450000 comments collected so far...
500000 comments collected so far...
Saved 533155 comments to /content/drive/MyDrive/conservative_sept_nov_2024_sample.csv


In [4]:
# Preview the first rows of the extracted comments.
df.head()

Unnamed: 0,author,link_id,created_utc,body,score
0,[deleted],t3_1f5l0wc,1725148800,[removed],1
1,[deleted],t3_1f5ztzq,1725148829,[removed],1
2,TheVREnthusiast2,t3_1f5ztzq,1725148830,She is. Yeah. I mean what do you want me to sa...,-9
3,[deleted],t3_1f5pxv2,1725148837,[removed],1
4,[deleted],t3_1f5ztzq,1725148878,[removed],1


In [5]:
# (rows, columns) of the extracted frame.
df.shape

(533155, 5)

In [6]:
# A row is discarded when its body or author is missing, when the body is a
# moderation placeholder, or when the author is a deleted account / AutoModerator.
drop_mask = (
    df["body"].isna()
    | df["author"].isna()
    | df["body"].isin(["[removed]", "[deleted]"])
    | df["author"].isin(["[deleted]", "AutoModerator"])
)
df_clean = df.loc[~drop_mask]

print(f"Original rows: {len(df)}")
print(f"After cleaning: {len(df_clean)}")

Original rows: 533155
After cleaning: 219929


In [7]:
# Display the cleaned frame (the rendered output is truncated to the first and last rows).
df_clean

Unnamed: 0,author,link_id,created_utc,body,score
2,TheVREnthusiast2,t3_1f5ztzq,1725148830,She is. Yeah. I mean what do you want me to sa...,-9
5,Dr_Juice_,t3_1f5xyvv,1725148886,Remember when we all said that on Jan 7th?,5
8,AnonPlzzzzzz,t3_1f5ztzq,1725148911,No lies detected.,-9
10,intelligentreviews,t3_1f5xyvv,1725148926,Insurrection!,11
15,Evening_Flatworm5850,t3_1f5y6wv,1725149124,Drink up Nancy,5
...,...,...,...,...,...
533135,NashEast65,t3_1h3pbhk,1733010927,"Nah, she is Webb Hubble’s love child.",163
533139,Rare_Hydrogen,t3_1h3pbhk,1733010954,It was supposed to be her turn.,36
533140,Far-prophet,t3_1h3pryc,1733010954,![gif](giphy|TU0YWTjo2e208),918
533147,Ok-Instruction830,t3_1h3pbhk,1733011103,🤓 she won the popular vote 🤓,51


In [10]:
from google.colab import files

# Write the cleaned comments to Drive, then offer that same file as a browser download.
clean_csv_path = "/content/drive/MyDrive/conservative_sept_nov_2024_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)
files.download(clean_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>