# Dataset Processing
This notebook preprocesses the dataset: it merges the categorized CSV files into a single table and then removes duplicate comments, rows with missing values, comments that were deleted or removed (or whose author is hidden as `[deleted]`), and very short, spam-like comments.



In [None]:
import pandas as pd
from google.colab import drive
import glob
import os

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount("/content/drive", force_remount=True)

input_folder = '/content/drive/MyDrive/Computational Sciences/categorized_data'
output_file = '/content/drive/MyDrive/Computational Sciences/all_data.csv'

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the merged row order (and therefore which row a keep='first' de-duplication
# retains) is the same on every run.
all_csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is empty or the path is wrong.
if not all_csv_files:
    raise FileNotFoundError(f"No CSV files found in {input_folder!r}")

# Read every CSV file and stack them into one frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(f) for f in all_csv_files]
df = pd.concat(df_list, ignore_index=True)

# Persist the merged (still uncleaned) dataset, then preview its first rows.
df.to_csv(output_file, index=False)
display(df.head())

Mounted at /content/drive


Unnamed: 0,comment_id,created_utc,author,score,body,subreddit,link_id,permalink,government_mentioned,fire_mentioned,lafd_related,urban_planning_related,fabricated_fires_related,celebrity_related,ai_related,disinformation_related,antisemitic_related,weather_related,conspiracy_mentioned,humanitarian_aid_mentioned
0,m4rw30e,1735690000.0,flicman,-1,And what passive system keeps indoor humidity ...,AskLosAngeles,t3_1hqpfu8,https://www.reddit.com/r/AskLosAngeles/comment...,0,0,0,0,0,0,0,0,0,0,0,0
1,m4rw7e0,1735690000.0,msing,1,I know it's quite common agriculture products ...,AskLosAngeles,t3_1hqoib7,https://www.reddit.com/r/AskLosAngeles/comment...,0,0,0,0,0,0,0,0,0,0,0,0
2,m4rw9de,1735690000.0,Purple-Display-5233,4,Please tell me you have another source for all...,AskLosAngeles,t3_1hqqc71,https://www.reddit.com/r/AskLosAngeles/comment...,0,0,0,0,0,0,0,0,0,0,1,0
3,m4rwsma,1735690000.0,Jealous-Ad-2827,1,You’re right. Living in Southbay it’s unlikely...,AskLosAngeles,t3_1hq8ty3,https://www.reddit.com/r/AskLosAngeles/comment...,1,0,0,0,0,0,1,0,0,0,0,0
4,m4rwyk5,1735690000.0,jazzypakoma,2,I saw Lamar at Beyonces concert last year.,AskLosAngeles,t3_1hq8ty3,https://www.reddit.com/r/AskLosAngeles/comment...,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Report how many rows the merged dataset contains before any cleaning.
rows = df.shape[0]
columns = df.shape[1]
print(f"The dataset has {rows} rows")

The dataset has 4240757 rows


In [None]:
# Keep one row per comment_id; when an id repeats, the first occurrence wins
# (drop_duplicates' default keep='first').
df = df.drop_duplicates(subset='comment_id')
rows = df.shape[0]
columns = df.shape[1]
print(f"The dataset after removing duplicates has {rows} rows")

The dataset after removing duplicates has 4240757 rows


In [None]:
# Interpret created_utc as a count of seconds since the Unix epoch and turn it
# into pandas timestamps.
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

# Expose the converted column under the name 'datetime' and order the
# comments chronologically.
df = (
    df.rename(columns={'created_utc': 'datetime'})
      .sort_values('datetime')
)
display(df.head())

Unnamed: 0,comment_id,datetime,author,score,body,subreddit,link_id,permalink,government_mentioned,fire_mentioned,lafd_related,urban_planning_related,fabricated_fires_related,celebrity_related,ai_related,disinformation_related,antisemitic_related,weather_related,conspiracy_mentioned,humanitarian_aid_mentioned
3675162,lzsk65x,2024-12-01 00:00:00,Ikeelu,-11,"Not exactly, but it's not new either. Batterie...",news,t3_1h3p9ja,https://www.reddit.com/r/news/comments/1h3p9ja...,0,0,0,0,0,0,1,0,0,0,0,0
3248355,lzsk6lh,2024-12-01 00:00:04,[deleted],1,[removed],AskLosAngeles,t3_1h3jr6l,https://www.reddit.com/r/AskLosAngeles/comment...,0,0,0,0,0,0,0,0,0,0,0,0
3675163,lzsk6lq,2024-12-01 00:00:04,[deleted],1,[removed],news,t3_1h3av7i,https://www.reddit.com/r/news/comments/1h3av7i...,0,0,0,0,0,0,0,0,0,0,0,0
3675164,lzsk6pb,2024-12-01 00:00:05,[deleted],1,[removed],news,t3_1h3p9ja,https://www.reddit.com/r/news/comments/1h3p9ja...,0,0,0,0,0,0,0,0,0,0,0,0
3675165,lzsk77o,2024-12-01 00:00:10,[deleted],1,[removed],news,t3_1h3p9ja,https://www.reddit.com/r/news/comments/1h3p9ja...,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Inspect the dtype of every column in the current frame.
df.dtypes

Unnamed: 0,0
comment_id,object
datetime,datetime64[ns]
author,object
score,int64
body,object
subreddit,object
link_id,object
permalink,object
government_mentioned,int64
fire_mentioned,int64


In [None]:
# Columns that must be present for a comment row to be usable.
critical_columns = [
    'comment_id',
    'author',
    'body',
    'subreddit',
    'link_id',
    'permalink',
]

# Discard every row in which at least one of those columns is missing.
df = df.dropna(axis='index', how='any', subset=critical_columns)

print(f"The dataset after removing rows with missing values has {df.shape[0]} rows")

The dataset after removing rows with missing values has 4240737 rows


In [None]:
# Convert the category indicator columns to boolean dtype.
category_columns = ['government_mentioned', 'fire_mentioned', 'lafd_related',
                       'urban_planning_related', 'fabricated_fires_related',
                       'celebrity_related', 'ai_related', 'disinformation_related',
                       'antisemitic_related', 'weather_related',
                       'conspiracy_mentioned', 'humanitarian_aid_mentioned']

# Build a new frame with astype instead of assigning through
# `df.loc[:, cols] = ...`. A full-column .loc assignment first tries to write
# into the existing integer columns in place, so the dtype change only happens
# through pandas' silent upcasting on setitem, which is version-dependent and
# deprecated (FutureWarning in recent pandas releases).
# Missing flags are filled with 0 first because NaN would otherwise cast to True.
df = (
    df.fillna({column: 0 for column in category_columns})
      .astype({column: bool for column in category_columns})
)

In [None]:
# Inspect the dtypes again to check the result of the boolean conversion of
# the category columns.
df.dtypes

Unnamed: 0,0
comment_id,object
datetime,datetime64[ns]
author,object
score,int64
body,object
subreddit,object
link_id,object
permalink,object
government_mentioned,bool
fire_mentioned,bool


In [None]:
# Cleaning removed or deleted comments/authors.

# Rows whose author is the '[deleted]' placeholder.
author_is_deleted = df['author'] == '[deleted]'
# Rows whose body is one of the '[removed]' / '[deleted]' placeholders.
body_is_gone = df['body'].isin(['[removed]', '[deleted]'])

# A row survives only if neither condition holds.
df = df[~(author_is_deleted | body_is_gone)]
print(f"The dataset after cleaning remowed or deleted comments/authors has {len(df)} rows")

The dataset after cleaning remowed or deleted comments/authors has 3618524 rows


In [None]:
# Removing short comments (treated here as spam).

# Minimum number of characters, after trimming surrounding whitespace, that a
# comment body must have to be kept.
MIN_BODY_LENGTH = 5

# Vectorised string accessors replace the per-row Python lambda. Non-string
# values yield NaN from .str, and NaN >= n is False, so such rows are dropped
# just as the former isinstance() check dropped them.
body_length = df['body'].str.strip().str.len()
df = df[body_length >= MIN_BODY_LENGTH]

print(f"The dataset after cleaning spam has {len(df)} rows")

The dataset after cleaning spam has 3587384 rows


In [None]:
# Write the cleaned dataset to Drive as CSV, omitting the DataFrame index.
output_csv_path = '/content/drive/MyDrive/Computational Sciences/cleaned_all_data.csv'
df.to_csv(path_or_buf=output_csv_path, index=False)