In [None]:
# load libraries
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

## Comments

In [None]:
# put the path to the input file, or a folder of files to process all of
input_folder = r"/Volumes/Untitled/reddit/subreddits23"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_folder = r"/Volumes/Untitled/reddit/subreddits23_csv"
# the format to output in, pick from the following options
#   zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
#   txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
#   csv: a comma separated value file. Can be opened by a text editor or excel
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
output_format = "csv"
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
# any field that's in the dump is supported, but useful ones are
#   author: the username of the author
#   id: the id of the submission or comment
#   link_id: only for comments, the fullname of the submission the comment is associated with
#   parent_id: only for comments, the fullname of the parent of the comment. Either another comment or the submission if it's top level
single_field = None
# the fields in the file are different depending on whether it has comments or submissions. If we're writing a csv, we need to know which fields to write.
# set this to true to write out to the log every time there's a bad line, set to false if you're expecting only some of the lines to match the key
write_bad_lines = True

# only output items between these two dates
from_date = datetime.strptime("2023-01-01", "%Y-%m-%d")
to_date = datetime.strptime("2023-12-31", "%Y-%m-%d")

field = "body"
values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
# if this list is very large, it could greatly slow down the process
values_file = None
exact_match = False

# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)


def write_line_zst(handle, line):
	handle.write(line.encode('utf-8'))
	handle.write("\n".encode('utf-8'))


def write_line_json(handle, obj):
	handle.write(json.dumps(obj))
	handle.write("\n")


def write_line_single(handle, obj, field):
	if field in obj:
		handle.write(obj[field])
	else:
		log.info(f"{field} not in object {obj['id']}")
	handle.write("\n")


def write_line_csv(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
        output_list.append(obj.get('num_comments', 0))  # Include num_comments for submissions
    else:
        output_list.append(obj.get('is_submitter', False))  # Include is_submitter for comments
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)


def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()


def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match):
	output_path = f"{output_file}.{output_format}"
	is_submission = "submission" in input_file
	log.info(f"Input: {input_file} : Output: {output_path} : Is submission {is_submission}")
	writer = None
	if output_format == "zst":
		handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
	elif output_format == "txt":
		handle = open(output_path, 'w', encoding='UTF-8')
	elif output_format == "csv":
		handle = open(output_path, 'w', encoding='UTF-8', newline='')
		writer = csv.writer(handle)
	else:
		log.error(f"Unsupported output format {output_format}")
		sys.exit()

	file_size = os.stat(input_file).st_size
	created = None
	matched_lines = 0
	bad_lines = 0
	total_lines = 0
	for line, file_bytes_processed in read_lines_zst(input_file):
		total_lines += 1
		if total_lines % 100000 == 0:
			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))

			if created < from_date:
				continue
			if created > to_date:
				continue

			if field is not None:
				field_value = obj[field].lower()
				matched = False
				for value in values:
					if exact_match:
						if value == field_value:
							matched = True
							break
					else:
						if value in field_value:
							matched = True
							break
				if not matched:
					continue

			matched_lines += 1
			if output_format == "zst":
				write_line_zst(handle, line)
			elif output_format == "csv":
				write_line_csv(writer, obj, is_submission)
			elif output_format == "txt":
				if single_field is not None:
					write_line_single(handle, obj, single_field)
				else:
					write_line_json(handle, obj)
			else:
				log.info(f"Something went wrong, invalid output format {output_format}")
		except (KeyError, json.JSONDecodeError) as err:
			bad_lines += 1
			if write_bad_lines:
				if isinstance(err, KeyError):
					log.warning(f"Key {field} is not in the object: {err}")
				elif isinstance(err, json.JSONDecodeError):
					log.warning(f"Line decoding failed: {err}")
				log.warning(line)

	handle.close()
	log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}")


if __name__ == "__main__":
	if single_field is not None:
		log.info("Single field output mode, changing output file format to txt")
		output_format = "txt"

	if values_file is not None:
		values = []
		with open(values_file, 'r') as values_handle:
			for value in values_handle:
				values.append(value.strip().lower())
		log.info(f"Loaded {len(values)} from values file {values_file}")
	else:
		values = [value.lower() for value in values]  # convert to lowercase

	log.info(f"Filtering field: {field}")
	if len(values) <= 20:
		log.info(f"On values: {','.join(values)}")
	else:
		log.info(f"On values:")
		for value in values:
			log.info(value)
	log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}.")
	log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}")
	log.info(f"Output format set to {output_format}")

for filename in os.listdir(input_folder):
    try:
        if not filename.startswith("._") and filename.endswith(".zst") and "comments" in filename:  # Process only zst files
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0])  # Use output_folder here
            process_file(input_file_path, output_file_path,output_format, field, values, from_date, to_date, single_field, exact_match)
        else:
            continue  # Skip files that are not comments files
    except Exception as e:
        log.error(f"Error processing file {filename}: {e}")
        continue  # Move on to the next file

## Submissions

In [None]:
# put the path to the input file, or a folder of files to process all of
input_folder = r"/Volumes/Untitled/reddit/subreddits23"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_folder = r"/Volumes/Untitled/reddit/subreddits23_csv"
# the format to output in, pick from the following options
#   zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
#   txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
#   csv: a comma separated value file. Can be opened by a text editor or excel
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
output_format = "csv"
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
# any field that's in the dump is supported, but useful ones are
#   author: the username of the author
#   id: the id of the submission or comment
#   link_id: only for comments, the fullname of the submission the comment is associated with
#   parent_id: only for comments, the fullname of the parent of the comment. Either another comment or the submission if it's top level
single_field = None
# the fields in the file are different depending on whether it has comments or submissions. If we're writing a csv, we need to know which fields to write.
# set this to true to write out to the log every time there's a bad line, set to false if you're expecting only some of the lines to match the key
write_bad_lines = True

# only output items between these two dates
from_date = datetime.strptime("2023-01-01", "%Y-%m-%d")
to_date = datetime.strptime("2023-12-31", "%Y-%m-%d")

field = "selftext"
values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
# if this list is very large, it could greatly slow down the process
values_file = None
exact_match = False

# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)


def write_line_zst(handle, line):
	handle.write(line.encode('utf-8'))
	handle.write("\n".encode('utf-8'))


def write_line_json(handle, obj):
	handle.write(json.dumps(obj))
	handle.write("\n")


def write_line_single(handle, obj, field):
	if field in obj:
		handle.write(obj[field])
	else:
		log.info(f"{field} not in object {obj['id']}")
	handle.write("\n")


def write_line_csv(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
        output_list.append(obj.get('num_comments', 0))  # Include num_comments for submissions
    else:
        output_list.append(obj.get('is_submitter', False))  # Include is_submitter for comments
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)


def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()


def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match):
	output_path = f"{output_file}.{output_format}"
	is_submission = "submission" in input_file
	log.info(f"Input: {input_file} : Output: {output_path} : Is submission {is_submission}")
	writer = None
	if output_format == "zst":
		handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
	elif output_format == "txt":
		handle = open(output_path, 'w', encoding='UTF-8')
	elif output_format == "csv":
		handle = open(output_path, 'w', encoding='UTF-8', newline='')
		writer = csv.writer(handle)
	else:
		log.error(f"Unsupported output format {output_format}")
		sys.exit()

	file_size = os.stat(input_file).st_size
	created = None
	matched_lines = 0
	bad_lines = 0
	total_lines = 0
	for line, file_bytes_processed in read_lines_zst(input_file):
		total_lines += 1
		if total_lines % 100000 == 0:
			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))

			if created < from_date:
				continue
			if created > to_date:
				continue

			if field is not None:
				field_value = obj[field].lower()
				matched = False
				for value in values:
					if exact_match:
						if value == field_value:
							matched = True
							break
					else:
						if value in field_value:
							matched = True
							break
				if not matched:
					continue

			matched_lines += 1
			if output_format == "zst":
				write_line_zst(handle, line)
			elif output_format == "csv":
				write_line_csv(writer, obj, is_submission)
			elif output_format == "txt":
				if single_field is not None:
					write_line_single(handle, obj, single_field)
				else:
					write_line_json(handle, obj)
			else:
				log.info(f"Something went wrong, invalid output format {output_format}")
		except (KeyError, json.JSONDecodeError) as err:
			bad_lines += 1
			if write_bad_lines:
				if isinstance(err, KeyError):
					log.warning(f"Key {field} is not in the object: {err}")
				elif isinstance(err, json.JSONDecodeError):
					log.warning(f"Line decoding failed: {err}")
				log.warning(line)

	handle.close()
	log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}")


if __name__ == "__main__":
	if single_field is not None:
		log.info("Single field output mode, changing output file format to txt")
		output_format = "txt"

	if values_file is not None:
		values = []
		with open(values_file, 'r') as values_handle:
			for value in values_handle:
				values.append(value.strip().lower())
		log.info(f"Loaded {len(values)} from values file {values_file}")
	else:
		values = [value.lower() for value in values]  # convert to lowercase

	log.info(f"Filtering field: {field}")
	if len(values) <= 20:
		log.info(f"On values: {','.join(values)}")
	else:
		log.info(f"On values:")
		for value in values:
			log.info(value)
	log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}.")
	log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}")
	log.info(f"Output format set to {output_format}")

for filename in os.listdir(input_folder):
    try:
        if not filename.startswith("._") and filename.endswith(".zst") and "submissions" in filename:  # Process only zst files
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0])  # Use output_folder here
            process_file(input_file_path, output_file_path,output_format, field, values, from_date, to_date, single_field, exact_match)
        else:
            continue  # Skip files that are not comments files
    except Exception as e:
        log.error(f"Error processing file {filename}: {e}")
        continue  # Move on to the next file

In [43]:
# find zero byte files in subreddits23_csv and save as list 
import os
import csv

path = r"/Volumes/Untitled/reddit/subreddits23_csv"
zero_byte_files = []
for filename in os.listdir(path):
    if os.path.getsize(os.path.join(path, filename)) == 0:
        zero_byte_files.append(filename)
zero_byte_files

# remove ".csv" from the end of each file name
zero_byte_files = [file[:-4] for file in zero_byte_files]



In [44]:
zero_byte_files

['badukpolitics_comments',
 'bbcnewsuk_comments',
 'BritishNationalism_comments',
 'CCTVCamerasUK_comments',
 'DailyMail_comments',
 'fakeIDUK_comments',
 'thebritishelites_comments',
 'Ukhempflowers_comments',
 'ukpolitics_comments',
 'UKTVLAND_comments',
 'UKNewsByABot_submissions',
 'uk_news_today_submissions',
 'badukpolitics_submissions',
 'bbcnewsuk_submissions',
 'BritishNationalism_submissions',
 'CCTVCamerasUK_submissions',
 'DailyMail_submissions',
 'fakeIDUK_submissions',
 'thebritishelites_submissions',
 'Ukhempflowers_submissions',
 'UKTVLAND_submissions']

In [45]:
len(zero_byte_files)

21

For these files, I need to go through the terminal to decompress them

In [46]:
# printing commands to copy paste into terminal 
for file in zero_byte_files:
    print(f"zstd -d /Volumes/Untitled/reddit/subreddits23/{file}.zst -o {file}.txt")

zstd -d /Volumes/Untitled/reddit/subreddits23/badukpolitics_comments.zst -o badukpolitics_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/bbcnewsuk_comments.zst -o bbcnewsuk_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/BritishNationalism_comments.zst -o BritishNationalism_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/CCTVCamerasUK_comments.zst -o CCTVCamerasUK_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/DailyMail_comments.zst -o DailyMail_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/fakeIDUK_comments.zst -o fakeIDUK_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/thebritishelites_comments.zst -o thebritishelites_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/Ukhempflowers_comments.zst -o Ukhempflowers_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/ukpolitics_comments.zst -o ukpolitics_comments.txt
zstd -d /Volumes/Untitled/reddit/subreddits23/UKTVLAND_comments.zst -o UKTVLAND_comments.

In [47]:
# printing commands to copy paste into terminal and move files to zerobyte_files folder
for file in zero_byte_files:
    print(f"mv ~/{file}.txt /Volumes/Untitled/reddit/zerobyte_files/")


mv ~/badukpolitics_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/bbcnewsuk_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/BritishNationalism_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/CCTVCamerasUK_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/DailyMail_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/fakeIDUK_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/thebritishelites_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/Ukhempflowers_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/ukpolitics_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/UKTVLAND_comments.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/UKNewsByABot_submissions.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/uk_news_today_submissions.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/badukpolitics_submissions.txt /Volumes/Untitled/reddit/zerobyte_files/
mv ~/bbcnewsuk_submissions.txt /Volumes/Untitled/reddit/

In [None]:
### Convert zero byte files to CSV
import json
import csv
import os
from datetime import datetime

# Define the directory containing the .txt files and the output directory
input_directory = "/Volumes/Untitled/reddit/zerobyte_files/"
output_directory = "/Volumes/Untitled/reddit/zerobyte_files_csv/"

# Define the fields to include in the CSV
csv_fields = [
    "score", 
    "created_utc", 
    "author", 
    "permalink", 
    "body", 
    "is_submitter", 
    "num_comments", 
    "title", 
    "url"
]

def parse_json_line(line):
    try:
        obj = json.loads(line)
        created = datetime.utcfromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d %H:%M:%S")
        
        csv_row = {
            "score": obj.get("score", ""),
            "created_utc": created,
            "author": f"u/{obj.get('author', '')}",
            "permalink": f"https://www.reddit.com{obj.get('permalink', '')}",
            "body": obj.get("body", ""),
            "is_submitter": obj.get("is_submitter", ""),
            "num_comments": obj.get("num_comments", ""),
            "title": obj.get("title", ""),
            "url": obj.get("url", "")
        }
        
        return csv_row
    except json.JSONDecodeError:
        return None

def read_and_convert(input_file, output_file):
    with open(input_file, 'r', encoding='latin-1') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=csv_fields)
        writer.writeheader()
        
        for line in infile:
            csv_row = parse_json_line(line.strip())
            if csv_row:
                writer.writerow(csv_row)

# Process all .txt files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith(".txt"):
        input_file_path = os.path.join(input_directory, filename)
        output_file_name = filename.replace(".txt", ".csv")
        output_file_path = os.path.join(output_directory, output_file_name)
        
        print(f"Processing {input_file_path} to {output_file_path}")
        read_and_convert(input_file_path, output_file_path)

print("All files processed.")


In [None]:
import os
import pandas as pd

input_path = r"/Volumes/Untitled/reddit/subreddits23_csv"
output_path = r"/Volumes/Untitled/reddit/cleaned_subreddits"

for filename in os.listdir(input_path):
    file_path = os.path.join(input_path, filename)
    
    # skip processing if the file is zero bytes (separate cleaning process for these files)
    if os.path.getsize(file_path) == 0:
        continue

    if "comments" in filename:
        column_names = ['score', 'date', 'is_submitter', 'user', 'link', 'body']
    else:
        # column names for submissions
        column_names = ['score', 'date', 'title', 'num_comments', 'user', 'link', 'body']
    
    # read csv file with specified column names
    df = pd.read_csv(file_path, names=column_names, header=None, encoding='ISO-8859-1')
    
    # check if all columns are present
    if not all(col in df.columns for col in column_names):
        print(f"Skipping {filename} due to missing columns")
        continue

    if "comments" in filename:
        # add a column for is_comment
        df['is_comment'] = 1
    else:
        # add a column for is_comment
        df['is_comment'] = 0
    
    # remove rows where 'user' is "u/[deleted]"
    df = df[~df['user'].isin(["u/[deleted]"])]
    
    # remove rows where 'body' is "[deleted]", "[removed]", or empty
    df = df[~df['body'].isin(["[deleted]", "[removed]", ""])]
    
    # remove rows where 'body' has only one word
    df = df[df['body'].apply(lambda x: len(str(x).split()) > 1)]
    
    # change 'date' column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    
    # add a column for subreddit
    df['subreddit'] = filename.split("_")[0]
    
    # save cleaned file to cleaned_subreddits folder
    cleaned_file_path = os.path.join(output_path, filename)
    df.to_csv(cleaned_file_path, index=False)
    print(f"Saved {filename} to {cleaned_file_path}")

In [48]:
import os
import pandas as pd

## now to handle files in zero_bytes folder
## they have 0kb in subreddit23_csv folder and were processed separately and moved to zero_bytes folder
input_path = r"/Volumes/Untitled/reddit/zerobyte_files"
output_path = r"/Volumes/Untitled/reddit/cleaned_subreddits"
csv_fields = [
    "score", 
    "created_utc", 
    "author", 
    "permalink", 
    "body", 
    "is_submitter", 
    "num_comments", 
    "title", 
    "url"
]
for filename in os.listdir(input_path):
    file_path = os.path.join(input_path, filename)
    df = pd.read_csv(file_path, names= csv_fields, encoding='utf-8')
    if "comments" in filename:
        # drop uneccessary columns
        df.drop(columns=["is_submitter", "num_comments", "title", "url"], inplace=True)
        df.columns = ['score', 'date', 'user', 'link', 'body']
    else:
        # drop uneccessary columns
        df.drop(columns=["body", "is_submitter"], inplace=True)
        # column names for submissions
        df.columns = ['score', 'date', 'user', 'link', 'num_comments', 'title', 'body']
    
    # add a column for is_comment
    if "comments" in filename:
        # add a column for is_comment
        df['is_comment'] = 1
    else:
        # add a column for is_comment
        df['is_comment'] = 0
    
    # add a column for subreddit
    df['subreddit'] = filename.split("_")[0]
    
    # remove rows where 'user' is "u/[deleted]"
    df = df[~df['user'].isin(["u/[deleted]"])]

    # remove rows where 'body' is "[deleted]", "[removed]", or empty
    df = df[~df['body'].isin(["[deleted]", "[removed]", ""])]

    # remove rows where 'body' has only one word
    df = df[df['body'].apply(lambda x: len(str(x).split()) > 1)]

    # change 'date' column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # save cleaned file to cleaned_subreddits folder
    cleaned_file_path = os.path.join(output_path, filename)
    df.to_csv(cleaned_file_path, index=False)
    
    print(f"Saved {filename} to {cleaned_file_path}")
    

Saved thebritishelites_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits/thebritishelites_comments.csv
Saved ._thebritishelites_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits/._thebritishelites_comments.csv
Saved thebritishelites_submissions.csv to /Volumes/Untitled/reddit/cleaned_subreddits/thebritishelites_submissions.csv
Saved ._thebritishelites_submissions.csv to /Volumes/Untitled/reddit/cleaned_subreddits/._thebritishelites_submissions.csv


## Filtering comments about immigration
- had to generate exhaustive list of keywords using GPT4
- had to respecify column names based on whether file was a comment or submission
- specified body as a string type and moved to first index position (because body is in different positions depending on csv so to make it consistent when subsetting based on column value)
- filtered based on keywords and saved  

In [60]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.")

input_path = r"/Volumes/Untitled/reddit/cleaned_subreddits1"
output_path = r"/Volumes/Untitled/reddit/immigration_subreddits"

# Define immigration-related keywords, including terms about "stop the boat" and people coming via boats
immigration_keywords = (
    r'\bimmigrat\w*\b|'
    r'\bRwanda\s(Bill|Policy)\b|'
    r'\b(asylum|refugee|asylum-seeker|refugees|asylum-seekers)\b|'
    r'\bvisa\w*\b|'
    r'\b(undocumented|illegal)\simmigrant\w*\b|'
    r'\b(deportation|detain\w*|detention)\b|'
    r'\b(border\scontrol|immigration\spolicy|migration\spolicy)\b|'
    r'\bpoints-based\ssystem\b|'
    r'\b(skilled\sworker\svisa|student\svisa)\b|'
    r'\b(overstay\w*|overstayer\w*)\b|'
    r'\b(work\svisa|family\svisa|spouse\svisa)\b|'
    r'\b(settlement|permanent\sresidence|PR\sstatus)\b|'
    r'\b(hostile\senvironment)\b|'
    r'\b(integration|assimilation|multiculturalism)\b|'
    r'\b(naturalization|citizenship)\b|'
    r'\b(migrant\w*|expat\w*)\b|'
    r'\b(foreigner\w*|foreign\sworker\w*)\b|'
    r'\b(home\soffice)\b|'
    r'\b(Windrush)\b|'
    r'\b(human\srights|amnesty|appeal)\b|'
    r'\b(brexit\simmigration)\b|'
    r'\b(Ukraine\srefugee\w*|Syrian\srefugee\w*|Afghan\srefugee\w*|Palestinian\srefugee\w*|Iranian\srefugee\w*|Sudanese\srefugee\w*)\b|'
    r'\b(immigration\scourt|tribunal)\b|'
    r'\b(sponsor\w*\svisa)\b|'
    r'\b(protection\sclaim)\b|'
    r'\b(temporary\sprotection|humanitarian\sprotection)\b|'
    r'\b(resettlement\sscheme|community\ssponsorship)\b|'
    r'\b(legal\simmigration|illegal\simmigration)\b|'
    r'\b(immigration\scontrol)\b|'
    r'\b(stop\sthe\sboats?)\b|'
    r'\b(boat\smigrants?)\b|'
    r'\b(small\sboats?)\b|'
    r'\b(channel\scrossings?)\b|'
    r'\b(illegal\sboat\smigration)\b|'
    r'\b(intercepting\sboats?)\b|'
    r'\b(migrant\sboats?)\b|'
    r'\b(people\ssmugglers?)\b'
)


for filename in os.listdir(input_path):
    file_path = os.path.join(input_path, filename)
    
    # Read the CSV file
    df = pd.read_csv(file_path, encoding= 'utf-8', lineterminator='\n')

    if "comments" in filename:
        column_names = ['score', 'date', 'is_submitter', 'user', 'link', 'body', 'is_comment', 'subreddit']
    else:
        # column names for submissions
        column_names = ['score', 'date', 'title', 'num_comments', 'user', 'link', 'body', 'is_comment', 'subreddit']

    # Set column names, reorder body to first column
    df = df.reindex(columns=column_names)
    df = df[['body'] + [col for col in df.columns if col != 'body']]

    # Ensure 'body' is a string type (considering potential issues)
    if not pd.api.types.is_string_dtype(df['body']):
        try:
            # Attempt conversion to string, handling potential errors
            df['body'] = df['body'].astype(str)
        except (ValueError, TypeError):  # Catch specific errors for robustness
            # Handle non-convertible values (e.g., log a message or fill with NaNs)
            print(f"Warning: Encountered non-string values in 'body' column for {filename}.")

    # Filter based on immigration keywords
    df_filtered = df[df['body'].str.contains(immigration_keywords, case=False, na=False)]

    # Print the current file being processed
    print(f"Processing file: {filename}")

    # Save
    output_file_path = os.path.join(output_path, filename)
    df_filtered.to_csv(output_file_path, index=False)
    print(f"Saved {filename} to {output_file_path}")


Processing file: leicester_comments.csv
Saved leicester_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/leicester_comments.csv
Processing file: ._leicester_comments.csv
Saved ._leicester_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/._leicester_comments.csv
Processing file: ukeducation_comments.csv
Saved ukeducation_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/ukeducation_comments.csv
Processing file: ._ukeducation_comments.csv
Saved ._ukeducation_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/._ukeducation_comments.csv
Processing file: medicalschooluk_comments.csv
Saved medicalschooluk_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/medicalschooluk_comments.csv
Processing file: ._medicalschooluk_comments.csv
Saved ._medicalschooluk_comments.csv to /Volumes/Untitled/reddit/immigration_subreddits/._medicalschooluk_comments.csv
Processing file: apprenticeuk_comments.csv
Saved apprenticeuk_comments.csv t

In [55]:
import os
import shutil

input_path = r"/Volumes/Untitled/reddit/cleaned_subreddits"
immigration_subreddits_path = r"/Volumes/Untitled/reddit/immigration_subreddits"
output_path = r"/Volumes/Untitled/reddit/cleaned_subreddits1"

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

for filename in os.listdir(input_path):
    if filename not in os.listdir(immigration_subreddits_path):
        source_file_path = os.path.join(input_path, filename)
        destination_file_path = os.path.join(output_path, filename)
        
        # Copy the file to the output directory
        shutil.copy(source_file_path, destination_file_path)
        print(f"Copied {filename} to {output_path}")

Copied leicester_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._leicester_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ukeducation_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._ukeducation_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied medicalschooluk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._medicalschooluk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied apprenticeuk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._apprenticeuk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied AskABrit_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._AskABrit_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied baduk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied ._baduk_comments.csv to /Volumes/Untitled/reddit/cleaned_subreddits1
Copied bbuk_comments.csv to /Volumes/Unt

In [61]:
## merge all files in immigration_subreddits folder
import os
import pandas as pd

input_path = r"/Volumes/Untitled/reddit/immigration_subreddits"
output_path = r"/Volumes/Untitled/reddit"

# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()

# Iterate over the files in the input directory

for filename in os.listdir(input_path):
    file_path = os.path.join(input_path, filename)
    
    # Read the CSV file
    df = pd.read_csv(file_path, encoding='utf-8', lineterminator='\n')
    
    # Append the data to the merged DataFrame
    merged_df = pd.concat([merged_df, df])

  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd.concat([merged_df, df])
  merged_df = pd

In [64]:
# Save the merged data to a CSV file
merged_df.to_csv(r"/Volumes/Untitled/reddit/all_reddit.csv", index=False)
