-----------

# READING AND CLEANING DATA FROM ACADEMIC TORRENTS

### IMPORTS AND FUNCTIONS

In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# UNCOMMENT IF THE networkx package is not installed (not necessary for main analysis)
# !pip install networkx
import networkx as nx

In [56]:
def clean_reddit_data(df):
    """
    Input: raw dataframe from academic torrents
    Output: clean dataframe, with no missing.

    """

    # GETTING POST ID
    df["post_id"] = df["URL"].apply(lambda x: "".join(re.findall("(?<=comments\/)\w+(?=\/)",x))).copy()
    df.URL = df.URL.apply(lambda x: "".join(re.findall(r"r\/\w+",x)))

    #REMOVE MISSING VALUES AND TAKE ONLY DATA FROM OUR SUBREDDITS
    list_subreddit = ["r/Judaism","r/IsraelPalestine","r/Israel","r/Palestine","r/Jewish"]
    df = df[
            (df["URL"].isin(list_subreddit))
            & (df["text"] != "[removed]")
            & (df["text"] != "[deleted]") 
            & (~df["text"].isna())
            ].drop_duplicates()
    # REMOVE POSTS WHICH CONTAIN ONLY IMAGES OR URLS
    df = df[(df.text.str.contains(" ")) & (~df.text.str.contains("https://"))]

    # RENAME COLUMNS
    df = df.rename(columns= {"URL": "subreddit"})

    return df.sort_values("date").reset_index(drop = True)

In [57]:
def clean_reddit_data_rem_del(df):
    """
    Input: raw dataframe from academic torrents
    Output: clean dataframe which contains also removed and deleted posts for visualization purposes.

    """

    # GETTING POST ID
    df["post_id"] = df["URL"].apply(lambda x: "".join(re.findall("(?<=comments\/)\w+(?=\/)",x))).copy()
    df.URL = df.URL.apply(lambda x: "".join(re.findall(r"r\/\w+",x)))

    #REMOVE MISSING VALUES AND TAKE ONLY DATA FROM OUR SUBREDDITS
    list_subreddit = ["r/Judaism","r/IsraelPalestine","r/Israel","r/Palestine","r/Jewish"]
    df = df[
            (df["URL"].isin(list_subreddit))
            ].drop_duplicates()
    # REMOVE POSTS WHICH CONTAIN ONLY IMAGES OR URLS
    df.text = df.text.fillna("NaN")
    # RENAME COLUMNS
    df = df.rename(columns= {"URL": "subreddit"})

    return df.sort_values("date").reset_index(drop = True)

### READING IN THE DATA AND CLEANING

In [58]:
list_dataframes_23 = [clean_reddit_data(pd.read_csv(f"Reddit0{x}23.csv")) if x < 10 else clean_reddit_data(pd.read_csv(f"Reddit{x}23.csv")) for x in range(7,13)]
list_dataframes_24 = [clean_reddit_data(pd.read_csv(f"Reddit0{x}24.csv")) for x in range(1,5)]
list_dataframes = list_dataframes_23 + list_dataframes_24

In [59]:
final = pd.concat(list_dataframes, axis = 0, ignore_index = True).reset_index(drop = True)

In [159]:
#TO SAVE
final.to_csv("RedditSubmissionsClean.csv")

## VISUALIZATION

### Removed and Deleted

In [61]:
# READ THE DATASETS AGAIN WITHOUT REMOVING DELETED AND REMOVED SUBMISSIONS IN ORDER TO VISUALIZE
list_dataframes_23_2 = [clean_reddit_data_rem_del(pd.read_csv(f"Reddit0{x}23.csv")) if x < 10 else clean_reddit_data_rem_del(pd.read_csv(f"Reddit{x}23.csv")) for x in range(7,13)]
list_dataframes_24_2 = [clean_reddit_data_rem_del(pd.read_csv(f"Reddit0{x}24.csv")) for x in range(1,5)]
list_dataframes_2 = list_dataframes_23_2 + list_dataframes_24_2

final_2 = pd.concat(list_dataframes_2, axis = 0, ignore_index = True).reset_index(drop = True)
subset = ["r/Israel","r/IsraelPalestine","r/Palestine"]
removed = pd.DataFrame(final_2[((final_2.text == "[removed]") | (final_2.text == "[deleted]")) & (final_2.subreddit.isin(subset))].groupby(["subreddit","text"])["post_id"].count()).rename(columns = {"post_id":"Number of Deleted and Removed Posts"})
removed.to_latex("table2.tex")
removed

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Deleted and Removed Posts
subreddit,text,Unnamed: 2_level_1
r/Israel,[deleted],52
r/Israel,[removed],4044
r/IsraelPalestine,[deleted],22
r/IsraelPalestine,[removed],9273
r/Palestine,[deleted],15
r/Palestine,[removed],8773


### Total posts

In [5]:
final.groupby("subreddit")["text"].apply("count").reset_index().rename(columns = {"text" : "Number of Total Posts"})

Unnamed: 0,subreddit,Number of Total Posts
0,r/Israel,5960
1,r/IsraelPalestine,6667
2,r/Jewish,4666
3,r/Judaism,7307
4,r/Palestine,1412
