# Install and Load

In [27]:
# Installing libraries
## matplotlib inline
## pip install emoji
## pip install bertopic
## pip install umap

In [1]:
import matplotlib as mpl

import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import scipy
import networkx as nx

import seaborn as sns
import pandas as pd

# For Cleaning and Topic Modeling
import spacy
import emoji
import re
nlp = spacy.load('en_core_web_sm')
from bertopic import BERTopic
# Initialize the umap model, mostly to set the seed
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=42)

%config InlineBackend.figure_format = 'svg'
plt.rcParams.update({'font.size': 10, 'font.style': 'normal', 'font.family':'serif'})

In [2]:
# Loading dataframe
data = pd.read_json("network_data.json")
# Convert columns to datetime format
data.loc[:, 'son_date'] = pd.to_datetime(data['son_date'])
data.loc[:, 'mother_date'] = pd.to_datetime(data['mother_date'])
# Open and read the JSON file containing communities as keys and authors composing them as values
with open("communities2023-10-07_2023-12-31.json", 'r') as file:
    communities = json.load(file)

In [3]:
for k in communities.keys():
    print(k)

0
5
2
3
4
1
6
7


# Functions to run

In [4]:
# Extracting the texts from the dataframe
def extract_texts(communities, df_tm, verbose=True):
    '''
    Function that extracts all texts from a list of authors grouped by communities.
    
    Parameters:
    communities (dict): Dictionary where keys are community identifiers and values are lists of authors.
    df_tm (DataFrame): DataFrame containing text data with 'author_son', 'author_mother', 'text_son', and 'text_mother' columns.
    
    Returns:
    dict: Dictionary where keys are community identifiers and values are lists of unique texts.
    '''
    
    # Initialize a dictionary to store texts for each community
    communities_texts = {}
    
    # Iterate over each community and its list of authors
    for community, authors in communities.items():
        # Initialize a set to store unique texts for the current community
        texts = set()
        
        # Iterate over DataFrame rows
        for ind, obs in df_tm.iterrows():
            # If the author is in the list of authors for the current community, add the text to the set
            if obs.author_son in authors:
                texts.add(obs.text_son)
            if obs.author_mother in authors:
                texts.add(obs.text_mother)
        
        # Convert the set of texts to a list and store it in the dictionary
        # Sorting for reproducibility
        communities_texts[community] = sorted(list(texts))

        if verbose:
            # Print the number of unique texts for the current community
            print(f"Community {community} has {len(communities_texts[community])} unique texts.")
    
    return communities_texts


In [5]:
# Cleaning the texts
def clean_texts(communities_texts):
    '''
    Function that cleans texts and groups them by community.
    
    Parameters:
    communities_texts (dict): Dictionary where keys are community identifiers and values are lists of texts.
    
    Returns:
    dict: Dictionary where keys are community identifiers and values are lists of cleaned texts.
    '''
    
    def clean(text):
        '''
        Function that cleans a given text by lemmatizing and removing unwanted characters.
        
        Parameters:
        text (str): The text to be cleaned.
        
        Returns:
        str: The cleaned text.
        '''
        # First step
        text = ' '.join([token.lemma_ 
                         for token in nlp(text) 
                         if token.pos_ in {'NOUN', 'ADJ', 'VERB', 'PROPN'}])
        
        # Second step
        text = text.lower()
        text = emoji.demojize(text)
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"@\w+", "", text)
        text = re.sub(r"[^a-z\s]", "", text)
        text = ' '.join([word for word in text.split() if len(word) > 2])
        return text
    
    # Initialize a dictionary to store cleaned texts for each community
    communities_cleaned = {}
    
    # Iterate over each community and its list of texts
    for community, texts in communities_texts.items():
        # Clean each text in the list
        cleaned_texts = [clean(text) for text in texts]
        
        # Store the cleaned texts in the dictionary
        communities_cleaned[community] = cleaned_texts
    
    return communities_cleaned

In [6]:
# Topic modeling for each community
def topics_to_communities(communities_cleaned, umap_model):
    '''
    Function that fits a BERTopic model to the cleaned texts for each community.
    
    Parameters:
    communities_cleaned (dict): Dictionary where keys are community identifiers and values are lists of cleaned texts.
    umap_model (umap.UMAP): A pre-trained UMAP model to be used with BERTopic.
    
    Returns:
    dict: Dictionary where keys are community identifiers and values are df from top5 BERTopic models fitted to the community texts.
    '''
    
    # Initialize a dictionary to store BERTopic models for each community
    communities_topics = {}
    
    # Iterate over each community and its list of cleaned texts
    for community, texts in communities_cleaned.items():
        # Create and fit a BERTopic model for the current community
        topic_model = BERTopic(verbose=False, umap_model=umap_model)
        topic_model.fit_transform(texts)
        
        # Store the fitted BERTopic model in the dictionary
        communities_topics[community] = topic_model.get_topic_info().loc[0:5, :]
    
    return communities_topics

# Generating the final dataset

In [7]:
def subset_and_apply(data, communities, date1, date2):
    '''
    Function that applies topic modeling to each communities, in a given time period

    Parameters:
    data (DataFrame): DataFrame containing extracted posts and comments from r/IsraelPalestinr
    communities: (dict): Dictionary where keys are community identifiers and values are lists of authors.
    date1 (string, format: YYYY-MM-DD): date from which the analysis should start
    date2 (string, format: YYYY-MM-DD): date from which the analysis should end
    
    Returns:
    dict: Dictionary where keys are community identifiers and values are DataFrames containing Topic Modeling results.
    '''

    # Convert to datetime object
    date1 = pd.to_datetime(date1)
    date2 = pd.to_datetime(date2)

    # Filter the DataFrame 
    df_tm = data[(data['son_date'] > date1) & (data['mother_date'] < date2) &
        (data['son_date'] > date1) & (data['mother_date'] < date2)]

    # Extract texts from Df
    communities_texts = extract_texts(communities, df_tm)

    # Cleaning texts
    communities_cleaned = clean_texts(communities_texts)

    # Applying topic modeling
    communities_topics = topics_to_communities(communities_cleaned, umap_model)

    return communities_topics

In [8]:
output = subset_and_apply(data, communities, '2023-10-07', '2023-12-31')

Community 0 has 11183 unique texts.
Community 5 has 8089 unique texts.
Community 2 has 5308 unique texts.
Community 3 has 9041 unique texts.
Community 4 has 8890 unique texts.
Community 1 has 10243 unique texts.
Community 6 has 6986 unique texts.
Community 7 has 6319 unique texts.


In [9]:
def export_to_json(dict_of_dfs, json_file_path):
    '''
    Function to export a dictionary containing DataFrames as values to a JSON file.
    
    Parameters:
    dict_of_dfs (dict): Dictionary where keys are identifiers and values are DataFrames.
    json_file_path (str): Path to the output JSON file.
    '''
    
    # Initialize a dictionary to hold JSON-compatible data
    dict_of_json = {}
    
    # Convert each DataFrame to a dictionary
    for key, df in dict_of_dfs.items():
        dict_of_json[key] = df.to_dict(orient='records')
    
    # Serialize the dictionary to a JSON file
    with open(json_file_path, 'w') as json_file:
        json.dump(dict_of_json, json_file, indent=4)
        

# Export the dictionary to a JSON file
export_to_json(output, 'topic_1007_to_1231.json')

In [10]:
len(output)

8

In [11]:
def import_from_json(json_file_path):
    '''
    Function to import a dictionary containing DataFrames from a JSON file.
    
    Parameters:
    json_file_path (str): Path to the input JSON file.
    
    Returns:
    dict: Dictionary where keys are identifiers and values are DataFrames.
    '''
    
    # Read the JSON file into a dictionary
    with open(json_file_path, 'r') as json_file:
        dict_of_json = json.load(json_file)
    
    # Convert each JSON-compatible structure back into a DataFrame
    dict_of_dfs = {key: pd.DataFrame(value) for key, value in dict_of_json.items()}
    
    return dict_of_dfs


topic_example1 = import_from_json('topic_1007_to_1231.json')

In [26]:
topic_example1["6"].iloc[0].Representation

['israel',
 'hamas',
 'have',
 'people',
 'palestinians',
 'say',
 'jews',
 'gaza',
 'war',
 'kill']