In [None]:
!pip install pandas networkx nltk zstandard orjson

Collecting orjson
  Obtaining dependency information for orjson from https://files.pythonhosted.org/packages/5d/67/d7837cf0ac956e3c81c67dda3e8f2ffc60dd50ffc480ec7c17f2e22a36ae/orjson-3.9.10-cp311-none-win_amd64.whl.metadata
  Downloading orjson-3.9.10-cp311-none-win_amd64.whl.metadata (50 kB)
     ---------------------------------------- 0.0/50.5 kB ? eta -:--:--
     --------------- ---------------------- 20.5/50.5 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 50.5/50.5 kB 638.6 kB/s eta 0:00:00
Downloading orjson-3.9.10-cp311-none-win_amd64.whl (135 kB)
   ---------------------------------------- 0.0/135.0 kB ? eta -:--:--
   --------------------------- ------------ 92.2/135.0 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 135.0/135.0 kB 2.7 MB/s eta 0:00:00
Installing collected packages: orjson
Successfully installed orjson-3.9.10


<h1>Importing necessary libraries and downloading the ‘vader_lexicon’ package</h1>

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Fetch the 'vader_lexicon' resource used by the sentiment analyzer below.
# nltk.download reports the package as up to date when it is already present
# locally, so re-running this cell is cheap.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\feder\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

<h1> Reading and cleaning the data</h1>

In [None]:
start_year = 2007
end_year = 2022
chunksize = 10**6  # Rows parsed per chunk, so a whole yearly CSV is never parsed in one go

# Placeholder strings that mark a deleted or removed author/body in the exports
removed_markers = ['[deleted]', '[removed]']


def _read_filtered_csv(file_name, column_names, filter_columns, date_column, chunksize):
    """Read a CSV in chunks, drop placeholder rows and parse the date column.

    Args:
        file_name: Path of the CSV file to read.
        column_names: Column names assigned to the file's columns, in order.
        filter_columns: List of columns in which a '[deleted]' or '[removed]'
            value disqualifies the whole row.
        date_column: Column holding epoch seconds; converted to datetime.
        chunksize: Number of rows parsed per chunk.

    Returns:
        One DataFrame holding every surviving row. If the reader yields no
        chunks at all, an empty DataFrame carrying ``column_names``.
    """
    kept_chunks = []
    for chunk in pd.read_csv(file_name, names=column_names, chunksize=chunksize):
        # A row is dropped when any of the filter columns holds a placeholder
        is_placeholder = chunk[filter_columns].isin(removed_markers).any(axis=1)
        # .copy() detaches the surviving rows from the parsed chunk, so the
        # column assignment below cannot raise SettingWithCopyWarning
        chunk = chunk.loc[~is_placeholder].copy()
        chunk[date_column] = pd.to_datetime(chunk[date_column], unit='s')
        kept_chunks.append(chunk)

    if not kept_chunks:
        return pd.DataFrame(columns=column_names)
    # Concatenate once at the end: concatenating inside the loop copies all
    # previously accumulated rows again on every chunk
    return pd.concat(kept_chunks)


# For each year in the specified range
for year in range(start_year, end_year + 1):
    # Comments: rows with a placeholder author or body are discarded
    file_name = f"politics_comments.zst_{year}.csv"
    df_comments = _read_filtered_csv(
        file_name,
        column_names=["author","body","date","parent_id","id","score","author_fullname","ups"],
        filter_columns=['author', 'body'],
        date_column='date',
        chunksize=chunksize,
    )
    # Later cells look the frames up by name, so publish them as globals
    globals()[f"politics_comments_{year}"] = df_comments

    # Submissions: only the author column is checked for placeholders
    file_name_submissions = f"politics_submissions.zst_{year}.csv"
    df_submissions = _read_filtered_csv(
        file_name_submissions,
        column_names=["author","created_utc","id","num_comments","title","score","author_fullname","ups"],
        filter_columns=['author'],
        date_column='created_utc',
        chunksize=chunksize,
    )
    globals()[f"politics_submissions_{year}"] = df_submissions


<h1> Sentiment analysis and splitting the ID</h1>

In [None]:
# Build a single SentimentIntensityAnalyzer; get_sentiment_scores below reuses
# this module-level instance for every comment instead of creating a new one
sia = SentimentIntensityAnalyzer()

def get_sentiment_scores(comment):
    """Score one comment with the module-level analyzer ``sia``.

    Args:
        comment: Text to analyse.

    Returns:
        pd.Series built from the dict returned by ``sia.polarity_scores``,
        with one entry per score key.
    """
    return pd.Series(sia.polarity_scores(comment))

def split_id(fullname):
    """Split a fullname such as ``t1_abc123`` into a type label and the bare ID.

    Args:
        fullname: Identifier of the form ``<prefix>_<id>``.

    Returns:
        pd.Series ``[type, id]``. The type is 'comment' for a ``t1`` prefix,
        'post' for ``t3`` and 'unknown' for anything else. Only the first
        underscore separates prefix from ID, so an ID that itself contains an
        underscore is kept whole. A string without any underscore is returned
        unchanged as the ID with type 'unknown'. Non-string input (for example
        NaN from a missing value) yields ``['unknown', None]``.
    """
    # Missing values arrive as float NaN / None and have no .split or .partition
    if not isinstance(fullname, str):
        return pd.Series(['unknown', None], dtype=object)

    type_prefix, separator, id_ = fullname.partition('_')
    if not separator:
        # No prefix present: keep the value as the ID rather than raising
        return pd.Series(['unknown', fullname])

    # Map the type prefix to a more descriptive string
    type_ = {'t1': 'comment', 't3': 'post'}.get(type_prefix, 'unknown')
    return pd.Series([type_, id_])

start_year = 2007
end_year = 2022

# Analyzer score keys mapped to the column names used by the rest of the notebook.
# Renaming by key (not by position) cannot mislabel a column if key order changes.
sentiment_column_names = {
    'neg': 'negative',
    'neu': 'neutral',
    'pos': 'positive',
    'compound': 'sentiment',
}

# For each year in the range (inclusive of start and end), score the comments,
# split the parent fullname and build that year's connections dataframe
for year in range(start_year, end_year + 1):
    # Get the dataframes for the current year
    politics_comments = globals()[f'politics_comments_{year}']
    politics_submissions = globals()[f'politics_submissions_{year}']

    # Sentiment analysis with VADER. Missing bodies become '' so every value
    # handed to the analyzer is a string. This assignment writes into the frame
    # stored in globals().
    politics_comments['body'] = politics_comments['body'].fillna('').astype(str)
    sentiment_scores = politics_comments['body'].apply(get_sentiment_scores)
    sentiment_scores = sentiment_scores.rename(columns=sentiment_column_names)
    # pd.concat returns a new frame, so from here on `politics_comments` is a
    # local copy. NOTE(review): the sentiment and parent_type columns are never
    # written back to politics_comments_{year} -- confirm that is intended.
    politics_comments = pd.concat([politics_comments, sentiment_scores], axis=1)

    # Split fullname into the parent's type and its bare ID
    politics_comments[['parent_type', 'parent_id']] = politics_comments['parent_id'].apply(split_id)

    # Creation of the corresponding year's connections dataframe
    id_author_map_posts = politics_submissions[['id', 'author']].set_index('id').to_dict()['author']
    id_author_map_comments = politics_comments[['id', 'author']].set_index('id').to_dict()['author']
    connections = politics_comments[['author', 'id', 'parent_id', 'sentiment']].copy()

    # Resolve the parent's author with two vectorized lookups instead of a
    # row-wise apply: the post map is used where the parent is a post, the
    # comment map everywhere else. IDs absent from the chosen map become NaN
    # (the row-wise .get() gave None); both count as missing for notna().
    parent_ids = politics_comments['parent_id']
    author_if_post = parent_ids.map(id_author_map_posts)
    author_if_comment = parent_ids.map(id_author_map_comments)
    parent_is_post = politics_comments['parent_type'] == 'post'
    connections['parent_author'] = author_if_post.where(parent_is_post, author_if_comment)

    # Save the connections dataframe as a global variable
    globals()[f'connections_{year}'] = connections


<h1> Saving the dataframes</h1>

In [None]:
import pickle

def save_politics_dataframes(start_year, end_year):
    """Pickle the yearly submissions and comments frames into the working directory.

    For every year from ``start_year`` through ``end_year`` (inclusive) the
    globals ``politics_submissions_<year>`` and ``politics_comments_<year>``
    are looked up, in that order, and each is written to ``<name>.pickle``.

    Args:
        start_year: First year to save.
        end_year: Last year to save (inclusive).

    Raises:
        KeyError: If an expected global name does not exist.
    """
    namespace = globals()
    years = range(start_year, end_year + 1)
    df_names = [
        f'politics_{data_type}_{year}'
        for year in years
        for data_type in ('submissions', 'comments')
    ]
    for df_name in df_names:
        # Look the object up before opening the file, so a missing name does
        # not leave an empty pickle behind
        frame = namespace[df_name]
        with open(f'{df_name}.pickle', 'wb') as sink:
            pickle.dump(frame, sink)

def save_connections_dataframes(start_year, end_year):
    """Pickle the yearly connections frames into the working directory.

    For every year from ``start_year`` through ``end_year`` (inclusive) the
    global ``connections_<year>`` is looked up and written to
    ``connections_<year>.pickle``.

    Args:
        start_year: First year to save.
        end_year: Last year to save (inclusive).

    Raises:
        KeyError: If an expected global name does not exist.
    """
    namespace = globals()
    df_names = [f'connections_{year}' for year in range(start_year, end_year + 1)]
    for df_name in df_names:
        # Resolve the global first; the file is only created once it exists
        frame = namespace[df_name]
        with open(f'{df_name}.pickle', 'wb') as sink:
            pickle.dump(frame, sink)

# Write the yearly submissions/comments frames and the connections frames for
# 2007 through 2022 (both bounds inclusive) to pickle files
save_politics_dataframes(2007, 2022)
save_connections_dataframes(2007, 2022)


<h1>Processing the data and creating the graph</h1>

In [None]:
import pickle
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gc

def process_year(year):
    """Turn one year's connections pickle into a grouped edge table and a directed graph.

    Steps, in order:
      1. Load ``connections_<year>.pickle``.
      2. Keep only rows whose ``parent_author`` is present and differs from
         ``author``, then write the filtered frame back over the same file.
      3. Aggregate per (author, parent_author) pair into
         ``number_of_interactions`` and ``avg_sentiment``, add their product as
         ``interaction_sentiment_weighted`` and save the table to
         ``groupby_connections_<year>.pickle``.
      4. Build a ``nx.DiGraph`` from the grouped rows, carrying
         ``number_of_interactions`` and ``avg_sentiment`` as edge attributes,
         and save it to ``graph_<year>.pkl``.

    Args:
        year: Year used to build the input and output file names.

    Returns:
        None; all results are written to disk.
    """
    filename = f'connections_{year}.pickle'

    with open(filename, 'rb') as source:
        connections = pickle.load(source)

    # Drop replies with no known parent author and replies to oneself
    has_parent = connections['parent_author'].notna()
    replies_to_other = connections['author'] != connections['parent_author']
    connections = connections[has_parent & replies_to_other]

    # The input file is overwritten with the filtered rows
    with open(filename, 'wb') as sink:
        pickle.dump(connections, sink)

    # One row per (author, parent_author) pair
    pair_stats = (
        connections
        .groupby(['author', 'parent_author'])
        .agg(
            number_of_interactions=('author', 'count'),
            avg_sentiment=('sentiment', 'mean'),
        )
        .reset_index()
        .assign(
            interaction_sentiment_weighted=lambda stats: (
                stats['avg_sentiment'] * stats['number_of_interactions']
            )
        )
    )

    with open(f'groupby_connections_{year}.pickle', 'wb') as sink:
        pickle.dump(pair_stats, sink)

    # Directed edge author -> parent_author; the weighted column is not attached
    interaction_graph = nx.from_pandas_edgelist(
        pair_stats,
        source='author',
        target='parent_author',
        edge_attr=['number_of_interactions', 'avg_sentiment'],
        create_using=nx.DiGraph(),
    )

    with open(f'graph_{year}.pkl', 'wb') as sink:
        pickle.dump(interaction_graph, sink)

    # Release the large objects before the next year is processed
    del connections, pair_stats, interaction_graph
    gc.collect()

def process_range(start_year, end_year):
    """Run ``process_year`` for each year from ``start_year`` through ``end_year``.

    Args:
        start_year: First year to process.
        end_year: Last year to process (inclusive).
    """
    years = range(start_year, end_year + 1)
    for current_year in years:
        process_year(current_year)

# Process every year from 2007 through 2022 (both bounds inclusive); each year
# reads its connections pickle and writes the grouped table and graph files
process_range(2007, 2022)