In [1]:
!pip install pandas networkx



In [86]:
import pandas as pd
import networkx as nx
import re

In [54]:
# Load the raw edge-sentiment file. header=None keeps pandas' default integer
# column labels; the following cell addresses the columns by position (0, 1, 2).
sentiments_raw = pd.read_csv('edge_sentiments.txt', header=None)

In [76]:
def preprocess_sentiment(sentiment):
    """Turn a raw sentiment field into a numeric label.

    The first ``len("Sentiment: ")`` characters are dropped and the remainder
    is stripped of surrounding whitespace.

    Returns:
        1 when the remaining text is exactly "non-burst", otherwise -1.
    """
    prefix_length = len("Sentiment: ")
    label = sentiment[prefix_length:].strip()
    if label == "non-burst":
        return 1
    return -1

# Build the cleaned edge table in a single constructor call instead of
# allocating an empty frame and filling it column-by-column through positional
# assignment. Each raw text field has its leading label sliced off by length
# and is then stripped; the sentiment label is encoded by preprocess_sentiment.
sentiments = pd.DataFrame({
    "source_post": sentiments_raw.iloc[:, 0].str[len("From Post: "):].str.strip(),
    "dest_post": sentiments_raw.iloc[:, 1].str[len("To Post: "):].str.strip(),
    "sentiment": sentiments_raw.iloc[:, 2].apply(preprocess_sentiment),
})
print(sentiments.head())
# Create a directed graph with sentiment as an edge attribute
posts_graph = nx.from_pandas_edgelist(sentiments, 'source_post', 'dest_post', create_using=nx.DiGraph, edge_attr='sentiment')

# # Check if the edge "1u4nrp" -> "1u4lo2" exists in the graph
# src = "1u4nrp"
# dst = "1u4lo2"
# if posts_graph.has_edge(src, dst):
#     print(f"The edge '{src}' -> '{dst}' exists with sentiment: {posts_graph[src][dst]['sentiment']}")
# else:
#     print(f"The edge '{src}' -> '{dst}' does not exist in the graph.")

  source_post dest_post  sentiment
0      1u4nrp    1u4lo2          1
1      1u4qkd    1u4muc          1
2      1u4qlz    1u4pss          1
3      1u4sjv    1u4mjo          1
4      1u4w5s    1u54ij          1


In [134]:
def create_dataframe_from_file(file_name):
    """Parse a label file into a (source_post, dest_post, sentiment) DataFrame.

    Every line has the characters ``"``, ``'``, ``,``, ``(`` and ``)`` removed
    and is then split on runs of whitespace (tabs included), so repeated or
    trailing separators do not create empty fields. Lines that contain no
    tokens at all (for example a trailing blank line) are skipped rather than
    turned into an empty row.

    Args:
        file_name: Path of the text file to read.

    Returns:
        DataFrame with string columns ``source_post`` and ``dest_post`` and a
        ``sentiment`` column where 'non-burst' is mapped to 1 and 'burst' to 0;
        any other label becomes NaN.

    Raises:
        ValueError: raised by pandas when a line yields more than three tokens.
    """
    # Translation table that deletes the tuple/quote punctuation in one pass.
    strip_punctuation = str.maketrans('', '', '"\',()')

    rows = []
    with open(file_name, 'r') as file:
        # Stream the file line by line instead of materialising readlines().
        for line in file:
            tokens = line.translate(strip_punctuation).split()
            if tokens:
                rows.append(tokens)

    df = pd.DataFrame(rows, columns=['source_post', 'dest_post', 'sentiment'])

    # Convert sentiment to integer.
    # NOTE(review): preprocess_sentiment in this notebook encodes every label
    # other than "non-burst" as -1, while this mapping uses 0 for 'burst' —
    # confirm which encoding downstream code expects.
    df['sentiment'] = df['sentiment'].map({'non-burst': 1, 'burst': 0})

    return df

In [136]:
# Rebind sentiments_raw to the table parsed from label_info.tsv; the name was
# bound to the edge_sentiments.txt frame in an earlier cell of this notebook.
sentiments_raw = create_dataframe_from_file('label_info.tsv')
print(sentiments_raw.head())
# Spot-check one row selected by integer position.
print(sentiments_raw.iloc[155934])

  source_post dest_post  sentiment
0      2vjbm2    2m13kx          1
1      3zagiy    2uf8t2          1
2      5yoid6    4i76wl          1
3      39rq01    3b41ta          1
4      4jp1x6    4jlea9          1
source_post    4hlaja
dest_post      4h44gb
sentiment           1
Name: 155934, dtype: object


In [137]:
# Build the directed post graph from the label_info.tsv table, carrying the
# 'sentiment' column as an edge attribute. This rebinds posts_graph, replacing
# the graph constructed from the edge_sentiments.txt data in an earlier cell.
posts_graph = nx.from_pandas_edgelist(sentiments_raw, 'source_post', 'dest_post', create_using=nx.DiGraph, edge_attr='sentiment')

In [30]:
def create_graph_from_file(file_name):
    """Load a tab-separated edge list and return it as a directed graph.

    The first four rows of the file are skipped and the remaining rows are
    read without a header. The frame's columns are then labelled
    ``source_subreddit`` and ``dest_subreddit`` and used as edge endpoints.

    Args:
        file_name: Path of the edge-list file.

    Returns:
        A ``networkx.DiGraph`` with one edge per distinct (source, dest) pair.
    """
    edge_table = pd.read_csv(file_name, delimiter='\t', header=None, skiprows=4)

    # Label the two positional columns so they can be referenced by name.
    edge_table.columns = ["source_subreddit", "dest_subreddit"]

    return nx.from_pandas_edgelist(
        edge_table,
        source='source_subreddit',
        target='dest_subreddit',
        create_using=nx.DiGraph,
    )

In [39]:
# One subreddit-level graph per year, keyed by the year, each loaded from its
# pruned_graph_<year>.txt file.
yearly_graphs = {}
for year in range(2014, 2018):
    yearly_graphs[year] = create_graph_from_file(f"pruned_graph_{year}.txt")

In [40]:
# Update edge attributes in the graph based on 'sentiments'.
#
# The frame is indexed once as (source, dest) -> sentiment so that each graph
# edge costs a dictionary lookup instead of a boolean scan over every row of
# 'sentiments'. setdefault keeps the first row seen for a repeated pair, which
# matches selecting the first matching row.
#
# NOTE(review): neither 'graph' nor a 'sentiments' frame with 'source'/'dest'
# columns is created by any cell visible in this notebook — confirm which
# objects this cell is expected to run against.
sentiment_by_edge = {}
for src, dst, value in zip(sentiments['source'], sentiments['dest'], sentiments['sentiment']):
    sentiment_by_edge.setdefault((src, dst), value)

for source, dest in graph.edges:
    if (source, dest) not in sentiment_by_edge:
        # Edge doesn't exist in the DataFrame: fail loudly rather than leave
        # the graph partially annotated without notice.
        raise ValueError(f"Edge ({source}, {dest}) doesn't exist in the 'sentiments' DataFrame!")

    # Edge exists, update the 'sentiment' attribute
    graph[source][dest]['sentiment'] = sentiment_by_edge[(source, dest)]

# Now 'graph' has updated 'sentiment' attributes for existing edges in 'sentiments'

ValueError: Edge (1212297454, 1026390623) doesn't exist in the 'sentiments' DataFrame!

In [51]:
def read_data(file_path):
    """Read the space-separated, header-less edge file into a DataFrame.

    Ten column names are assigned by position, and the columns at positions
    3 and 7 (``source_post_date`` and ``dest_post_date``) are parsed as dates.

    Args:
        file_path: Path of the file to read.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    column_names = [
        "source_subreddit",
        "dest_subreddit",
        "source_post_id",
        "source_post_date",
        "source_post_time",
        "user",
        "dest_post_id",
        "dest_post_date",
        "dest_post_time",
        "sentiment",
    ]
    return pd.read_csv(
        file_path,
        sep=' ',
        header=None,
        names=column_names,
        parse_dates=[3, 7],
    )

def create_graph(df):
    """Build a directed multigraph with one edge per row of ``df``.

    Each row contributes an edge from its ``source_subreddit`` to its
    ``dest_subreddit``; the row's ``source_post_id`` and ``dest_post_id`` are
    stored as attributes of that edge. Rows sharing the same endpoints become
    parallel edges.

    Args:
        df: DataFrame providing the four columns named above.

    Returns:
        The populated ``networkx.MultiDiGraph``.
    """
    multigraph = nx.MultiDiGraph()
    for _, record in df.iterrows():
        post_ids = {
            'source_post_id': record['source_post_id'],
            'dest_post_id': record['dest_post_id'],
        }
        multigraph.add_edge(record['source_subreddit'], record['dest_subreddit'], **post_ids)
    return multigraph

def process_and_save_graphs(file_path):
    """Split the edge file by source-post year, building and saving each slice.

    The file is loaded with ``read_data``. For every distinct year found in
    ``source_post_date`` (in order of first appearance) the matching rows are
    turned into a graph with ``create_graph`` and also written to
    ``subreddit_graph_<year>.csv`` without the index.

    Args:
        file_path: Path handed to ``read_data``.

    Returns:
        Dict mapping each year to the graph built from that year's rows.
    """
    edges = read_data(file_path)
    source_years = edges['source_post_date'].dt.year

    graphs_by_year = {}
    for year in source_years.unique():
        # Rows whose source post was made in this year.
        year_df = edges[source_years == year]

        graphs_by_year[year] = create_graph(year_df)

        # Persist the same slice alongside the in-memory graph.
        year_df.to_csv(f'subreddit_graph_{year}.csv', index=False)

    return graphs_by_year

def print_edge_attributes(graph, source_subreddit, dest_subreddit):
    """Print the post ids stored on every edge from source to destination.

    Only the parallel edges that actually run from ``source_subreddit`` to
    ``dest_subreddit`` are reported. (Passing the destination as the second
    positional argument of ``out_edges`` does not filter by destination; that
    argument selects edge data, so every out-edge of the source would be
    listed under the requested pair's label.)

    Args:
        graph: Multigraph supporting ``has_edge(u, v)`` and ``graph[u][v]``
            as a mapping of edge key -> attribute dict, where each attribute
            dict holds ``source_post_id`` and ``dest_post_id``.
        source_subreddit: Tail node of the edges to print.
        dest_subreddit: Head node of the edges to print.

    Returns:
        None. Output goes to stdout.
    """
    # Guard clause: nothing to list when the pair is not connected.
    if not graph.has_edge(source_subreddit, dest_subreddit):
        print(f"No edge found between {source_subreddit} and {dest_subreddit}")
        return

    # graph[u][v] holds exactly the parallel edges u -> v, keyed by edge key.
    for attributes in graph[source_subreddit][dest_subreddit].values():
        source_post_id = attributes['source_post_id']
        dest_post_id = attributes['dest_post_id']

        print(f"Edge: {source_subreddit} -> {dest_subreddit}")
        print(f"  source_post_id: {source_post_id}")
        print(f"  dest_post_id: {dest_post_id}")
        print("----------------------")

In [150]:
def add_sentiment_column(file_path, save_path, graph=None):
    """Append a ``sentiment`` column looked up from a post-level graph.

    The space-separated, header-less file at ``file_path`` is read with nine
    positional column names (columns 3 and 7 parsed as dates). For each row
    the edge ``source_post_id -> dest_post_id`` is looked up in ``graph`` and
    its ``'sentiment'`` attribute collected. The rows plus the new column are
    written to ``save_path`` space-separated, with a header row and no index.

    Args:
        file_path: Path of the input file without a sentiment column.
        save_path: Path of the file to write.
        graph: Object supporting ``has_edge(u, v)`` and
            ``graph[u][v]['sentiment']``. When omitted, the notebook-level
            ``posts_graph`` is used, so existing two-argument calls behave
            as before.

    Raises:
        Exception: if a row's edge is absent from the graph. The lookup stops
            at the first such row and nothing is written.
    """
    if graph is None:
        # Fall back to the notebook-level graph instead of requiring it.
        graph = posts_graph

    data_without_sentiment = pd.read_csv(file_path, parse_dates=[3, 7], sep=' ', header=None,
                           names=["source_subreddit", "dest_subreddit", "source_post_id",
                                  "source_post_date", "source_post_time", "user",
                                  "dest_post_id", "dest_post_date", "dest_post_time"])

    sentiments = []  # One looked-up sentiment per input row, in row order

    # Only the two id columns are needed, so iterate them directly rather than
    # materialising a Series for every row.
    id_pairs = zip(data_without_sentiment["source_post_id"],
                   data_without_sentiment["dest_post_id"])
    for source_post_id, dest_post_id in id_pairs:
        if not graph.has_edge(source_post_id, dest_post_id):
            raise Exception(f"No edge found between {source_post_id} and {dest_post_id}")
        sentiments.append(graph[source_post_id][dest_post_id]['sentiment'])

    # Add the 'sentiment' column to the DataFrame
    data_without_sentiment['sentiment'] = sentiments

    # Save the DataFrame as a space-separated file (same separator as the input)
    data_without_sentiment.to_csv(save_path, index=False, sep=' ')

In [151]:
# Input file without the sentiment column; also reused by a later cell.
data_file = "formatted_data_file.txt"
# NOTE(review): the output name is spelled "sentimemnt" — confirm whether any
# downstream reader depends on this exact file name before correcting it.
add_sentiment_column(data_file, "formatted_data_file_with_sentimemnt.txt")

In [None]:
# Build one graph per source-post year and write each year's rows to CSV.
# NOTE(review): data_file is the input that add_sentiment_column reads with
# nine column names, while read_data assigns ten names including 'sentiment' —
# confirm whether the file with the sentiment column was intended here.
graphs_by_year = process_and_save_graphs(data_file)

In [78]:
# print_edge_attributes(graphs_by_year[2014], "leagueoflegends", "teamredditteams")