In [10]:
import json
import os
import pickle
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [21]:
def build_graph_from_data(video_users,backup_folder):
    """Build the unsigned video graph from shared commenters and pickle it.

    Every key of ``video_users`` becomes a node. Two videos are linked when
    they share at least one user; the edge ``weight`` is the Jaccard
    similarity of their user sets, ``|A & B| / |A | B|``.

    Parameters
    ----------
    video_users : dict
        Maps a video key to the ``set`` of users that commented on it.
    backup_folder : str
        Directory that receives ``unsigned_graph.pickle``. It is created if
        it does not exist yet.

    Returns
    -------
    networkx.Graph
        The weighted undirected graph, which is also written to disk.
    """
    # Make sure the destination exists *before* the quadratic pair loop, so a
    # missing folder cannot throw away the finished graph at the very end.
    os.makedirs(backup_folder, exist_ok=True)

    G = nx.Graph()
    G.add_nodes_from(video_users.keys())
    keys = list(video_users.keys())
    for i, video in enumerate(tqdm(keys)):
        users = video_users[video]
        if not users:
            # An empty set overlaps with nothing; skipping it also avoids a
            # 0/0 division when the other set is empty as well.
            continue
        for j in range(i + 1, len(keys)):
            other_video = keys[j]
            other_users = video_users[other_video]
            shared = len(users.intersection(other_users))
            if shared == 0:
                continue
            # |A | B| = |A| + |B| - |A & B|, so the union never has to be built.
            weight = shared / (len(users) + len(other_users) - shared)
            G.add_edge(video, other_video, weight=weight)

    with open(backup_folder+'/unsigned_graph.pickle', 'wb') as file:
        pickle.dump(G, file)
    return G

def create_unsigned_graph(backup_folder='/Graph',comment_file='./Data/complete_comments.jsonl',
                          metadata_file='Data/video_metadata.jsonl',category_cutoff=30):
    """Read the comment dump, index videos and users, and build the unsigned graph.

    Comments are streamed in chunks, joined with the video metadata on
    ``videoId`` and filtered by ``category``. Each remaining video and user
    gets a consecutive integer index in order of first appearance, and for
    every video the set of commenting user indices is collected. The three
    lookup tables are pickled into ``backup_folder`` as ``user_index``,
    ``video_index`` and ``video_users`` before the graph itself is built.

    Parameters
    ----------
    backup_folder : str
        Output directory, created if missing.
        NOTE(review): the default ``'/Graph'`` is an absolute path at the
        filesystem root, while the notebook cells pass the relative
        ``'Graph'`` - confirm which one is intended.
    comment_file : str
        JSON-lines file with at least ``videoId`` and ``userId`` per row.
    metadata_file : str
        JSON-lines file with ``videoId`` and ``category`` per video.
    category_cutoff : int
        Rows whose ``category`` is greater than or equal to this value are
        skipped. The default keeps the previously hard-coded threshold.
        NOTE(review): the reason for 30 is not documented here - confirm.

    Returns
    -------
    networkx.Graph
        The graph returned by ``build_graph_from_data``.
    """
    # Fail (or create the folder) before the long streaming pass, not after it.
    os.makedirs(backup_folder, exist_ok=True)

    video_users = {}
    user_int_mapping = {}
    video_int_mapping = {}

    videos = pd.read_json(metadata_file,lines=True)[['videoId','category']]

    for chunk in pd.read_json(comment_file,encoding='utf-8',chunksize=10**6,lines=True):
        # Inner join: comments of videos without metadata are dropped here.
        chunk = pd.merge(chunk,videos,on='videoId')
        for line in chunk.to_dict(orient='records'):
            if int(line['category']) >= category_cutoff:
                continue

            # Register a video the first time one of its kept comments is seen.
            # Keying on the id (instead of watching for a change of video)
            # keeps the last video of the file, never registers a video whose
            # comments were all filtered out, and tolerates comments of one
            # video that are not stored contiguously.
            video_id = line['videoId']
            video_index = video_int_mapping.get(video_id)
            if video_index is None:
                video_index = len(video_int_mapping)
                video_int_mapping[video_id] = video_index
                video_users[video_index] = set()

            user_index = user_int_mapping.setdefault(line['userId'], len(user_int_mapping))
            video_users[video_index].add(user_index)

    with open(backup_folder+'/user_index','wb') as file:
        pickle.dump(user_int_mapping,file)
    with open(backup_folder+'/video_index','wb') as file:
        pickle.dump(video_int_mapping,file)
    with open(backup_folder+'/video_users','wb') as file:
        pickle.dump(video_users,file)

    print('finished reading file')

    return build_graph_from_data(video_users,backup_folder)

In [17]:
def build_graph_from_sentiment_data(video_users,user_int_mapping,video_int_mapping,backup_folder='Graph/signed'):
    """Build the signed video graph from per-user comment polarities and pickle it.

    Every key of ``video_users`` becomes a node. For two videos with at least
    one common user, each common user contributes the product of their mean
    polarity on the first video and their mean polarity on the second; the
    edge ``weight`` is the mean of those products. Pairs without common users,
    and pairs whose weight is exactly 0, get no edge.

    Parameters
    ----------
    video_users : dict
        Maps a video key to a dict ``{user: [polarity, ...]}``.
    user_int_mapping, video_int_mapping : dict
        Not used by this function; kept so existing calls keep working.
    backup_folder : str
        Directory that receives ``signed_graph.pickle``; created if missing.
        The default reproduces the previously hard-coded location.

    Returns
    -------
    networkx.Graph
        The weighted undirected graph, which is also written to disk.
    """
    # Make sure the destination exists before the quadratic pair loop.
    os.makedirs(backup_folder, exist_ok=True)

    # Average each user's polarity once per video rather than once per video
    # pair. Users with no recorded score are left out, so they can never
    # inject a NaN mean into an edge weight.
    mean_polarity = {
        video: {user: np.mean(scores) for user, scores in users.items() if len(scores) > 0}
        for video, users in video_users.items()
    }

    G = nx.Graph()
    G.add_nodes_from(video_users.keys())
    keys = list(video_users.keys())
    # tqdm wraps the list (not the enumerate object) so it knows the total
    # and can actually draw a progress bar.
    for i, video in enumerate(tqdm(keys)):
        polarity = mean_polarity[video]
        for j in range(i + 1, len(keys)):
            other_video = keys[j]
            other_polarity = mean_polarity[other_video]
            shared_users = polarity.keys() & other_polarity.keys()
            if not shared_users:
                # np.mean([]) is NaN and NaN != 0 is True, so without this
                # guard every unrelated pair would get a NaN-weighted edge.
                continue
            weight = np.mean([polarity[user] * other_polarity[user] for user in shared_users])
            if weight != 0:
                G.add_edge(video, other_video, weight=weight)

    with open(backup_folder+'/signed_graph.pickle', 'wb') as file:
        pickle.dump(G, file)
    return G

def create_signed_graph(comment_file='./Data/complete_comments.jsonl'):
    """Read the comment dump, score every comment with VADER, and build the signed graph.

    Comments are streamed in chunks. Each video and user gets a consecutive
    integer index in order of first appearance, and for every video the
    ``compound`` polarity score of each comment is appended to a per-user
    list. The three lookup tables are pickled into ``Graph/signed`` as
    ``user_index``, ``video_index`` and ``video_users`` before the graph
    itself is built.

    NOTE(review): unlike ``create_unsigned_graph`` this function applies no
    category filter - confirm that the difference is intentional.

    Parameters
    ----------
    comment_file : str
        JSON-lines file with ``videoId``, ``userId`` and ``comment`` per row.
        The default is the previously hard-coded path.

    Returns
    -------
    networkx.Graph
        The graph returned by ``build_graph_from_sentiment_data``.
    """
    output_folder = 'Graph/signed'
    # Fail (or create the folder) before the long scoring pass, not after it.
    os.makedirs(output_folder, exist_ok=True)

    analyzer = SentimentIntensityAnalyzer()

    video_users = {}
    user_int_mapping = {}
    video_int_mapping = {}
    for chunk in pd.read_json(comment_file,encoding='utf-8',chunksize=10**6,lines=True):
        for line in chunk.to_dict(orient='records'):
            # Register a video the first time it is seen. Keying on the id
            # (instead of watching for a change of video) keeps the last
            # video of the file and tolerates comments of one video that are
            # not stored contiguously.
            video_id = line['videoId']
            video_index = video_int_mapping.get(video_id)
            if video_index is None:
                video_index = len(video_int_mapping)
                video_int_mapping[video_id] = video_index
                video_users[video_index] = defaultdict(list)

            user_index = user_int_mapping.setdefault(line['userId'], len(user_int_mapping))
            polarity = analyzer.polarity_scores(line['comment'])['compound']
            video_users[video_index][user_index].append(polarity)

    with open(output_folder+'/user_index','wb') as file:
        pickle.dump(user_int_mapping,file)
    with open(output_folder+'/video_index','wb') as file:
        pickle.dump(video_int_mapping,file)
    with open(output_folder+'/video_users','wb') as file:
        pickle.dump(video_users,file)
    return build_graph_from_sentiment_data(video_users,user_int_mapping,video_int_mapping)

In [4]:
%%time
G=create_unsigned_graph('Graph')

finished reading file
CPU times: total: 7h 40min 34s
Wall time: 14h 47min 53s


In [22]:
%%time
S = create_unsigned_graph('Graph/small','./Data/comments.jsonl')

finished reading file


100%|██████████████████████████████████████████████████████████████████████████████| 8332/8332 [50:13<00:00,  2.77it/s]


CPU times: total: 26min 45s
Wall time: 53min 52s
