In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle
import math
import networkx as nx

In [2]:
def get_stats(array):
    print("Mean:",np.mean(array))
    print("Percentiles")
    for i in [5,25,50,75,90,99, 99.5, 99.7]:
        print(f"{i} - {round(np.percentile(array, i))}")
    print()

In [3]:
import re
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
    try:
        cleantext = re.sub(CLEANR, '', raw_html)
    except:
        print(raw_html, type(raw_html))
        input()
    return cleantext


In [4]:
all_tags = pd.read_xml('./stackoverflow/sqls/Tags.xml')

In [5]:
all_posts = pd.read_xml('./stackoverflow/sqls/Posts.xml', parser='etree')

In [6]:
all_comments = pd.read_xml('./stackoverflow/sqls/Comments.xml', parser='etree')

In [7]:
all_users = pd.read_xml('./stackoverflow/sqls/Users.xml')
all_votes = pd.read_xml('./stackoverflow/sqls/Votes.xml')

In [8]:
user_data = {}
for row in all_users.itertuples():
    if row[1]==-1: continue
    id_ = int(row[1])
    udict = {'uname': row[4], 'bio': cleanhtml(row[8]) if type(row[8])==str else row[8], 'views': row[9], 'upvotes':row[10], 'downvotes':row[11],
                'questions':[], 'answers':[], 'comments':[]}
    user_data[id_] = udict

In [9]:
len(user_data)

37616

In [10]:
edges = defaultdict(list)
post_data = {}
for row in all_posts.itertuples():
    if row[2] not in [1,2]: continue
    tags = row[13][1:-1].split('><') if row[13] else row[13]
    parent_id = -1 if row[2]==1 else int(row[17])
    pdict = {'type':row[2], 'accepted_answer_id':row[3], 'score':row[5], 'views':row[6], 'text':cleanhtml(row[7]),
            'user_id': row[8], 'title':row[12], 'tags':tags, 'rootid': parent_id,  }
    
    if row[2]==1:
        if not math.isnan(row[8]):
            user_data[int(row[8])]['questions'].append(row[1])
    else: 
        edges[parent_id].append([parent_id, row[1]])
        if not math.isnan(row[8]):
            user_data[int(row[8])]['answers'].append(row[1])
            
    post_data[row[1]] = pdict

In [11]:
len(post_data), len(edges)

(49998, 14537)

In [12]:
for row in all_comments.itertuples():
    if row[2] not in post_data: continue
    cid = 'c'+str(row[1])
    
    rootid = post_data[row[2]]['rootid'] if post_data[row[2]]['type']==2 else row[2]
    pdict = {'type':3, 'accepted_answer_id':None, 'score':row[3], 'views':None, 'text':cleanhtml(row[4]),
            'user_id': row[6], 'title':None, 'tags':None, 'rootid': rootid,  }
    post_data[cid] = pdict
    edges[rootid].append([row[2], cid])
    
    if row[6] and not math.isnan(row[6]) and row[6]!=-1:
        user_data[int(row[6])]['comments'].append(cid)

In [13]:
trees = {}
for i,j in edges.items():
    trees[i] = nx.DiGraph(j)

In [14]:
len(trees)

15302

In [72]:
def get_tree_stats(from_rids):
    tree_answers = []
    tree_comments = []
    tree_view = []
    tree_score = []
    tree_topics = []
    tree_length= []
    topic_freq = defaultdict(int)
    tree_id = []
    for rid, G in trees.items():
        if rid not in from_rids:continue
        acnt=0
        ccnt=0
        for pid in G:
            if post_data[pid]['type']==1:
                tree_view.append(post_data[pid]['views'])
                tree_score.append(post_data[pid]['score'])

                if post_data[pid]['tags']:
#                     print(len(post_data[pid]['tags']))
                    tree_topics.append(len(post_data[pid]['tags']))
                    for t in post_data[pid]['tags']:
                        topic_freq[t]+=1
                else:
                    tree_topics.append(0)
            elif post_data[pid]['type']==2: acnt+=1
            elif post_data[pid]['type']==3: ccnt+=1
        tree_comments.append(ccnt)
        tree_answers.append(acnt)
        tree_length.append(ccnt+acnt)
        tree_id.append(rid)
    
    print("topics")
    get_stats(tree_topics)
    print("top freq")
    get_stats(list(topic_freq.values()))
#     print('questions')
#     get_stats(tree_questions)
    print('answers')
    get_stats(tree_answers)
    get_stats(tree_comments)
    print('length')
    get_stats(tree_length)
    return tree_length, tree_id

In [30]:
tree_length,tree_id=get_tree_stats(list(trees.keys()))

topics
Mean: 55.558394160583944
Percentiles
5 - 1
25 - 4
50 - 12
75 - 40
90 - 109
99 - 635
99.5 - 797
99.7 - 946

answers
Mean: 2.257548032936871
Percentiles
5 - 1
25 - 1
50 - 2
75 - 3
90 - 5
99 - 11
99.5 - 13
99.7 - 15

Mean: 11.48039471964449
Percentiles
5 - 0
25 - 3
50 - 7
75 - 15
90 - 27
99 - 68
99.5 - 82
99.7 - 91

Mean: 13.737942752581361
Percentiles
5 - 1
25 - 4
50 - 9
75 - 17
90 - 31
99 - 78
99.5 - 93
99.7 - 105



In [16]:
sorted(topic_freq.items(), key=lambda x:x[1], reverse=True)

[('united-states', 6742),
 ('united-kingdom', 1534),
 ('election', 978),
 ('european-union', 908),
 ('russian-federation', 809),
 ('president', 697),
 ('international-relations', 677),
 ('law', 664),
 ('donald-trump', 642),
 ('presidential-election', 609),
 ('brexit', 596),
 ('economy', 564),
 ('congress', 546),
 ('constitution', 527),
 ('china', 498),
 ('voting', 473),
 ('armed-conflict', 464),
 ('ukraine', 436),
 ('democracy', 403),
 ('parliament', 400),
 ('parties', 395),
 ('military', 388),
 ('international-law', 381),
 ('political-theory', 360),
 ('senate', 347),
 ('india', 340),
 ('government', 320),
 ('covid-19', 310),
 ('history', 293),
 ('international', 283),
 ('taxes', 270),
 ('terminology', 262),
 ('supreme-court', 246),
 ('united-nations', 238),
 ('voting-systems', 236),
 ('public-opinion', 235),
 ('policy', 231),
 ('trade', 227),
 ('israel', 223),
 ('immigration', 213),
 ('germany', 210),
 ('media', 209),
 ('house-of-representatives', 204),
 ('impeachment', 196),
 ('healt

In [32]:
idxs = np.where(np.array(tree_length)>=31)

In [33]:
top10_tree_id = np.array(tree_id)[idxs]

In [34]:
len(top10_tree_id)

1552

In [35]:
top10_users=set()
top10_posts=set()
for rid in top10_tree_id:
    for pid in trees[rid]:
        top10_users.add(post_data[pid]['user_id'])
        top10_posts.add(pid)

In [36]:
len(top10_users), len(top10_posts)

(8826, 79130)

In [37]:
user_views = []
user_num_questions = []
user_num_answers = []
user_num_comments = []
user_interaction = []
user_topics = []

for uid,udict in user_data.items():
    user_views.append(udict['views'])
    
    user_num_questions.append(len(udict['questions']))
    user_num_answers.append(len(udict['answers']))
    user_num_comments.append(len(udict['comments']))
    user_interaction.append(user_num_questions[-1]+user_num_answers[-1]+user_num_comments[-1])
    
    utopics = defaultdict(int)
    for pid in udict['questions']:
        if post_data[pid]['tags']:
            for t in post_data[pid]['tags']:
                utopics[t]+=2
    for pid in udict['answers']:
        rid = post_data[pid]['rootid']
        if post_data[rid]['tags']:
            for t in post_data[rid]['tags']:
                utopics[t]+=1
    for pid in udict['comments']:
        rid = post_data[pid]['rootid']
        if post_data[rid]['tags']:
            for t in post_data[rid]['tags']:
                utopics[t]+=0.5
    user_topics.append(sum(utopics.values()))
                
        

In [38]:
get_stats(user_interaction)

Mean: 5.711027222458529
Percentiles
5 - 0
25 - 0
50 - 0
75 - 1
90 - 4
99 - 85
99.5 - 177
99.7 - 313



In [39]:
print("Questions per user")
get_stats(user_num_questions)

Questions per user
Mean: 0.3885580603998299
Percentiles
5 - 0
25 - 0
50 - 0
75 - 0
90 - 1
99 - 5
99.5 - 10
99.7 - 16



In [40]:
print("Answers per user")
get_stats(user_num_answers)

Answers per user
Mean: 0.884543811144194
Percentiles
5 - 0
25 - 0
50 - 0
75 - 0
90 - 1
99 - 11
99.5 - 25
99.7 - 49



In [41]:
print("Comments per user")
get_stats(user_num_comments)

Comments per user
Mean: 4.4379253509145045
Percentiles
5 - 0
25 - 0
50 - 0
75 - 0
90 - 2
99 - 67
99.5 - 139
99.7 - 259



In [42]:
print("Views per user")
get_stats(user_views)

Views per user
Mean: 11.260740110591238
Percentiles
5 - 0
25 - 0
50 - 0
75 - 2
90 - 7
99 - 198
99.5 - 393
99.7 - 639



In [43]:
print("Topics per user")
get_stats(user_topics)

Topics per user
Mean: 11.798383666524883
Percentiles
5 - 0
25 - 0
50 - 0
75 - 2
90 - 9
99 - 177
99.5 - 361
99.7 - 595



In [52]:
def get_user_stats(uid_set, top10_posts):
    user_views = []
    user_num_questions = []
    user_num_answers = []
    user_num_comments = []
    user_interaction = []
    user_topics = []
    user_ids = []

    for uid,udict in user_data.items():
        if uid not in uid_set: continue
        user_views.append(udict['views'])
        user_ids.append(uid)
        qids = [i for i in udict['questions'] if i in top10_posts]
        aids = [i for i in udict['answers'] if i in top10_posts]
        cids = [i for i in udict['comments'] if i in top10_posts]

        user_num_questions.append(len(qids))
        user_num_answers.append(len(aids))
        user_num_comments.append(len(cids))
        user_interaction.append(len(qids)+len(aids)+len(cids))

        utopics = defaultdict(int)
        for pid in qids:
            if post_data[pid]['tags']:
                for t in post_data[pid]['tags']:
                    utopics[t]+=2
        for pid in aids:
            rid = post_data[pid]['rootid']
            if post_data[rid]['tags']:
                for t in post_data[rid]['tags']:
                    utopics[t]+=1
        for pid in cids:
            rid = post_data[pid]['rootid']
            if post_data[rid]['tags']:
                for t in post_data[rid]['tags']:
                    utopics[t]+=0.5
        user_topics.append(sum(utopics.values()))
        
    print("Interactions per user")
    get_stats(user_interaction)
    
    print("Topics per user")
    get_stats(user_topics)
    
    return user_interaction, user_ids

In [45]:
top10_users - set(user_data.keys())

{nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [46]:
len(top10_users)

8826

In [53]:
u_inters, user_ids = get_user_stats(top10_users, top10_posts)

Interactions per user
Mean: 13.91182953710507
Percentiles
5 - 1
25 - 1
50 - 2
75 - 7
90 - 23
99 - 223
99.5 - 337
99.7 - 593

Topics per user
Mean: 25.917983100661278
Percentiles
5 - 1
25 - 2
50 - 5
75 - 13
90 - 42
99 - 411
99.5 - 651
99.7 - 1050



In [54]:
len(u_inters)

5444

In [55]:
idxs = np.where(np.array(u_inters)>=23)
top10_tree_top10_users = np.array(user_ids)[idxs]

In [56]:
top10_tree_top10_users = set(top10_tree_top10_users)

In [57]:
len(top10_tree_top10_users)

552

In [58]:
chosen_trees = []
for rid in top10_tree_id:
    cnt=0
    for pid in trees[rid]:
        if post_data[pid]['type'] in [2] and post_data[pid]['user_id'] in top10_tree_top10_users:
            cnt+=1
    if cnt>=2:
        chosen_trees.append(rid)

In [59]:
len(chosen_trees)

1456

In [63]:
_,_=get_user_stats(top10_tree_top10_users,top10_posts)

Interactions per user
Mean: 103.77898550724638
Percentiles
5 - 24
25 - 33
50 - 49
75 - 94
90 - 223
99 - 907
99.5 - 1113
99.7 - 1200

Topics per user
Mean: 189.89402173913044
Percentiles
5 - 40
25 - 58
50 - 90
75 - 176
90 - 407
99 - 1590
99.5 - 1911
99.7 - 2159



In [70]:
_,_=get_tree_stats(chosen_trees)

topics
Mean: 3.161401098901099
Percentiles
5 - 1
25 - 2
50 - 3
75 - 4
90 - 5
99 - 5
99.5 - 5
99.7 - 5

top freq
Mean: 9.187624750499001
Percentiles
5 - 1
25 - 1
50 - 3
75 - 7
90 - 19
99 - 79
99.5 - 105
99.7 - 134

answers
Mean: 6.638049450549451
Percentiles
5 - 3
25 - 4
50 - 6
75 - 8
90 - 11
99 - 18
99.5 - 20
99.7 - 22

Mean: 44.03846153846154
Percentiles
5 - 27
25 - 31
50 - 37
75 - 50
90 - 70
99 - 126
99.5 - 146
99.7 - 153

length
Mean: 50.676510989010985
Percentiles
5 - 31
25 - 36
50 - 43
75 - 57
90 - 79
99 - 143
99.5 - 163
99.7 - 169



In [53]:
chosen_trees

[7,
 17,
 135,
 179,
 238,
 256,
 319,
 323,
 327,
 345,
 404,
 429,
 482,
 489,
 537,
 590,
 586,
 607,
 918,
 1081,
 1084,
 1095,
 1126,
 1211,
 1147,
 1279,
 1302,
 1389,
 1435,
 1688,
 1809,
 1671,
 1838,
 2061,
 2113,
 2186,
 2350,
 2461,
 2471,
 2499,
 2588,
 2601,
 2578,
 2710,
 2714,
 2744,
 2809,
 2834,
 2842,
 2959,
 3009,
 3022,
 2988,
 3091,
 3098,
 3292,
 3285,
 3339,
 3413,
 4481,
 4626,
 4639,
 4681,
 4677,
 4732,
 4476,
 4839,
 5004,
 4978,
 6045,
 6163,
 6184,
 6222,
 6286,
 6291,
 6318,
 6345,
 6428,
 6459,
 6522,
 6588,
 7619,
 7737,
 7762,
 7792,
 7816,
 7833,
 7874,
 7948,
 8012,
 8029,
 8135,
 8158,
 8209,
 8277,
 8416,
 8425,
 8499,
 8531,
 8598,
 8644,
 8742,
 1743,
 8921,
 8969,
 9107,
 9159,
 9184,
 9226,
 9249,
 9282,
 9292,
 9368,
 9421,
 9470,
 9525,
 9670,
 10006,
 10010,
 10021,
 10119,
 10214,
 10211,
 10238,
 10266,
 10343,
 10394,
 10457,
 10485,
 10536,
 10853,
 10923,
 10960,
 11012,
 11021,
 11168,
 11344,
 11368,
 11450,
 11505,
 11575,
 11644,
 11

In [76]:
pickle.dump([trees, user_data, post_data, top10_tree_top10_users, chosen_trees], open('so_politics.pkl','wb'))

In [75]:
len(trees), len(top10_tree_top10_users), len(chosen_trees)

(15302, 552, 1456)

In [77]:
from sklearn.model_selection import train_test_split
train_trees, test_trees = train_test_split(chosen_trees, test_size=0.33, random_state=0)

In [None]:
_,_=get_tree_stats(train_trees)