In [5]:
import xml.etree.ElementTree as ET
import pandas as pd
import networkx as nx
from collections import Counter

In [6]:
with open('Posts.xml', 'r') as xml_file:
    tree = ET.parse(xml_file)
    postroot = tree.getroot()
    users, questions, answers= dict(), dict(), dict()
    for postrow in postroot:
        post = postrow.attrib
        post_ori_Id  = post.pop('Id')
        posttype = post.pop('PostTypeId')
        if posttype == '1':
            assert post_ori_Id not in questions
            questions[post_ori_Id] = post
        elif posttype == '2':
            assert post_ori_Id not in answers
            answers[post_ori_Id] = post
            answers[post_ori_Id]['Accepted'] = False
print(questions['1'])

{'AcceptedAnswerId': '4', 'CreationDate': '2014-01-21T20:26:05.383', 'Score': '20', 'ViewCount': '2399', 'Body': '<p>I was offered a beer the other day that was reportedly made with citra hops. What are citra hops? Why should I care that my beer is made with them?</p>\n', 'OwnerUserId': '7', 'LastEditorUserId': '8', 'LastEditDate': '2014-01-21T22:04:34.977', 'LastActivityDate': '2014-01-21T22:04:34.977', 'Title': 'What is a citra hop, and how does it differ from other hops?', 'Tags': '<hops>', 'AnswerCount': '1', 'CommentCount': '0', 'ContentLicense': 'CC BY-SA 3.0'}


In [7]:
G = nx.Graph()
for asr in answers.keys():
    if 'OwnerUserId' in answers[asr]:
        ques = answers[asr]['ParentId']
        usr = answers[asr]['OwnerUserId']
        G.add_edge('A'+asr, 'Q'+ques)
        G.add_edge('A'+asr, 'U'+usr)
for que in questions.keys():
    if 'OwnerUserId' in questions[que]:
        usr = questions[que]['OwnerUserId']
        G.add_edge('U'+usr, 'Q'+ques)
    
largest_components=max(nx.connected_components(G),key=len) 
len(largest_components)
NG = G.subgraph(largest_components)
print(len(NG.nodes()), len(NG.edges()))
el = []
for n1,n2 in NG.edges():
    s = n1[0]
    t = n2[0]
    if s < t:
        el.append((s,t))
    else:
        el.append((t,s))
print(Counter(el))
user_Id, question_Id, answer_Id = 0, 0, 0
for node in NG.nodes():
    if node[0] == 'U':
        if node[1:] not in users:
            users[node[1:]] = {'UId': user_Id}
            user_Id += 1
    elif node[0] == 'A':
        answers[node[1:]]['AId'] = answer_Id
        answer_Id += 1
    elif node[0] == 'Q':
        questions[node[1:]]['QId'] = question_Id
        question_Id += 1

4509 5275
Counter({('A', 'Q'): 2343, ('A', 'U'): 2343, ('Q', 'U'): 589})


In [40]:
def Id_ori2new(ori_Id):
    if ori_Id[0] == 'U':
        return users[ori_Id[1:]]['UId'] + question_Id + answer_Id
    elif ori_Id[0] == 'A':
        return answers[ori_Id[1:]]['AId']
    elif ori_Id[0] == 'Q':
        return questions[ori_Id[1:]]['QId'] + answer_Id
    
with open('alledges.txt','w',encoding = 'utf-8') as ef:
    for n1, n2 in  NG.edges():
        ef.write("{}\t{}\n".format(Id_ori2new(n1), Id_ori2new(n2)))
print('saved')

saved


In [41]:
with open('alltypes.txt','w',encoding = 'utf-8') as tf:
    tf.write("0\t{}\t{}\n".format(0, answer_Id-1))
    tf.write("1\t{}\t{}\n".format(answer_Id, answer_Id-1 + question_Id))
    tf.write("2\t{}\t{}\n".format(answer_Id+question_Id, answer_Id-1 + question_Id + user_Id))
print('saved')

saved


In [42]:
acc_count = 0
for que in questions.keys():
    if 'AcceptedAnswerId' in questions[que]:
        acc_Id = questions[que]['AcceptedAnswerId']
        answers[acc_Id]['Accepted'] = True
        acc_count += 1

with open('answersLabelAcception.txt','w',encoding = 'utf-8') as lf:
    for n in NG.nodes():
        if n[0] == 'A':
            
            if not answers[n[1:]]['Accepted']:
                lf.write("{}\t0\n".format(answers[n[1:]]['AId']))
            else:
                lf.write("{}\t1\n".format(answers[n[1:]]['AId']))
print('saved')
print(answer_Id, acc_count)

saved
2343 659


In [45]:
import math
import numpy as np
scorecount = {}
aidlist = []
scorelist = []

for n in NG.nodes():
    if n[0] == 'A':
        aidlist.append(answers[n[1:]]['AId'])
        scorelist.append(int(answers[n[1:]]['Score']))
df = pd.DataFrame.from_dict({'aid': aidlist, 'score': scorelist})
space = [df.score.min()-1]
scorelistcp = sorted(scorelist)
N = 4
rg = int(len(scorelistcp) / N)+1
for i in range(1,N):
    space.append(scorelistcp[rg*i])
space.append(df.score.max())
#print(scorelistcp)
print(space)
df['label']=pd.cut(df['score'],bins=space,labels=list(range(N)))
print(df.label.value_counts())

with open('answersLabelScore.txt','w',encoding = 'utf-8') as lf:
    for DFId, ansrow in df.iterrows():
        lf.write("{}\t{}\n".format(ansrow[0], ansrow[2]))


[-6, 1, 2, 5, 45]
0    760
2    685
3    453
1    445
Name: label, dtype: int64


In [21]:
import math
import numpy as np
scorecount = {}
qidlist = []
scorelist = []

for n in NG.nodes():
    if n[0] == 'Q':
        qidlist.append(questions[n[1:]]['QId'] + answer_Id)
        scorelist.append(int(questions[n[1:]]['Score']))
df = pd.DataFrame.from_dict({'qid': qidlist, 'score': scorelist})
space = [df.score.min()-1]
scorelistcp = sorted(scorelist)
N = 3
rg = int(len(scorelistcp) / N)+1
for i in range(1,N):
    space.append(scorelistcp[rg*i])
space.append(df.score.max())
print(space)
df['label']=pd.cut(df['score'],bins=space,labels=list(range(N)))
print(df.label.value_counts())

with open('questionsLabelScore.txt','w',encoding = 'utf-8') as lf:
    for DFId, querow in df.iterrows():
        lf.write("{}\t{}\n".format(querow[0], querow[2]))

[-8, 4, 7, 67]
0    422
1    296
2    287
Name: label, dtype: int64


In [22]:
with open('userInfo.txt','w',encoding = 'utf-8') as uf:
    for usr in users.keys():
        uf.write("{}\t{}\n".format(users[usr]['UId']+answer_Id+question_Id, usr))

In [29]:
import math
import numpy as np
with open('Users.xml', 'r') as xml_file:
    tree = ET.parse(xml_file)
    userroot = tree.getroot()
    for userrow in userroot:
        user = userrow.attrib
        user_ori_Id  = user.pop('Id')
        reputation = user.pop('Reputation')
        if user_ori_Id in users.keys():
            users[user_ori_Id]['Repu'] = int(reputation)
uidlist = []
repuList = []
for n in NG.nodes():
    if n[0] == 'U':
        uidlist.append(users[n[1:]]['UId']+answer_Id+question_Id)
        repuList.append(users[n[1:]]['Repu'])
df = pd.DataFrame.from_dict({'uid': uidlist, 'repu': repuList})
space = [df.repu.min()-1]
repulistcp = sorted(repuList)
N = 3
rg = int(len(repulistcp) / N)+1
for i in range(1,N):
    space.append(repulistcp[rg*i])
space.append(df.repu.max())
print(space)
df['label']=pd.cut(df['repu'],bins=space,labels=list(range(N)))
print(df.label.value_counts())
with open('userReputation.txt','w',encoding = 'utf-8') as lf:
    for DFId, userrow in df.iterrows():
        lf.write("{}\t{}\n".format(userrow[0], userrow[2]))

[0, 41, 151, 9345]
0    398
1    396
2    367
Name: label, dtype: int64
