In [1]:
import requests
import json
import hashlib
from bs4 import BeautifulSoup

In [2]:
def query(params: dict) -> dict:
    """Send a GET request to the English Wikipedia API and return the decoded JSON.

    Args:
        params: query-string parameters for the API call. The dict is not
            modified; ``format=json`` is added to a copy.

    Returns:
        The JSON body of the response, decoded into a dict.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Work on a copy so callers that reuse their dict (e.g. for continuation)
    # do not get it modified behind their back.
    request_params = {**params, "format": "json"}
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, params=request_params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()

def compute_md5(s) -> str:
    """Return the hexadecimal MD5 digest of ``str(s)`` with surrounding whitespace stripped."""
    normalized = str(s).strip()
    digest = hashlib.md5()
    digest.update(normalized.encode('utf-8'))
    return digest.hexdigest()

In [3]:
def get_n_revisions(title: str, n: int, first_or_last: str) -> list:
    """Fetch up to ``n`` revision records of a page.

    Args:
        title: page title, e.g. "Talk:Philosophy".
        n: maximum number of revisions to return.
        first_or_last: "first" requests ``rvdir=newer``; any other value
            requests ``rvdir=older``.

    Returns:
        A list of at most ``n`` revision dicts as returned by the API. The list
        is shorter than ``n`` when the page has fewer revisions.
    """
    revs = []
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|userid|parsedcomment|comment|roles",
        "rvdir": "newer" if first_or_last == "first" else "older",
        "rvlimit": n,
        "formatversion": "2",
    }

    # The API may answer in several batches; follow the continuation token
    # until we have n revisions or the API reports that there is nothing left.
    while len(revs) < n:
        response = query(params)
        batch = response["query"]["pages"][0].get("revisions", [])
        revs.extend(batch[: n - len(revs)])
        # The last batch carries no "continue" entry; reading it blindly
        # raised KeyError, so stop cleanly instead.
        next_token = response.get("continue", {}).get("rvcontinue")
        if next_token is None:
            break
        params["rvcontinue"] = next_token

    return revs

def get_revisions_between_ids(title: str, fromid: int, toid: int) -> list:
    """Fetch every revision record of a page from ``fromid`` to ``toid``.

    Args:
        title: page title, e.g. "Talk:Philosophy".
        fromid: revision id sent as ``rvstartid``.
        toid: revision id sent as ``rvendid``.

    Returns:
        The revision dicts returned by the API, in the order the API lists
        them for ``rvdir=newer``.
    """
    revs = []
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|userid|parsedcomment|comment|roles",
        "rvdir": "newer",
        "rvstartid": fromid,
        "rvendid": toid,
        # Ask for the largest batch the API allows to keep the request count low.
        "rvlimit": "max",
        "formatversion": "2",
    }

    # Collect whole batches and follow the continuation token until the API
    # stops returning one. The previous implementation kept only one revision
    # per request while also advancing the continuation, which skipped
    # revisions and could index past the end of a batch.
    while True:
        response = query(params)
        revs.extend(response["query"]["pages"][0].get("revisions", []))
        next_token = response.get("continue", {}).get("rvcontinue")
        if next_token is None:
            break
        params["rvcontinue"] = next_token
        print("More revisions available. Sending another request...")

    return revs

In [4]:
# Smoke test: request 10 revisions of Talk:Philosophy with the "first" ordering
# and print each revision id next to the id of its parent revision.
a = get_n_revisions("Talk:Philosophy", 10, "first")
for el in a:
    print(el["revid"], el["parentid"])

370558637 370558591
370558638 370558591
273853 370558638
273854 273853
47328 273854
47331 47328
47534 47331
55470 47534
137104 55470
217200 137104


In [5]:
def get_revision_diff(title: str, fromid: int, toid: int) -> dict:
    """Ask the API's ``compare`` action for the diff between two revisions.

    ``title`` is accepted for symmetry with the other helpers but is not sent
    to the API; the two revision ids identify the diff.
    """
    compare_request = {
        "action": "compare",
        "fromrev": fromid,
        "torev": toid,
    }
    return query(compare_request)

def compute_text_depth(text: str) -> int:
    """Return the number of leading ':' characters in ``text``.

    Handles the empty string and strings made only of colons, which the
    previous character-by-character scan ran off the end of (IndexError).
    """
    return len(text) - len(text.lstrip(":"))

def compute_reply_hash(accum: dict, reply_to_hash: str, reply_to_depth: int, this_depth: int) -> str:
    """Decide which earlier block a newly added block replies to.

    Args:
        accum: the accumulator; currently unused, kept for the call signature.
        reply_to_hash: hash of the block that precedes the new block, or None
            when no block has been seen yet.
        reply_to_depth: depth of that preceding block, or None.
        this_depth: depth of the new block.

    Returns:
        ``reply_to_hash`` when the new block is indented deeper than the
        preceding block; otherwise None (top-level block, no preceding block,
        or a block at the same or a shallower depth).
    """
    if this_depth == 0:
        return None
    if reply_to_hash is None or reply_to_depth is None:
        # Nothing precedes this block in the diff; comparing an int with None
        # used to raise TypeError here.
        return None
    if this_depth > reply_to_depth:
        return reply_to_hash
    # NOTE(review): the previous code tried to walk up the thread here via
    # accum[reply_to_hash]["reply_to"], but no such entry is ever stored (blocks
    # live under accum["blocks"] and carry "reply_chain"), the failure was hidden
    # by a bare except, and nothing was returned after the loop. The observable
    # result was always None, which is kept. TODO: decide the intended parent for
    # same-depth and shallower replies and derive it from "reply_chain".
    return None
    
def find_ultimate_hash(accum: dict, h: str) -> str:
    """Follow ``accum["hash_lookup"]`` from ``h`` until a hash maps to itself and return it."""
    current = h
    successor = accum["hash_lookup"][current]
    while successor != current:
        current = successor
        successor = accum["hash_lookup"][current]
    return current

def is_unedited_block(all_td: list) -> bool:
    """True when the row has four cells and the first and third cells are equal."""
    if len(all_td) != 4:
        return False
    return all_td[0] == all_td[2]

def is_new_section_text(added_text: str) -> bool:
    """True when the text both starts and ends with a '===' or '==' heading marker."""
    for marker in ("===", "=="):
        if added_text.startswith(marker) and added_text.endswith(marker):
            return True
    return False

def is_new_content_block(all_td: list) -> bool:
    """True for a three-cell row whose first cell is 'diff-empty' and third is 'diff-addedline'."""
    if len(all_td) != 3:
        return False
    if all_td[0]["class"][0] != "diff-empty":
        return False
    return all_td[2]["class"][0] == "diff-addedline"

def is_removal_block(all_td: list) -> bool:
    """True for a three-cell row whose second cell is 'diff-deletedline' and third is 'diff-empty'."""
    if len(all_td) != 3:
        return False
    if all_td[1]["class"][0] != "diff-deletedline":
        return False
    return all_td[2]["class"][0] == "diff-empty"

def is_modification_block(all_td: list) -> bool:
    """True for a four-cell row whose second cell is 'diff-deletedline' and fourth is 'diff-addedline'."""
    if len(all_td) != 4:
        return False
    if all_td[1]["class"][0] != "diff-deletedline":
        return False
    return all_td[3]["class"][0] == "diff-addedline"

def is_line_number_block(all_td: list) -> bool:
    """True for a two-cell row in which both cells have 'diff-lineno' as their first class."""
    if len(all_td) != 2:
        return False
    if all_td[0]["class"][0] != "diff-lineno":
        return False
    return all_td[1]["class"][0] == "diff-lineno"


# How does this script handle modification? 
#   Part of the accumulation dictionary is the "hash_lookup" table. This table stores the modified hashes of 
#   every block that has been added to accum and not removed. I may or may not decide to remove hashes on 
#   removal from the thread (likely will). This allows for consistency of information across revisions, however
#   rare editing a comment is. Additionally, each dictionary in accum keeps track of which revisions edited it,
#   whose length should be equal to the depth in hash_lookup

def parse_diff(edits: list, diff: dict, accum: dict = None) -> dict:
    """Fold one revision-to-revision diff into the accumulator and return it.

    Args:
        edits: revision records. ``edits[0]`` is used as the older revision
            (only its "timestamp" is read) and ``edits[1]`` as the newer one
            (its "timestamp", "user" and "revid" are read).
        diff: response of the compare API; the diff rows are parsed from
            ``diff["compare"]["*"]``.
        accum: accumulator returned by a previous call, an empty dict, or None.
            None and ``{}`` both start a fresh accumulator. Pass the returned
            dict back in to accumulate over several diffs. (The default used to
            be a dict literal, which Python shares between calls, so unrelated
            calls silently leaked state into each other.)

    Returns:
        The accumulator, a dict with three entries:
        "hash_lookup" maps every known block hash to the hash that replaced it
        (or to itself); "blocks" maps a block hash to its record (text,
        timestamp, user, ingested, revisions, reply_chain); "revisions" maps a
        revision id to the list of behaviors seen in that revision.

    Raises:
        Exception: when a diff row matches none of the known row layouts.
    """
    if accum is None:
        accum = {}

    if accum == {}:
        accum["hash_lookup"] = {}
        accum["revisions"] = {}
        accum["blocks"] = {}

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
    # happens to be installed (and warns), and parsers differ in how they treat
    # table rows that are not wrapped in a <table>.
    soup = BeautifulSoup(diff["compare"]["*"], "html.parser")
    this_rev = {}
    hashed_text, block_depth, last_hash, last_depth = None, None, None, None
    last_block_was_ingested = False
    behavior = []
    # The first row is skipped; every other row is classified by its cells.
    for tr in soup.find_all("tr")[1:]:
        all_td = tr.find_all("td")
        block = {}

        if is_unedited_block(all_td):  # no edit in this block [ASSUMPTION. Must test with modification of a single block]
            assert(all_td[1].get_text() == all_td[3].get_text())
            unedited_text = str(all_td[1].get_text())
            if len(unedited_text) > 0:
                hashed_text = compute_md5(unedited_text)
                block_depth = compute_text_depth(unedited_text)
                if hashed_text not in accum["blocks"]:  # this old block has not yet been added to accum
                    # Author and originating revision are unknown for context
                    # lines, so the block is recorded as not ingested.
                    block["text"] = unedited_text
                    block["timestamp"] = edits[0]["timestamp"]
                    block["user"] = None
                    block["ingested"] = False
                    block["revisions"] = ["unknown"]
                    block["reply_chain"] = [hashed_text]
                    accum["blocks"][hashed_text] = block
                    accum["hash_lookup"][hashed_text] = hashed_text
                # else: unchanged block has already been added to accum
                last_hash = hashed_text
                last_depth = block_depth
                last_block_was_ingested = False
            # else: unchanged block is empty, do not need to record

        elif is_new_content_block(all_td):  # block includes new content
            added_text = str(all_td[2].get_text())
            hashed_text = compute_md5(added_text)
            if len(added_text) > 0:
                block["text"] = added_text
                block["timestamp"] = edits[1]["timestamp"]
                block["user"] = edits[1]["user"]
                block["ingested"] = True
                block["revisions"] = [edits[1]["revid"]]

                if is_new_section_text(added_text):
                    behavior.append("create_section")
                    block["reply_chain"] = [hashed_text]
                else:
                    behavior.append("add_comment")
                    block_depth = compute_text_depth(added_text)
                    if last_block_was_ingested:
                        # Consecutive added lines continue the chain of the
                        # line added just before them.
                        block["reply_chain"] = accum["blocks"][last_hash]["reply_chain"].copy()
                        block["reply_chain"].append(hashed_text)
                        accum["blocks"][last_hash]["is_followed"] = True
                    else:
                        reply_to_hash = compute_reply_hash(accum, last_hash, last_depth, block_depth)
                        if reply_to_hash is not None:
                            block["reply_chain"] = accum["blocks"][reply_to_hash]["reply_chain"].copy()
                            block["reply_chain"].append(hashed_text)
                        else:
                            block["reply_chain"] = [hashed_text]

                accum["blocks"][hashed_text] = block
                accum["hash_lookup"][hashed_text] = hashed_text
                last_hash = hashed_text
                last_depth = block_depth
                last_block_was_ingested = True

        elif is_removal_block(all_td):    # block is removing some earlier block
            removed_text = str(all_td[1].get_text())
            if len(removed_text) > 0:
                hashed_removal = compute_md5(removed_text)
                try:
                    del accum["blocks"][hashed_removal]    # removes the comment from the record of utterances
                    del accum["hash_lookup"][hashed_removal]
                except KeyError:
                    print("-------------------------------")
                    print("did not have comment on record: ")
                    print(removed_text)
                    print("-------------------------------")

        elif is_modification_block(all_td):
            old_text = str(all_td[1].get_text())
            old_hash = compute_md5(old_text)
            new_text = str(all_td[3].get_text())
            new_hash = compute_md5(new_text)
            behavior.append("modify")

            if old_hash in accum["blocks"]:
                assert(old_hash in accum["hash_lookup"])
                # NOTE: does not touch "ingested" or "reply_chain" elements of dictionary
                # (was `old_hashed`, an undefined name that raised NameError)
                block = accum["blocks"].pop(old_hash)
                block["text"] = new_text
                block["timestamp"] = edits[1]["timestamp"]
                block["user"] = edits[1]["user"]
                block["revisions"].append(edits[1]["revid"])
                accum["blocks"][new_hash] = block
                # Keep the old hash resolvable: it now points at the new one.
                accum["hash_lookup"][new_hash] = new_hash
                accum["hash_lookup"][old_hash] = new_hash
            else:
                # weird case, someone edits comment that hasn't been seen
                assert(old_hash not in accum["hash_lookup"])
                block = {}
                block["text"] = new_text
                block["timestamp"] = edits[1]["timestamp"]
                block["user"] = edits[1]["user"]
                block["ingested"] = False
                block["revisions"] = ["unknown", edits[1]["revid"]]
                block["reply_chain"] = [new_hash]
                accum["blocks"][new_hash] = block
                accum["hash_lookup"][new_hash] = new_hash
        elif not is_line_number_block(all_td):
            print(all_td)
            raise Exception("block has unknown behavior")

    # Record what this revision did, keyed by the newer revision's id.
    this_rev["behavior"] = behavior
    revid = edits[1]["revid"]
    accum["revisions"][revid] = this_rev

    return accum

In [6]:
def print_accum(accum: dict) -> None:
    """Pretty-print the "hash_lookup", "blocks" and "revisions" entries of ``accum``.

    Entries are printed in the dict's own order; any other key is ignored.
    """
    for section, contents in accum.items():
        if section == "hash_lookup":
            print("hash_lookup----------------")
            for source_hash, target_hash in contents.items():
                print(source_hash + ": " + target_hash)
            print("---------------------------\n")

        elif section == "blocks":
            print("blocks---------------------")
            for block_hash, block in contents.items():
                print("\n")
                print(block_hash)
                for field, value in block.items():
                    print(field, ":", value)
            print("---------------------------")

        elif section == "revisions":
            print("revisions------------------")
            for revid, revision in contents.items():
                print(revid, ":", revision)
            print("---------------------------\n")

[Simple response to comment](https://en.wikipedia.org/w/index.php?diff=3561919&oldid=3553760&title=Talk:Philosophy)

In [38]:
# Revision ids of the diff linked above (oldid=3553760, diff=3561919).
edit1 = 3553760
edit2 = 3561919

# Fetch the revision records and the diff between the two ids, then parse it.
# NOTE(review): parse_diff reads edits[0] and edits[1]; if other revisions lie
# between the two ids, edits[1] is not the revision the diff ends at — confirm
# that the ids are adjacent.
edits = get_revisions_between_ids("Talk:Philosophy", edit1, edit2)
diff = get_revision_diff("Talk:Philosophy", edit1, edit2)
parsed = parse_diff(edits, diff)
# print_accum(parsed)

# Revision ids kept for a removal example; not used in this cell.
remove_edit1 = 3112925
remove_edit2 = 3112956

[Slightly coarse discourse - demonstrating correct recognition of reply, depth, and old comments](https://en.wikipedia.org/w/index.php?diff=4644488&oldid=4644398&title=Talk:Philosophy)

In [39]:
# First step of the example: parse the diff from edit3 to edit4.
edit3 = 4644398
edit4 = 4644478

edits = get_revisions_between_ids("Talk:Philosophy", edit3, edit4)
diff = get_revision_diff("Talk:Philosophy", edit3, edit4)
parsed = parse_diff(edits, diff)
# print_accum(parsed)

# Visual separator between the output of the two steps.
print("\n")
print("--------------------------------------------------------------------")
print("\n")

# Second step: parse the diff from edit4 to edit5. `edits`, `diff` and `parsed`
# are rebound, so the values from the first step are no longer reachable.
edit5 = 4644488

edits = get_revisions_between_ids("Talk:Philosophy", edit4, edit5)
diff = get_revision_diff("Talk:Philosophy", edit4, edit5)
parsed = parse_diff(edits, diff)
# print_accum(parsed)



--------------------------------------------------------------------




[Adding russian then having it removed - mildly bad behavior](https://en.wikipedia.org/w/index.php?diff=919321798&oldid=917895729&title=Talk:Philosophy)

In [43]:
# Revision ids for the example linked above.
before_russian = 917895729
adds_russian = 919321798
removes_russian = 922496879
# NOTE(review): same value as removes_russian and not used in the cells shown
# here — possibly a copy-paste slip; confirm the intended revision id.
removes_more = 922496879

# Parse the diff in which the Russian text is added.
edits = get_revisions_between_ids("Talk:Philosophy", before_russian, adds_russian)
diff = get_revision_diff("Talk:Philosophy", before_russian, adds_russian)
parsed = parse_diff(edits, diff)
# print(parsed)

In [42]:
# Parse the diff in which the Russian text is removed again.
# NOTE(review): this cell needs adds_russian / removes_russian from the cell
# above, yet its execution count (42) is lower than that cell's (43); check
# that the notebook still works with Restart & Run All.
edits = get_revisions_between_ids("Talk:Philosophy", adds_russian, removes_russian)
diff = get_revision_diff("Talk:Philosophy", adds_russian, removes_russian)
parsed = parse_diff(edits, diff)
# print(parsed)

-------------------------------
did not have comment on record: 
== Досуговые измышления. С чашкой. Чая. На пример ..  ==
-------------------------------
-------------------------------
did not have comment on record: 
"В Научном Мире" (старом) это слово называлось "гипотеза". 
-------------------------------
-------------------------------
did not have comment on record: 
Сейчас - "корреляция". В US требование к высоте потенциального барьера над неопределённостью типа "50%/50%" - 25%. 
-------------------------------
-------------------------------
did not have comment on record: 
У нас (в SU) .. в RU ? .. скоромно - 1%. Факт конешно упрямая вещь .. Но уровень инфлюэнций ожидаетца порядка 4% .. 
-------------------------------
-------------------------------
did not have comment on record: 
Хрен знат .. может и сделают перерасчёт пенсионов как обещали.  Хм .. ?? Ну и ладно! Вот и хорошо !! .. [[Special:Contributions/85.140.16.27|85.140.16.27]] ([[User talk:85.140.16.27|talk]]) 02:34, 

Note: CRAFT may have trouble when one user posts several comments in a row; it would be better to merge such consecutive blocks into a single comment.

In [18]:
def process_between_revisions(title: str, fromid: int, toid: int) -> dict:
    """Parse every consecutive pair of revisions between two ids into one accumulator.

    Fetches the revisions from ``fromid`` to ``toid``, diffs each revision
    against the one listed before it, and feeds each diff to ``parse_diff``,
    threading the accumulator through. Returns ``{}`` when fewer than two
    revisions are found.
    """
    assert fromid != toid
    accumulated = {}
    revisions = get_revisions_between_ids(title, fromid, toid)
    for previous_rev, current_rev in zip(revisions, revisions[1:]):
        diff = get_revision_diff(title, previous_rev["revid"], current_rev["revid"])
        accumulated = parse_diff([previous_rev, current_rev], diff, accumulated)
    return accumulated

# Look up the ids of the first and last of 10 "first"-ordered Talk:BMW revisions.
a = get_n_revisions("Talk:BMW", 10, "first")
print("from:",a[0]["revid"], "to:",a[-1]["revid"])

# The hard-coded ids below are the ones printed by the line above in the
# recorded output; process that revision range and dump the accumulator.
res = process_between_revisions("Talk:BMW", 2598506, 9670552)
print_accum(res)

from: 2598506 to: 9670552
hash_lookup----------------
627a7e73df2711c1638b440538240dbc: 627a7e73df2711c1638b440538240dbc
baea044d32c787353f49b168a215e098: baea044d32c787353f49b168a215e098
0c0994fbd69e70162dfec4d84720a6ce: 0c0994fbd69e70162dfec4d84720a6ce
18638d41e27ba80935fc902c6cda04b4: 18638d41e27ba80935fc902c6cda04b4
8f1b8edc1950d2fbc708744ed21998d5: 8f1b8edc1950d2fbc708744ed21998d5
e7aa9ebae9200b3f07a0d71015a86f25: e7aa9ebae9200b3f07a0d71015a86f25
fd7c613608c2b8d0374eafe7f1315e5d: fd7c613608c2b8d0374eafe7f1315e5d
84a722f8aa83d758ba184693b5bf18ab: 84a722f8aa83d758ba184693b5bf18ab
6bc819c14a4fd9ca1f2c4615d76afe7e: 6bc819c14a4fd9ca1f2c4615d76afe7e
23153fe236975b6406002d4fcf2fe58f: 23153fe236975b6406002d4fcf2fe58f
fa8393644a675f496550df1b4284f005: fa8393644a675f496550df1b4284f005
e72d7bfdfb81cbec2b02aec454621144: e72d7bfdfb81cbec2b02aec454621144
7bd335b38f376df2c6091013b63ff309: 7bd335b38f376df2c6091013b63ff309
00405040d54fbbe70ff606b2b56cedd4: 00405040d54fbbe70ff606b2b56cedd4
31c4fd35

[Badly behaved: commenting and creating new section after the comment](https://en.wikipedia.org/w/index.php?title=Talk:Philosophy&diff=next&oldid=3112956)


In [19]:
from convokit import Corpus, User, Utterance
from collections import deque

In [85]:
# # not in use
def segment_contiguous_blocks(accum: dict, reply_chain: list) -> list:
    """Split a reply chain into runs of consecutive blocks written by the same user.

    Args:
        accum: accumulator providing "hash_lookup" (via ``find_ultimate_hash``)
            and "blocks".
        reply_chain: block hashes of a thread, in chain order.

    Returns:
        A list of segments; each segment is a list of resolved block hashes
        (current keys of ``accum["blocks"]``) whose blocks share one user.
    """
    if len(reply_chain) == 1:
        return [[find_ultimate_hash(accum, reply_chain[0])]]
    res = []
    last_h = find_ultimate_hash(accum, reply_chain[0])
    last_user = accum["blocks"][last_h]["user"]
    contig = [last_h]
    for block in reply_chain[1:]:
        this_h = find_ultimate_hash(accum, block)
        this_user = accum["blocks"][this_h]["user"]
        if this_user == last_user:
            # Store the resolved hash, as every other branch does. Storing the
            # raw chain entry left stale hashes in the segment once a block had
            # been modified, which no longer exist in accum["blocks"].
            contig.append(this_h)
        else:
            res.append(contig)
            contig = [this_h]
        last_user = this_user
    # contig always holds at least the most recent block at this point.
    res.append(contig)
    return res

def string_of_seg(seg: list) -> str:
    """Join the hashes of a segment into one space-separated string."""
    separator = ' '
    return separator.join(seg)

def print_segments(segments: list) -> None:
    """Print all segments on one line: 8-character hash prefixes, with "| " closing each segment."""
    rendered = "".join(
        "".join(block_hash[:8] + " " for block_hash in seg) + "| "
        for seg in segments
    )
    print(rendered)

def compute_utt_id_from_block_hashes(hashes: list, accum: dict) -> str:
    """Return the utterance id for a group of block hashes: currently the first hash.

    ``accum`` is accepted but not used yet.
    """
    # will improve later for defensibility against edits
    leading_hash = hashes[0]
    return leading_hash
     
def find_reply_to(segment: list) -> str:
    """Return the first hash of the second-to-last segment, or None for a single segment.

    Despite its name, ``segment`` is indexed as a list of segments (each a
    list of hashes).
    """
    if len(segment) != 1:
        previous_segment = segment[-2]
        return previous_segment[0]
    return None
    
def convert_intermediate_to_convokit(accum: dict) -> Corpus:
    """Turn the accumulator built by ``parse_diff`` into a ConvoKit ``Corpus``.

    Each block's reply chain is split into same-user segments. Every segment
    except the last is a complete utterance; the last one counts only if its
    final block is not marked "is_followed". One ``Utterance`` is built per
    complete utterance, with the first block's hash as its id. The corpus
    metadata entry "reverse_block_index" maps every constituent block hash to
    its utterance id.
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}

    # Pass 1: register users and collect the set of complete utterances,
    # each encoded as a space-joined string of block hashes.
    for block_hash, block in accum["blocks"].items():
        author = block["user"]
        if author not in users:
            users[author] = User(name = author)
        segments = segment_contiguous_blocks(accum, block["reply_chain"])

        for earlier_segment in segments[:-1]:
            complete_utterances.add(string_of_seg(earlier_segment))

        final_segment = segments[-1]
        assert(block_hash == final_segment[-1])
        if "is_followed" not in accum["blocks"][final_segment[-1]]:
            complete_utterances.add(string_of_seg(final_segment))
        block_hashes_to_segments[block_hash] = segments

    # Pass 2: build one Utterance per complete utterance.
    utterances = []
    block_hashes_to_utt_ids = {}
    for utt_key in complete_utterances:
        block_hashes = utt_key.split(" ")
        head_hash = block_hashes[0]
        head_segments = block_hashes_to_segments[head_hash]
        head_block = accum["blocks"][head_hash]

        u_user = users[head_block["user"]]
        u_root = head_segments[0][0]
        u_replyto = find_reply_to(head_segments)
        u_timestamp = head_block["timestamp"]
        u_text = "\n".join(accum["blocks"][h]["text"] for h in block_hashes)

        for constituent in block_hashes:
            block_hashes_to_utt_ids[constituent] = head_hash

        # Positional arguments: presumably (id, user, root, reply_to, timestamp,
        # text) in the installed ConvoKit version — TODO confirm.
        this_utterance = Utterance(head_hash, u_user, u_root, u_replyto, u_timestamp, u_text)
        this_utterance.meta = {"constituent_blocks": block_hashes}
        utterances.append(this_utterance)

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids

    return corpus

In [86]:
# Build the ConvoKit corpus from `res`, the accumulator computed in an earlier cell.
c = convert_intermediate_to_convokit(res)

In [87]:
# Show the conversation ids of the corpus (the cell's last expression is displayed).
c.get_conversation_ids()

['6bc819c14a4fd9ca1f2c4615d76afe7e',
 'fa8393644a675f496550df1b4284f005',
 'baea044d32c787353f49b168a215e098',
 'e7aa9ebae9200b3f07a0d71015a86f25',
 '0ffecc1aaed0428a94af5419cf5798f7',
 '0c0994fbd69e70162dfec4d84720a6ce',
 '627a7e73df2711c1638b440538240dbc']

In [90]:
# Print every conversation; each utterance is indented one " - " step further
# than the one before it, and a blank line separates conversations.
for conv in c.iter_conversations():
    indent = ""
    for utt in conv.iter_utterances():
        print(indent + utt.text)
        # The indent grows with iteration order, not with the reply structure.
        indent += " - "
    print("\n")

== Beemers ==
Here in the UK, the word "Beemer" is commonly used about the cars; I've virtually never seen "Bimmer" here. [[User:Loganberry|Loganberry]] 14:36, 30 Oct 2004 (UTC)
 - :Same goes in Australia. - [[User:Vague Rant|Vague]] | [[User talk:Vague Rant|Rant]] 02:20, Dec 19, 2004 (UTC)
 -  - :: Sounds like it's regional, clearly.  In the US, that distinction goes back to the mid 70s, at least.  I'll clarify it. -- [[User:Baylink|Baylink]] 21:26, 19 Dec 2004 (UTC)


== The BMW Bavaria? ==
Maybe I should just do some research and add it myself, but nowhere in the article is the BMW Bavaria mentioned.  This model was the immediate predecessor to the 5-series, and was seen at the time (mid 70s) as combining the features of the 3.0 CS and another model I've forgotten, at a much cheaper price.  My dad had one, but being a toddler at the time my memory's a little fuzzy, and he wouldn't even let me drive it.
[[User:TJSwoboda|TJSwoboda]] 19:03, 7 Nov 2004 (UTC)
 - : The history section cou