In [1]:
import bisect
import numbers
import re
from collections import Counter

import dateutil.parser
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the crawl snapshots and order them per URL by capture time, so that
# consecutive rows of one URL are consecutive captures.
df = pd.read_csv("res/news_temp.csv").sort_values(["original_url", "timestamp"])
df.describe()

Unnamed: 0,timestamp,last_modified,norvegica_score,language.text_bytes_found,language.details.0.percent,language.details.0.score,language.details.1.percent,language.details.1.score,language.details.2.percent,language.details.2.score,...,regex.county.unique,regex.county.total,regex.name.unique,regex.name.total,regex.norway.unique,regex.norway.total,regex.kroner.unique,regex.kroner.total,regex.email.unique,regex.email.total
count,34851.0,2887.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,...,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0,34851.0
mean,1592161000.0,1588054000.0,0.937802,7182.514964,97.593268,851.179306,0.471005,86.024131,0.016728,5.202519,...,0.964563,2.746291,1.623856,2.161746,0.891337,2.287653,0.716077,0.911882,1.268744,1.377608
std,127978.3,12309700.0,0.093707,8866.323079,8.257668,114.749371,1.888282,246.981003,0.222755,60.176088,...,2.053329,5.59447,2.635788,3.850347,1.036043,4.701284,0.992101,1.459176,2.467516,2.594657
min,1591942000.0,1538043000.0,0.159104,43.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1592050000.0,1591986000.0,0.928206,3000.0,99.0,803.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1592161000.0,1592118000.0,0.958144,4400.0,99.0,859.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,1592273000.0,1592240000.0,0.975458,8996.0,99.0,904.0,0.0,0.0,0.0,0.0,...,1.0,3.0,2.0,3.0,1.0,2.0,1.0,1.0,2.0,2.0
max,1592381000.0,1592389000.0,0.997255,118874.0,99.0,3390.0,48.0,1752.0,6.0,1098.0,...,37.0,60.0,30.0,39.0,5.0,47.0,11.0,15.0,22.0,23.0


In [61]:
def convert_dt(dt, offset_seconds=7200):
    """Normalise a ``last_modified`` value to integer epoch seconds.

    Parameters
    ----------
    dt : number, str or missing value
        Real numbers (including NumPy scalars) are treated as epoch seconds.
        Anything else is stringified and handed to ``dateutil``.
    offset_seconds : int, default 7200
        Subtracted from numeric inputs only; parsed strings are not shifted.
        NOTE(review): 7200 s looks like a UTC+2 correction for the numeric
        values in the dataset -- confirm against the data source.

    Returns
    -------
    int or None
        Epoch seconds, or ``None`` when ``dt`` is missing or cannot be parsed.
    """
    if pd.isna(dt):
        return None
    # numbers.Real also matches NumPy scalars such as np.int64, which are not
    # instances of the builtin ``int`` and would otherwise be stringified and
    # pushed through the date parser below.
    if isinstance(dt, numbers.Real):
        return int(dt) - offset_seconds

    # Remove Norwegian weekday names/abbreviations before parsing
    # (presumably because dateutil's default parser only knows English ones).
    cleaned = re.sub("(man|tirs?|ons|tors?|fre|lør|søn)(dag)?", "", str(dt))
    try:
        # NOTE(review): for a string without timezone info, .timestamp() on the
        # resulting naive datetime is interpreted in the machine's local zone.
        return int(dateutil.parser.parse(cleaned).timestamp())
    except Exception:
        # Deliberately best-effort: header values come from arbitrary sites,
        # so any unparseable value is treated as "no usable timestamp".
        return None

def jaccard(c0, c1):
    """Multiset Jaccard similarity of two ``Counter`` objects.

    The intersection size is the sum of element-wise minimum counts
    (``c0 & c1``); the union size is ``|c0| + |c1| - |intersection|``.

    Returns a float in ``[0, 1]``. Two empty multisets are considered
    identical and yield ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    shared = sum((c0 & c1).values())
    union = sum(c0.values()) + sum(c1.values()) - shared
    if union == 0:
        # Nothing on either side: no difference to report.
        return 1.0
    return shared / union

def check_lm(group):
    """Score ``last_modified`` as a change detector for one snapshot series.

    Consecutive rows of ``group`` are compared pairwise. The content counts
    as changed when the bag-of-words Jaccard similarity of the two ``text``
    values is below 1. The header counts as signalling a change when the
    later row's converted ``last_modified`` lies strictly between the two
    rows' ``timestamp`` values.

    Returns the tuple ``(tp, fp, tn, fn)``.
    """
    bags = [Counter(re.split("\\W+", str(body))) for body in group.text]

    # (content changed, header signalled) -> index into the result tuple.
    slot = {
        (True, True): 0,    # tp
        (False, True): 1,   # fp
        (False, False): 2,  # tn
        (True, False): 3,   # fn
    }
    tally = [0, 0, 0, 0]

    for pos in range(len(group) - 1):
        earlier = group.iloc[pos]
        later = group.iloc[pos + 1]

        seen_before = earlier.timestamp
        seen_after = later.timestamp
        # Only the later snapshot's header is consulted.
        modified = convert_dt(later.last_modified)
        similarity = jaccard(bags[pos], bags[pos + 1])

        changed = bool(similarity < 1)
        signalled = bool(modified and seen_before < modified < seen_after)
        tally[slot[changed, signalled]] += 1

    return tuple(tally)

def _strip_weak_prefix(etag):
    """Return ``etag`` as a string without a leading weak-validator ``W/``."""
    value = str(etag)
    # Only the prefix marks a weak validator; a "W/" inside the opaque tag
    # (e.g. in a base64 payload) is part of the tag and must be kept.
    return value[2:] if value.startswith("W/") else value


def check_etag(group):
    """Score the ``etag`` column as a change detector for one snapshot series.

    Consecutive rows of ``group`` are compared pairwise. The content counts
    as changed when the bag-of-words Jaccard similarity of the two ``text``
    values is below 1. The header counts as signalling a change when the two
    ETags differ after removing a leading ``W/`` marker.

    Missing ETags are stringified with ``str()``, so two missing values
    compare equal and count as "no change signalled".

    Returns the tuple ``(tp, fp, tn, fn)``.
    """
    counters = [Counter(re.split("\\W+", str(txt))) for txt in group.text]
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(len(group) - 1):
        et0 = _strip_weak_prefix(group.iloc[i].etag)
        et1 = _strip_weak_prefix(group.iloc[i + 1].etag)

        sim = jaccard(counters[i], counters[i + 1])

        if sim < 1:  # Content changed between the two snapshots
            if et0 != et1:
                tp += 1
            else:
                fn += 1
        else:
            if et0 != et1:
                fp += 1
            else:
                tn += 1
    return tp, fp, tn, fn

def plot_lm(site, group):
    """Plot content similarity over time with ``last_modified`` markers.

    Draws, against ``group.timestamp``, the Jaccard similarity of each
    snapshot's ``text`` to the previous one (the first point is fixed at 1)
    as a red line, plus one vertical line per distinct, convertible,
    non-zero ``last_modified`` value.

    Returns ``(fig, n_markers)``. The figure is not closed here.
    """
    fig, ax = plt.subplots()
    ax.set_title(site)

    bags = [Counter(re.split("\\W+", str(body))) for body in group.text]
    similarity = [1]
    for previous, current in zip(bags, bags[1:]):
        similarity.append(jaccard(previous, current))

    # Keep only the header values that convert to a truthy timestamp.
    marks = []
    for raw in group.last_modified.unique():
        stamp = convert_dt(raw)
        if stamp:
            marks.append(stamp)
    for stamp in marks:
        ax.axvline(stamp)

    ax.plot(group.timestamp, similarity, color="red")
    return fig, len(marks)



In [62]:
import random

# Running confusion-matrix totals over all sites, ordered (tp, fp, tn, fn).
tots = [0, 0, 0, 0]
for wp, group in df.groupby("original_url"):
    # Only score sites where every snapshot has a last_modified value.
    if not group.last_modified.notna().all():
        continue
    cg = check_etag(group)
    for i, count in enumerate(cg):
        tots[i] += count
    # One progress line per scored site: running totals, site counts, URL.
    print(tots, cg, wp)


[0, 0, 78, 44] (0, 0, 78, 44) http://drm24.no/
[71, 51, 78, 44] (71, 51, 0, 0) http://e24.no/
[71, 51, 200, 44] (0, 0, 122, 0) http://www.byavisatonsberg.no/
[97, 57, 290, 44] (26, 6, 90, 0) http://www.dagen.no/
[99, 57, 410, 44] (2, 0, 120, 0) http://www.dinepenger.no/
[217, 58, 412, 44] (118, 1, 2, 0) http://www.fanaposten.no/
[217, 58, 533, 45] (0, 0, 121, 1) http://www.friheten.no/
[217, 58, 577, 123] (0, 0, 44, 78) http://www.gausdolen.no/
[217, 58, 648, 123] (0, 0, 71, 0) http://www.norwaypost.com/
[217, 58, 770, 123] (0, 0, 122, 0) http://www.solungavisa.no/
[241, 152, 774, 123] (24, 94, 4, 0) http://www.ukeavisenledelse.no/
[241, 152, 872, 147] (0, 0, 98, 24) http://www.utrop.no/
[241, 152, 874, 267] (0, 0, 2, 120) http://www.vg.no/
[343, 153, 892, 267] (102, 1, 18, 0) http://www.ytringen.no/
[371, 178, 961, 267] (28, 25, 69, 0) https://morgenbladet.no/
[371, 178, 1082, 268] (0, 0, 121, 1) https://nyttiuka.no/
[371, 178, 1140, 332] (0, 0, 58, 64) https://opp.no/
[395, 272, 1144