In [1]:
import math
import re
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt

from scipy.optimize import curve_fit, least_squares
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import functools
import multiprocessing

In [2]:
# Endpoints excluded from the analysis; compared after original_url has been stripped.
excluded_urls = {"https://sero.gcloud.api.no:443/", "http://sero.gcloud.api.no/"}

# Load the articles ordered by timestamp, force text to str, trim whitespace
# around the URLs, then drop the excluded endpoints.
news = (
    pd.read_csv("../news_temp.csv")
    .sort_values("timestamp")
    .assign(
        text=lambda frame: frame["text"].astype(str),
        original_url=lambda frame: frame["original_url"].str.strip(),
    )
    .loc[lambda frame: ~frame["original_url"].isin(excluded_urls)]
)
print(len(news))

21841


In [3]:
# Names of the similarity measures, in the same order as the tuple
# returned by similarity_coeffs.
schemes = ["dice", "cosine", "jaccard"]

def similarity_coeffs(A, B):
    """Return the (dice, cosine, jaccard) multiset similarities of two Counters.

    Sizes are the sums of the counts; the overlap is the size of ``A & B``
    (element-wise minimum of the counts).

    Args:
        A: collections.Counter of tokens.
        B: collections.Counter of tokens.

    Returns:
        Tuple ``(dice, cosine, jaccard)`` of floats. All three are 0.0 when
        the two Counters share no tokens, including when either is empty.
    """
    absA = sum(A.values())
    absB = sum(B.values())
    absI = sum((A & B).values())
    if absI == 0:
        # No overlap: every coefficient has a zero numerator. Returning early
        # also avoids dividing by zero when A and/or B is empty, where the
        # plain formulas would raise ZeroDivisionError.
        return 0.0, 0.0, 0.0
    dice = 2 * absI / (absA + absB)
    cosine = absI / math.sqrt(absA * absB)
    jacc = absI / (absA + absB - absI)
    return dice, cosine, jacc

def get_all_diffs(df, adjust=False):
    """Build one symmetric pairwise similarity matrix per scheme for df["text"].

    Each text is lower-cased and split on runs of non-word characters; the
    resulting token Counters are compared pairwise with similarity_coeffs.

    Args:
        df: DataFrame with a "text" column of strings.
        adjust: when True, the token counts shared by every document (the
            intersection of all Counters) are subtracted from each document
            before comparing.

    Returns:
        Dict mapping each name in ``schemes`` to a ``len(df) x len(df)`` float
        array with ones on the diagonal. For an empty frame the arrays are
        0x0.
    """
    size = len(df)
    # np.eye(0) is a valid 0x0 matrix, so no special-casing of the size is
    # needed here (np.eye(None) would raise TypeError).
    diffs = {scheme: np.eye(size, dtype=float) for scheme in schemes}
    if size == 0:
        # Nothing to compare; also keeps reduce() below away from an empty list.
        return diffs

    counters = [Counter(re.split("\\W+", s.lower())) for s in df["text"]]
    if adjust:
        const = functools.reduce(Counter.__and__, counters)
        # Subtract the shared part once per document instead of once per pair.
        counters = [c - const for c in counters]

    for i, c1 in enumerate(counters):
        for j in range(i):
            sims = similarity_coeffs(c1, counters[j])
            for scheme, sim in zip(schemes, sims):
                # Fill both triangles so the matrix stays symmetric.
                diffs[scheme][i, j] = sim
                diffs[scheme][j, i] = sim
    return diffs

def g(x, b, c):
    """Evaluate (1 - c) * b**|x| + c.

    The value is 1 at x == 0 and the curve is symmetric in x. x may be a
    scalar or anything np.abs accepts.
    """
    distance = np.abs(x)
    decay = b ** distance
    return (1 - c) * decay + c

def get_params(diffs, timestamps):
    """Fit g to each similarity matrix as a function of pairwise time lag.

    Args:
        diffs: dict mapping scheme name to a square similarity array whose
            rows/columns follow the order of ``timestamps``.
        timestamps: one numeric timestamp per row of the matrices. Lags are
            divided by 3600 -- presumably seconds to hours; confirm against
            the data source.

    Returns:
        ``(params, fitness)``: two dicts keyed by scheme. ``params[scheme]``
        is the fitted ``[b, c]`` of g (both bounded to [0, 1]) and
        ``fitness[scheme]`` is the R^2 of that fit over all matrix entries.
    """
    stamps = np.asarray(timestamps)
    # Entry (i, j) is timestamps[j] - timestamps[i]. ravel() is row-major, the
    # same order as diff.flatten(), so xg[k] lines up with yg[k]. Broadcasting
    # replaces an n^2 Python-level loop over scalar pairs.
    xg = ((stamps[np.newaxis, :] - stamps[:, np.newaxis]) / 3600).ravel()

    params = {}
    fitness = {}
    for scheme, diff in diffs.items():
        yg = diff.flatten()
        g_params = curve_fit(g, xg, yg, p0=[1, 0], bounds=(0, 1))[0]
        params[scheme] = g_params
        fitness[scheme] = r2_score(yg, g(xg, *g_params))
    return params, fitness

def get_info(website):
    """Gather similarity matrices, fitted parameters and fit quality for one site.

    Selects the rows of the module-level ``news`` frame whose original_url
    equals ``website`` and runs get_all_diffs / get_params on them.

    Returns:
        Dict with one entry per name in ``schemes`` (each a dict with keys
        "diffs", "params" and "fitness") plus a "timestamps" entry holding
        the site's timestamp array.
    """
    site_rows = news[news["original_url"] == website]
    stamps = site_rows["timestamp"].array
    matrices = get_all_diffs(site_rows)
    fitted, scores = get_params(matrices, stamps)

    info = {}
    for scheme in schemes:
        info[scheme] = {
            "diffs": matrices[scheme],
            "params": fitted[scheme],
            "fitness": scores[scheme],
        }
    info["timestamps"] = stamps
    return info

In [None]:
# Run get_info for every site in parallel worker processes.
# The context manager releases the workers even when get_info raises inside
# map(); map() only returns once every result is available, so tearing the
# pool down on exit cannot lose work.
names = news["original_url"].unique()
with multiprocessing.Pool() as pool:
    info_dict = dict(zip(names, pool.map(get_info, names)))
# Sequential equivalent, handy when debugging a failing worker:
# info_dict = {ws: get_info(ws) for ws in names}

In [None]:
# First fitted parameter of the cosine fit (b in g) for every site, shown as a
# kernel density estimate and written to density.png.
vals = [site_info["cosine"]["params"][0] for site_info in info_dict.values()]
# plt.hist(vals)
density_frame = pd.DataFrame(vals, columns=["diff"])
density_frame.plot.kde()
plt.savefig("density.png")

In [None]:
def plot(site, skip, scheme="cosine"):
    """Plot every ``skip``-th similarity row of a site with the fitted curve.

    Args:
        site: key into the module-level ``info_dict``.
        skip: step between the matrix rows that are drawn.
        scheme: which similarity scheme's matrix and parameters to use.

    Blue lines are the measured similarities of the selected articles to all
    articles of the site; orange lines are g with the fitted parameters,
    centred on the same articles. The x axis is the timestamp offset from the
    site's first article, divided by 3600.
    """
    stamps = info_dict[site]["timestamps"]
    stamps = np.array((stamps - stamps[0]) / 3600)
    params = info_dict[site][scheme]["params"]
    diffs = info_dict[site][scheme]["diffs"]
    rows = range(0, len(diffs), skip)

    # One shared x grid for all fitted curves. Building the shifted grid with
    # a second np.arange call can give a different number of samples because
    # of floating-point rounding in the step count, which makes plt.plot fail
    # on mismatched lengths; shifting the same grid cannot.
    grid = np.arange(stamps[0], stamps[-1], 0.01)

    # Draw all measured rows first so the fitted curves end up on top.
    for i in rows:
        plt.plot(stamps, diffs[i], color="blue")
    for i in rows:
        plt.plot(grid, g(grid - stamps[i], *params), color="orange")
    plt.show()
# A dict is iterable but not an iterator, so next() needs an explicit iter()
# to pick the first site key.
plot(next(iter(info_dict)), 12, "cosine")

In [None]:
# Rank the sites by the R^2 of their cosine fit (ties broken by site name) and
# print the ten worst, every 35th from position 10 onwards, and the ten best.
fitness_by_site = [
    (site, site_info["cosine"]["fitness"])
    for site, site_info in info_dict.items()
]
sort_fitness = sorted(fitness_by_site, key=lambda pair: (pair[1], pair[0]))
lowest = sort_fitness[:10]
sampled = sort_fitness[10::35]
highest = sort_fitness[-10:]
print(f"{lowest}\n...\n{sampled}\n...\n{highest}")