In [1]:
from pathlib import Path
import random

import pandas as pd
import statsmodels.formula.api as smf

from story_construction import MilvusClient, download_news, filter_event_from_important_sources

In [2]:
milvus_client = MilvusClient(reset_ds=False)
milvus_client.collection.load()

In [3]:
def load_df(df_name):
    df = pd.read_csv(df_name)
    return filter_event_from_important_sources(df)

In [4]:
event_meta_df01 = load_df("20220201.csv")
event_meta_df02 = load_df("20220202.csv")
event_meta_df03 = load_df("20220203.csv")

In [5]:
history_df = pd.concat([event_meta_df01, event_meta_df02], ignore_index=True)

In [6]:
all_df =  pd.concat([event_meta_df01, event_meta_df02, event_meta_df03], ignore_index=True)

In [53]:
all_df.DATEADDED

0       20220201000000
1       20220201000000
2       20220201000000
3       20220201000000
4       20220201000000
             ...      
4049    20220203234500
4050    20220203234500
4051    20220203234500
4052    20220203234500
4053    20220203234500
Name: DATEADDED, Length: 4054, dtype: int64

In [7]:
def search(history_df, search_df, milvus_client):
    search_ids = search_df.GLOBALEVENTID.to_list()
    history_ids = history_df.GLOBALEVENTID.to_list()
    data = milvus_client.query(f"global_event_id in {search_ids}", ["global_event_id", "embeddings"])
    embeddings2search = [x['embeddings'] for x in data]
    
    search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
    results = milvus_client.collection.search(
        data=embeddings2search, 
        anns_field="embeddings", 
        param=search_params, 
        limit=10, 
        expr=f"global_event_id in {history_ids}",
        consistency_level="Strong")
    
    return data, results

In [8]:
data, results = search(history_df, event_meta_df03, milvus_client)

In [10]:
def hypo_test(eid0, eids, milvus_client):
    eids = [eid for eid in eids if eid != eid0]
    data0 = milvus_client.query(f"global_event_id == {eid0}", ["embeddings"])
    data = milvus_client.query(f"global_event_id in {eids}", ["embeddings"])
    data0 = [x['embeddings'] for x in data0]
    data = [x['embeddings'] for x in data]
    
    columns = [f"embd{i}" for i in range(len(data) + 1)]
    df = pd.DataFrame({
        column: (data0 + data)[i] for i, column in enumerate(columns)
    })
    expr = f"embd0 ~ {' + '.join(columns[1:])}"
    results = smf.ols(expr, data=df).fit()
    
    return results

In [11]:
def print_event_title(global_event_id):
    root_dir = Path("/Volumes/Extreme SSD/news_archive")
    with open(root_dir/f"{global_event_id}.txt") as f:
        title = f.read().strip()
    print(f"{global_event_id}: {title}")

def see(data, results, event_meta_df, milvus_client):
    idx = random.randint(0, len(data)-1)
    eid0 = data[idx]["global_event_id"]
    print_event_title(eid0)
    print("-.-.-"*3)
    hit = results[idx]
    for i, eid in enumerate(hit.ids):
        if eid == eid0:
            continue
        print_event_title(eid)
        print(f"dis: {hit.distances[i]}")
        print("---"*3)
    ht_results = hypo_test(eid0, hit.ids, milvus_client)
    print(ht_results.summary())
    
    return event_meta_df[event_meta_df.GLOBALEVENTID.isin(list(hit.ids))], ht_results

In [13]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026514414: GOP senators introduce bill to prevent TSA from accepting arrest warrants as valid forms of ID
-.-.--.-.--.-.-
1026084808: Hawley seeks answers on ‘unacceptable’ TSA policy of allowing illegal immigrants to use warrants as ID
dis: 2.0483083724975586
---------
1026301087: Appeals court may overturn order keeping Burr search warrant secret
dis: 1.6962199211120605
---------
1026161419: Jan. 6 Panel Examining Trump’s Role in Proposals to Seize Voting Machines
dis: 1.5410571098327637
---------
1026290896: January 6 committee member says Trump 'absolutely' tampered with witnesses by dangling pardons for riot defendants
dis: 1.516289472579956
---------
1025985238: Teen rejects Elon Musk's $5,000 offer to shut down jet tracker
dis: 1.5035197734832764
---------
1026077028: The defense secretary tells Republican governors: National Guard troops must be vaccinated.
dis: 1.4908534288406372
---------
1026264693: Hawley calling on Biden admin to drop support for Ukraine's eventual member

In [14]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026500524: LIVE BLOG | Winter storm in Northeast Ohio: Real-time updates and traffic conditions as snow impacts the region
-.-.--.-.--.-.-
1026142618: Winter Storm Landon could cover more than 2,000 miles and at least 19 states. How are ODOT, FirstEnergy prepping?
dis: 1.4067776203155518
---------
dis: 1.338958978652954
---------
1026037233: Winter storm might bring over 9 inches of snow to Cleveland area, but other parts of Ohio could see more
dis: 1.3108537197113037
---------
1026166294: Kansas City district cancels school ahead of 'unpredictable winter weather'
dis: 1.2975029945373535
---------
1026155030: As snowstorm bears down, some schools cancel Wednesday classes; Gov. J.B. Pritzker declares disaster, activates National Guard in central Illinois
dis: 1.2007393836975098
---------
1026238564: Budowsky: Robert Kennedy for president in 2024
dis: 1.1931960582733154
---------
dis: 1.183380126953125
---------
1026243273: Airlines Cancel Flights As Winter Storm Brings Freezing Rain, S

In [16]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026361858: Iran FM congratulates new Dutch top diplomat
-.-.--.-.--.-.-
1026177158: Iran FM hopes success for China’s Winter Olympics
dis: 2.639068603515625
---------
1025985661: Iran, Pakistan reiterate formation of inclusive government in Afghanistan
dis: 1.6637144088745117
---------
1026184660: Iran and Australia discuss Afghanistan crisis
dis: 1.599618911743164
---------
1025990058: Iran senior diplomat discusses Yemen with UN envoy
dis: 1.588987112045288
---------
1026290896: January 6 committee member says Trump 'absolutely' tampered with witnesses by dangling pardons for riot defendants
dis: 1.5589346885681152
---------
1026029955: Amnesty joins other rights group in condemning Israeli 'apartheid'
dis: 1.2628040313720703
---------
1026053625: Who was Walther Rathenau, Germany's only Jewish cabinet minister?
dis: 1.2452278137207031
---------
1026331004: New Zealand reveals dates for opening borders in stages
dis: 1.2418075799942017
---------
1026087755: Wray denies FBI tougher o

In [17]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026483901: Biden says ISIS leader died by suicide bomb in 'desperate act of cowardice,' vows to hunt terrorists down
-.-.--.-.--.-.-
1026318053: National Archives says it will turn over Mike Pence's records to January 6 panel
dis: 2.432819366455078
---------
1026086606: Anders Behring Breivik, Killer in 2011 Norway Massacre, Is Denied Parole
dis: 2.202904224395752
---------
1025987385: Some Trump Documents Given To Jan. 6 Committee Had Been Torn Up
dis: 2.1819651126861572
---------
1026113527: Norway court rejects mass killer Breivik’s parole request
dis: 2.149075508117676
---------
1026218953: US to send destroyer, fighter jets to UAE amid Houthi attacks
dis: 2.0854573249816895
---------
1026331436: Pence documents to be turned over to Jan. 6 committee, National Archives says
dis: 2.0169668197631836
---------
1025989131: Feds Charge North Carolina Man With Teaching How to Make Bombs, Kill Law Enforcement
dis: 2.0058412551879883
---------
1026269017: Russia crisis exposes deep divide 

In [18]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026415802: ‘And Just Like That’ Season Finale Recap: Stiletto on the Other Foot
-.-.--.-.--.-.-
1026011598: Analysis: January 6 committee uses Pence team to try to penetrate Trump's West Wing
dis: 0.9946279525756836
---------
1026106445: Gigi Hadid to Co-Host Netflix's "Next in Fashion" Season 2
dis: 0.9669896960258484
---------
1025960516: Marc Short: Mike Pence's former chief of staff testifies in House January 6 investigation
dis: 0.9484429955482483
---------
1026181414: Whoopi Goldberg suspended from "The View" after saying the Holocaust was "not about race"
dis: 0.9464564323425293
---------
1026061344: Whoopi Goldberg suspended from "The View" after saying the Holocaust was "not about race"
dis: 0.9464564323425293
---------
1026318053: National Archives says it will turn over Mike Pence's records to January 6 panel
dis: 0.8634570837020874
---------
1026142134: White House supports permanent legal status for families separated at border
dis: 0.8494504690170288
---------
1026069885

In [19]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026392138: Mass Shooting On Greyhound Bus In Northern California Leaves 1 Dead, Several Injured
-.-.--.-.--.-.-
1026166010: Removal flights to Colombia spur Venezuelan fears of harsher immigration treatment in the US
dis: 1.5164704322814941
---------
1026323572: Bridgewater College shooting suspect appears in court, arraignment continued to Feb. 16
dis: 1.49434232711792
---------
1026063376: Federal prisons placed on temporary lockdown after gang fight leaves two dead in Texas
dis: 1.4404109716415405
---------
1025990072: No drug smuggling involved in Trooper's death
dis: 1.3480534553527832
---------
1026319972: Autonomous vehicles need stricter rules -U.S. safety group, labor unions
dis: 1.3278107643127441
---------
1026155006: SpaceX tees up next Space Coast launch for Thursday while Astra Space targets Saturday
dis: 1.2930433750152588
---------
1025990779: Scarborough rejects initial plan to bring first Costco to Maine
dis: 1.2321020364761353
---------
1025985225: CDC Warns Against

In [20]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026463749: Another rep who traveled to Ukraine tests positive for COVID-19
-.-.--.-.--.-.-
1025977242: Russia responds in writing to US ahead of Blinken-Lavrov call
dis: 1.7202837467193604
---------
1026168577: House January 6 committee member Jamie Raskin says Trump "said the criminal part out loud"
dis: 1.6490049362182617
---------
1025966002: Wisconsin Lawmakers Ask Feds to Investigate Man's Death at Israeli Checkpoint
dis: 1.6009845733642578
---------
1026161419: Jan. 6 Panel Examining Trump’s Role in Proposals to Seize Voting Machines
dis: 1.5396476984024048
---------
1026175189: House Majority Leader Steny Hoyer has 'mild symptoms' after testing positive for Covid-19
dis: 1.5107861757278442
---------
1026152992: House Majority Leader Steny Hoyer says he tested positive for COVID-19
dis: 1.446639060974121
---------
1026286651: President Biden's Commerce Sec Says She Has No Cannabis Decriminalization Updates On Her Agenda
dis: 1.4214943647384644
---------
1026326555: Jan. 6 select

In [24]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026376219: Ukraine Russia crisis: Kremlin says there was a 'mix-up' in its response to US
-.-.--.-.--.-.-
1026056468: Ukraine Russia crisis: Kremlin says there was a 'mix-up' in its response to US
dis: 3.996917486190796
---------
1026314342: Biden orders forces to Europe amid stalled Ukraine talks
dis: 2.4017114639282227
---------
1026262728: U.S. to move 3,000 troops closer to Ukraine as Russia crisis escalates
dis: 2.275588035583496
---------
1026140161: Putin continues quest to divide NATO over Ukraine
dis: 2.2688066959381104
---------
1026113658: Ukraine announces plan to boost army; US demands Russian de-escalation
dis: 2.2563045024871826
---------
1026318053: National Archives says it will turn over Mike Pence's records to January 6 panel
dis: 2.1783857345581055
---------
1026063899: As Russia sends mixed signals, Ukrainian civilians train for war
dis: 2.1325809955596924
---------
1026081991: Ukraine-Russia crisis: Ceasefire violations rise at contact line
dis: 2.130140066146850

In [27]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026437757: Jason Leitch: Latest Omicron strain should cause ‘mild’ concern
-.-.--.-.--.-.-
1026232933: Pfizer seeks FDA nod for vaccine for children as young as 6 months, and latest studies find omicron may not protect against future infection
dis: 2.514941453933716
---------
1026222898: Covid News: New Zealand Plans to Fully Reopen
dis: 2.0968031883239746
---------
1026223953: Opinion | The Clues to the Next Variant Surge Are All Around Us
dis: 1.968110203742981
---------
1026089485: Extra beds to help overwhelmed hospitals will open at four Pa. sites, including one in Philadelphia
dis: 1.9671580791473389
---------
1026263870: Emirates airline to resume flights to Nigeria on February 5
dis: 1.9542365074157715
---------
1026000625: Woman gives birth during a flight from Africa to the US
dis: 1.8857401609420776
---------
1026056183: Dr. David Agus breaks down new Omicron subvariant BA.2, notes vaccine booster offers "significant protection"
dis: 1.7483699321746826
---------
1026015896:

In [44]:
all_df[all_df.GLOBALEVENTID == 1026142787].SOURCEURL.values

array(['http://www.msn.com/en-us/news/technology/white-house-praises-spotify-s-new-covid-disclaimers/ar-AATnxWo'],
      dtype=object)

In [29]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026435040: Tucker Carlson Launches Ugly Attack On Meghan Markle And Prince Harry
-.-.--.-.--.-.-
1026142787: White House praises Spotify's new Covid disclaimers
dis: 3.0883123874664307
---------
1026076554: Piers Morgan—Forced Out by Meghan Markle—Uses New Column To Attack Her in U.S.
dis: 2.713916301727295
---------
1026325387: Crosby, Stills And Nash Quit Spotify Following Neil Young's Protest Of Joe Rogan
dis: 2.40952730178833
---------
1026227928: 10 things in tech you need to know today
dis: 2.147632598876953
---------
1025985238: Teen rejects Elon Musk's $5,000 offer to shut down jet tracker
dis: 2.116379737854004
---------
1025960084: Piers Morgan, Citing North Korea, Defends Joe Rogan From ‘Cancel Culture Vultures’ Prince Harry and Meghan Markle
dis: 1.9451415538787842
---------
1026258878: Prince Andrew's lawyers to question Virginia Giuffre's 'role in trafficking' minor girls for Jeffrey Epstein
dis: 1.8739584684371948
---------
1026181414: Whoopi Goldberg suspended from "Th

In [30]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026514414: GOP senators introduce bill to prevent TSA from accepting arrest warrants as valid forms of ID
-.-.--.-.--.-.-
1026084808: Hawley seeks answers on ‘unacceptable’ TSA policy of allowing illegal immigrants to use warrants as ID
dis: 2.0483083724975586
---------
1026301087: Appeals court may overturn order keeping Burr search warrant secret
dis: 1.6962199211120605
---------
1026161419: Jan. 6 Panel Examining Trump’s Role in Proposals to Seize Voting Machines
dis: 1.5410571098327637
---------
1026290896: January 6 committee member says Trump 'absolutely' tampered with witnesses by dangling pardons for riot defendants
dis: 1.516289472579956
---------
1025985238: Teen rejects Elon Musk's $5,000 offer to shut down jet tracker
dis: 1.5035197734832764
---------
1026077028: The defense secretary tells Republican governors: National Guard troops must be vaccinated.
dis: 1.4908534288406372
---------
1026264693: Hawley calling on Biden admin to drop support for Ukraine's eventual member

In [32]:
match_df, ht_results = see(data, results, all_df, milvus_client)

1026371890: Erdogan eyes Ukraine summit with Putin on Kyiv visit
-.-.--.-.--.-.-
1026173405: Erdogan seeks payoff from Russia-US clash on Ukraine
dis: 3.0065860748291016
---------
1026134869: Jonah Goldberg: Ukraine’s president may be our only hope
dis: 2.697669506072998
---------
1026113745: Ukraine invasion would be ‘military disaster’ for Russia, warns Boris Johnson
dis: 2.6932146549224854
---------
1026106324: ‘Even schoolchildren are laughing at him’: What the world thinks of Boris Johnson
dis: 2.6563212871551514
---------
1026181494: Royal Navy warship monitors Russian vessels in English Channel
dis: 2.6456356048583984
---------
1026115924: Boris Johnson uses Ukraine trip to urge Russia to ‘step back’
dis: 2.5672030448913574
---------
1026197360: Erdogan seeks payoff from Russia-US clash on Ukraine
dis: 2.469400405883789
---------
1026211909: Ukraine warns of 'full-scale war' and tragedy in Europe if Russia attacks
dis: 2.419156074523926
---------
1026046676: Russia Is Positioned

In [51]:
ht_results.pvalues[ht_results.pvalues <= 0.05].index.to_list()

['embd5', 'embd7', 'embd9']

In [33]:
ht_results.summary()

0,1,2,3
Dep. Variable:,embd0,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.874
Method:,Least Squares,F-statistic:,88.8
Date:,"Tue, 19 Jul 2022",Prob (F-statistic):,8.16e-50
Time:,21:46:08,Log-Likelihood:,190.34
No. Observations:,128,AIC:,-358.7
Df Residuals:,117,BIC:,-327.3
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0005,0.005,0.105,0.917,-0.009,0.011
embd1,-0.0216,0.101,-0.214,0.831,-0.222,0.179
embd2,-0.2404,0.170,-1.412,0.161,-0.578,0.097
embd3,0.0017,0.058,0.030,0.976,-0.112,0.116
embd4,-0.0279,0.072,-0.388,0.699,-0.170,0.115
embd5,0.1728,0.056,3.081,0.003,0.062,0.284
embd6,0.0317,0.073,0.436,0.664,-0.112,0.176
embd7,0.4238,0.202,2.094,0.038,0.023,0.825
embd8,-0.1223,0.081,-1.517,0.132,-0.282,0.037

0,1,2,3
Omnibus:,0.195,Durbin-Watson:,2.281
Prob(Omnibus):,0.907,Jarque-Bera (JB):,0.366
Skew:,-0.032,Prob(JB):,0.833
Kurtosis:,2.746,Cond. No.,52.3


In [24]:
see(data, results, event_meta_df)

1026016979: British ex-pats in Ukraine fear being stranded amid Russian invasion fears
-.-.--.-.--.-.-
1026113658: Ukraine announces plan to boost army; US demands Russian de-escalation
dis: 5.080733299255371
---------
1025981962: Boris Johnson heading to Ukraine as he fights for his premiership after Sue Gray report
dis: 4.723462104797363
---------
1025980586: 'They feel like idiots': UK reacts after lockdown party report exposes Johnson
dis: 4.261681079864502
---------
1026045016: Ukraine focus shifts to diplomacy after bitter public clash between U.S., Russia
dis: 4.15915060043335
---------
1026046644: Russia starts huge war games in Belarus amid fears of war in Ukraine
dis: 4.149078845977783
---------
1026136360: Nurses begin English training after reports of mass failure
dis: 4.069224834442139
---------
1025991252: Russia sends written follow-up to U.S. amid Ukraine negotiations – "formal' response still to come
dis: 3.9621422290802
---------
1026136591: Info from missing 4-year-o

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_ADM2Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
16128,1025980586,20220201,202202,2022,2022.0849,GBR,UNITED KINGDOM,GBR,,,...,4,"Peterborough, Peterborough, United Kingdom",UK,UKK3,40098.0,52.5833,-0.25,-2605287,20220201021500,http://www.msn.com/en-nz/news/world/they-feel-...
16994,1025981962,20220201,202202,2022,2022.0849,,,,,,...,4,"Moscow, Moskva, Russia",RS,RS48,25106.0,55.7522,37.6156,-2960561,20220201023000,https://www.msn.com/en-xl/europe/europe-top-st...
24312,1025991252,20220125,202201,2022,2022.0685,EUR,EUROPEAN,EUR,,,...,1,Ukraine,UP,UP,,49.0,32.0,UP,20220201040000,https://www.cbsnews.com/news/russia-responds-u...
41644,1026016979,20220201,202202,2022,2022.0849,,,,,,...,1,Russia,RS,RS,,60.0,100.0,RS,20220201083000,https://www.msn.com/en-gb/money/other/british-...
57675,1026045016,20220201,202202,2022,2022.0849,GOV,SECURITY COUNCIL,,,,...,2,"New York, United States",US,USNY,,42.1497,-74.9384,NY,20220201120000,https://www.msn.com/en-us/news/world/ukraine-f...
58328,1026046644,20220131,202201,2022,2022.0849,RUS,RUSSIA,RUS,,,...,1,Hungary,HU,HU,,47.0,20.0,HU,20220201121500,https://www.msn.com/en-gb/news/world/russia-st...
76201,1026076554,20220201,202202,2022,2022.0849,GBR,UNITED KINGDOM,GBR,,,...,4,"Sussex, East Sussex, United Kingdom",UK,UKE2,40137.0,50.9167,-0.083333,-2609142,20220201153000,http://www.msn.com/en-us/news/world/piers-morg...
99177,1026113658,20220201,202202,2022,2022.0849,GBR,UNITED KINGDOM,GBR,,,...,4,"Kyiv, Kyyiv, Misto, Ukraine",UP,UP12,28554.0,50.4333,30.5167,-1044367,20220201191500,https://www.msn.com/en-in/news/world/ukraine-a...
113731,1026136360,20220201,202202,2022,2022.0849,,,,,,...,4,"Northern Ireland, Craigavon, United Kingdom",UK,UKR8,40152.0,54.5,-6.5,-2604275,20220201221500,https://www.msn.com/en-xl/africa/other/nurses-...
113962,1026136591,20220201,202202,2022,2022.0849,COP,POLICE,,,,...,2,"Virginia, United States",US,USVA,,37.768,-78.2057,VA,20220201221500,https://www.msn.com/en-us/news/crime/info-from...
