---

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import random

In [2]:
class config:
    seed = 42

In [3]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

In [4]:
seed_everything(config.seed)

# 1. Data Description
- id : claim id
- label : 0(normal) / 1(fake)
- published_date : 보도 날짜
- keybert_keywords : kebert를 통해 추출한 기사 키워드
- ner_keywords : ner를 통해 추출한 기사 키워드
- youtube0 ~ youtube9 : 해당 키워드로 유트브에 검색했을 때 나오는 상위 10개의 유투브의 제목, 설명 텍스트

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [7]:
# train data: 3868개의 기사로 구성
train

Unnamed: 0,id,label,claim,published_date,keybert_keywords,ner_keywords,youtube0,youtube1,youtube2,youtube3,youtube4,youtube5,youtube6,youtube7,youtube8,youtube9
0,0,0,Did a Vermont Woman Post a Joke About Trump Br...,2018-01-25,"['trump', 'vermont', 'woman']","['vermont', 'trump', 'woman']",Citizen Trump A talk given at the University ...,News Donald Trump has a great woman problem N...,Trump Supporters Remain Despite Vulgar Slur J...,BREAKING Donald trump has a great woman proble...,Revolutionary War SNL The Patriots from New E...,World News Donald Trump has a great woman prob...,Latest News Donald Trump has a great woman pro...,AM Executive Session Gender Matters The U S Pr...,Trump Discusses Tax Bill Gives ;Total Support ...,Chicago s Women s March January This is what ...
1,1,1,Were Baseball Players Photographed Kneeling to...,2017-11-08,"['lynchings', 'baseball', 's']","['lynchings', 'baseball', 's']",Tony Shalhoub Still ;Monk ; after all these ye...,Lesson Class differences and racial variation ...,Mass Media Mass Culture and the Golden Age of ...,Mark W Bennett quot;Implicit Bias and the Law ...,Chris Hedges Writing as Resistance presented b...,Race in America FOCIS th Anniversary lecture s...,,,,
2,2,0,Wisconsin state Rep. John Nygren hits pay dirt...,2019-02-07,"['tax', 'wisconsin', 'dirt']","['rep', 'wisconsin', 'tax', 'wisconsin']",Pocan Shutdown Waste of Time Energy Pocan Shu...,Lawmaker This couldn ;t be more embarrassing f...,State Rep Bob Behning Creating teacher career ...,US GOV th,Representative John Macco of the th talks the ...,Democratic Senator Calls On Va Gov To Resign M...,Rep Sean Duffy introduces the Reciprocal Trade...,Rep Dan Caulkins on IL Minimum Wage Hike Stat...,Rep Weber Worried More Jobs Will Leave IL Rea...,Foxconn May Not Build B Wisconsin Plant Presid...
3,3,0,Progressive group accuses Senate splinter grou...,2017-11-04,"['donors', 'cashing', 'senate']","['cashing', 'senate', 'donors']",Roy Moore s Attorney Refers To News Anchor ;s ...,Documents that Changed the Way We Live Docume...,Time to Wake Up EPA Nominees Show Blatant Disr...,Училище по персонализирана медицина Панел Учи...,Conference Theme Panel Economics for a New Pro...,,,,,
4,4,0,Seal with Unusual Stripe Pattern Markings Spot...,2016-10-21,"['seal', 'washington', 'markings']","['seal', 'washington', 'markings']",Parking Lot Striping Garland Texas Asphalt st...,Learn about the MAHLE Original Gasket Line Bi...,Austroads Guide to Road Design Part Session of,I Major Deck and Superstructure Rehabilitation...,The Premier Parking Lot Striping Company in Au...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5272,5272,0,Does Hillary Clinton want a $15 or $12 minimum...,2016-04-14,"['hillary', 'clinton']","['hillary clinton', 'hillary', 'clinton']",Hillary Clinton Addresses Her Losing Streak Co...,Stronger Together Hillary Clinton Donald Trum...,Love and Kindness Hillary Clinton Hillary Cli...,Hillary Clinton Interview at The Breakfast Clu...,Hillary Clinton Reveals Bill ;s Most Annoying ...,Stephen Interviews Hillary Clinton At Carnegie...,Hillary Clinton ;s entire Pennsylvania primary...,Hillary Clinton on NY Primary Win FULL SPEECH ...,Thank you New York Hillary Clinton In her New...,Hillary Clinton ;s full New York primary victo...
5273,5273,1,George Soros s son isn t married to Adam Schif...,2019-10-10,"['sister', 'married', 'son']","['sister', 'married', 'son']",Norman Lusts After His Mother Bates Motel Scen...,Cenk ve Cemre Evlendi Şeniz Çıldırdı Zalim İs...,Seven Year Old Mykal Michelle Harris on Mixed ...,My Sister Song CoComelon Nursery Rhymes amp; K...,quot;He annoys me so much quot; Naomi Osaka On...,MA SŒUR SE MARIE JUST MARRIED Lévanah amp;Fami...,Sheikh Mohammed bin Rashid ;s daugher Maryam m...,Amelia and Link Tell Meredith About the Baby G...,Tamron Hall Talks with Her Nephew About the Mu...,Thomas Doherty Juliet Doherty amp; Harry Jarvi...
5274,5274,0,Sanders: It's not impossible to get enough del...,2016-05-01,"['win', 'sanders', 'delegates']","['delegates', 'win', 'sanders']",Bernie Sanders Our Goal Is To Win Pledged Dele...,Sanders looks to win crucial delegates in Indi...,Can Clinton win with pledged delegates alone ...,Clinton Sanders battle for Indiana ;s delegate...,Can Bernie Sanders Win New York Bernie Sander...,Sanders campaign turns focus to superdelegates...,Bernie Sanders Determined to Fight for Every D...,I ;m In It to Win It Bernie Sanders If you wa...,Bernie Sanders Delegate Reacts to New York Pri...,Bernie Sanders Celebrates West Virginia Win NB...
5275,5275,1,Donald Trump Pledged to Rename New Mexico?,2016-02-29,"['trump', 'mexico', 'donald']","['donald', 'mexico', 'trump']",Trump compares border wall to Great Wall of Ch...,How Donald Trump plans to build a US Mexico bo...,How we can build Trump ;s border wall What wo...,Donald Trump says he will make Mexico pay for ...,Donald Trump To Protestors ;Are You From Mexic...,Donald Trump Blames Mexico For Pope Criticism ...,Enrique Peña Nieto Mexico will not pay for Don...,Video Donald Trump supporter tells protestor q...,Mexico accuses Donald Trump of sounding like a...,KIDS REACT TO DONALD TRUMP Kids sit down and ...


In [8]:
# test data: 1856개의 기사로 구성 (test에는 라벨값이 없으므로, 이를 예측하여 submission 해야 함.)
test

Unnamed: 0,id,claim,published_date,keybert_keywords,ner_keywords,youtube0,youtube1,youtube2,youtube3,youtube4,youtube5,youtube6,youtube7,youtube8,youtube9
0,0,According to the CDC so far this year Florida ...,2020-05-27,['pneumonia'],"['florida', 'pneumonia', 'cdc']",Watch Full Coronavirus Coverage May NBC News N...,Watch Full Coronavirus Coverage May NBC News N...,Ask the expert Here ;s why screening is a good...,MORE Questions for an Infectious Disease Speci...,Coronavirus and Cuba,Understanding and Managing Coronavirus in SNF ...,SARS CoV Pandemic Caltech Seminar Day Online S...,CAH Swingbed Initiative with Kerry Dunning Ke...,Diabetics and Novel Coronavirus ▻ Theresa De ...,CDPHE explains how it classifies COVID deaths ...
1,1,Claim that Rep. Alexandria Ocasio-Cortez tweet...,2020-06-23,"['november', 'elections', 'tweeted']","['covid', 'us', 'rep']",Maps show explosive growth of coronavirus in U...,Rep Raja Krishnamoorthi on Continued Fallout o...,Biologist says young people are fueling Covid ...,Former Allergan CEO on the most likely timelin...,Rep DeGette questions top U S health officials...,Rep Kevin McCarthy on Covid spikes reopening t...,Rep Kevin Brady on back to work bonus proposal...,Florida and Texas struggle with surging corona...,What flying in the U S amid the coronavirus pa...,U S Virus Surge Ample Opportunity for Covid to...
2,2,Models projecting COVID-19 deaths are talking ...,2020-05-05,"['deaths', 'mitigation']","['deaths', 'covid', 'mitigation']",President Donald Trump downplays models projec...,Trump keeps predicting coronavirus death tolls...,Trump Claims New Estimate Predicting More Coro...,Is COVID Seasonal View CME information and cl...,The Bubonic Plague and COVID – Two Diseases in...,Coronavirus outbreak B C reports new cases add...,Coronavirus Number of UK deaths decline from p...,Coronavirus model projects deaths in U S to ne...,Modeling COVID Transmission and Containment in...,UK records most Covid deaths in Europe but Raa...
3,3,A video has been viewed tens of thousands of t...,2020-05-02,"['instagram', 'facebook', 'tea']","['covid', 'uk', 'boris johnson']",Coronavirus Boris Johnson updates nation on Co...,Coronavirus How Boris Johnson government ;s po...,Boris Johnson unveils new COVID lockdown rules...,Coronavirus Public challenge Boris Johnson ;s ...,Coronavirus UK now ;past the peak ; says Boris...,UK PM Boris Johnson returns to work after reco...,Boris Johnson We are past the peak of disease ...,In full Boris Johnson says he won ;t lift UK c...,Boris Johnson s new stay alert message splits ...,Watch Back Boris Johnson Addresses UK On State...
4,4,A video shows how Indian police frightened tho...,2020-04-24,"['ambulance', 'police', 'frightened']","['covid', 'indian', 'ambulance']",Watch TN Police put lockdown violators in ambu...,Fake Covid patient TN Police releases awarenes...,TN Police ;Prank ; Lockdown Violators With Fak...,Police drone catches teens violating COVID loc...,India s mobile coronavirus testing van hits Ne...,Ambulance Traffic Jams At Moscow Hospitals As ...,;India ;s Covid recovery rate ; Union govt on ...,Police drone interrupts couple ;s romantic get...,Watch Tamil Nadu Police put lockdown violators...,Pak Reaction on Indian Police Prank Offenders ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851,1851,COMMENTARY: COVID-19 Diary Day 6: Clinicians' ...,2020-04-10,"['clinicians', 'diary']","['clinicians', 'diary']",Adding Services and Products,Adding Clinicians,Acting Director ;s Message April,Adding and Editing Account Users,Advice for New Therapists Subscribe now for m...,COVID Clinical Webinar,Diary of a Researcher Day April Libby Researc...,THE HARDEST SURGERY OPERATION I ;VE EVER EXPER...,Configuring Business Basics,Quick Navigational Tips
1852,1852,Facebook and Instagram posts shared thousands ...,2020-05-25,"['instagram', 'facebook', 'vaccines']","['instagram', 'facebook', 'vaccines']",Ousted vaccine chief testifies Americans deser...,Anti Vaxxers Could Pose Public Health Risk Whe...,Hope in Thailand for a ;cheaper ; coronavirus ...,Coronavirus crisis These four COVID vaccines a...,How Our Lifestyle is Going To Be Transformed A...,How do vaccines work and what are their variou...,CEO Pascal Soriot on COVID vaccine CNN AstraZ...,The Market for Emergency Vaccines Is Like No O...,CR FOR VACCINE DEVELOPMENT STATUS OF ALL VACCI...,Trials show progress on Covid immunity vaccine...
1853,1853,Could an antiparasitic drug kill off SARS-CoV-...,2020-04-21,"['antiparasitic', 'drug', 'kill']","['sars', 'antiparasitic', 'drug']",Truth about the corona curing drug Ivermectin ...,Zinc Ascorbic Acid Steroids Fluids and Ivermec...,Intro to Viruses Antivirals and Vaccines Dr Pa...,Cellular Mechanism of Action of Ivermectin Ant...,EYE PARASITES HOW TO KILL A PARASITE IN THE EY...,Ringworm Tinea Corporis Causes Risk Factors Si...,COVID UPDATE HYDROXYCHLOROQUINE WHEN amp; HOW ...,Could Ivermectin Help Fight COVID Ivermectin ...,La ivermectina ¿Ocurrencia o creencia para tra...,FDA approved drug Ivermectin inhibits replicat...
1854,1854,A video has been have been viewed thousands of...,2020-04-03,"['facebook', 'covid', 'sanitiser']","['covid', 'facebook', 'sanitiser']",Students produce hand sanitizer to fight COVID...,How to Draw a Bottle of Hand Sanitizer Cartoon...,I Can ;t Get No Sanitiser A Covid Parody Famil...,Soap vs Hand Sanitizer Which is best for preve...,Inside a Dubai hand sanitiser factory With th...,Coronavirus Hand sanitiser vs soap which prote...,India Patna installs full body sanitiser tunne...,Coronavirus; Old KSRTC Bus Turned Into Mobile ...,Coronavirus Hand sanitiser accidental poisonin...,COVID APMC vegetable market in Navi Mumbai set...


In [9]:
# test 데이터 내 1856개의 노드에 대해 예측된 라벨 (sample에서는 모두 0으로 초기화되어 있음.)
submission

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1851,1851,0
1852,1852,0
1853,1853,0
1854,1854,0


# 2. Graph Example

In [10]:
# install dgl
# https://www.dgl.ai/pages/start.html

!pip install -q dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.4/52.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [11]:
import numpy as np
from scipy.sparse import coo_matrix
import torch
import dgl

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [12]:
train.head(20)

Unnamed: 0,id,label,claim,published_date,keybert_keywords,ner_keywords,youtube0,youtube1,youtube2,youtube3,youtube4,youtube5,youtube6,youtube7,youtube8,youtube9
0,0,0,Did a Vermont Woman Post a Joke About Trump Br...,2018-01-25,"['trump', 'vermont', 'woman']","['vermont', 'trump', 'woman']",Citizen Trump A talk given at the University ...,News Donald Trump has a great woman problem N...,Trump Supporters Remain Despite Vulgar Slur J...,BREAKING Donald trump has a great woman proble...,Revolutionary War SNL The Patriots from New E...,World News Donald Trump has a great woman prob...,Latest News Donald Trump has a great woman pro...,AM Executive Session Gender Matters The U S Pr...,Trump Discusses Tax Bill Gives ;Total Support ...,Chicago s Women s March January This is what ...
1,1,1,Were Baseball Players Photographed Kneeling to...,2017-11-08,"['lynchings', 'baseball', 's']","['lynchings', 'baseball', 's']",Tony Shalhoub Still ;Monk ; after all these ye...,Lesson Class differences and racial variation ...,Mass Media Mass Culture and the Golden Age of ...,Mark W Bennett quot;Implicit Bias and the Law ...,Chris Hedges Writing as Resistance presented b...,Race in America FOCIS th Anniversary lecture s...,,,,
2,2,0,Wisconsin state Rep. John Nygren hits pay dirt...,2019-02-07,"['tax', 'wisconsin', 'dirt']","['rep', 'wisconsin', 'tax', 'wisconsin']",Pocan Shutdown Waste of Time Energy Pocan Shu...,Lawmaker This couldn ;t be more embarrassing f...,State Rep Bob Behning Creating teacher career ...,US GOV th,Representative John Macco of the th talks the ...,Democratic Senator Calls On Va Gov To Resign M...,Rep Sean Duffy introduces the Reciprocal Trade...,Rep Dan Caulkins on IL Minimum Wage Hike Stat...,Rep Weber Worried More Jobs Will Leave IL Rea...,Foxconn May Not Build B Wisconsin Plant Presid...
3,3,0,Progressive group accuses Senate splinter grou...,2017-11-04,"['donors', 'cashing', 'senate']","['cashing', 'senate', 'donors']",Roy Moore s Attorney Refers To News Anchor ;s ...,Documents that Changed the Way We Live Docume...,Time to Wake Up EPA Nominees Show Blatant Disr...,Училище по персонализирана медицина Панел Учи...,Conference Theme Panel Economics for a New Pro...,,,,,
4,4,0,Seal with Unusual Stripe Pattern Markings Spot...,2016-10-21,"['seal', 'washington', 'markings']","['seal', 'washington', 'markings']",Parking Lot Striping Garland Texas Asphalt st...,Learn about the MAHLE Original Gasket Line Bi...,Austroads Guide to Road Design Part Session of,I Major Deck and Superstructure Rehabilitation...,The Premier Parking Lot Striping Company in Au...,,,,,
5,5,1,Donald Trump Protester Speaks Out I Was Paid 3...,2016-06-06,"['trump', 'rally', 'protest']","['rally', 'protest', 'trump']",Protesters Interrupt Trump Rally in Albuquerqu...,Protests outside Trump Rally in Tampa mostly p...,Trump faces persistent interruptions at New Me...,Protest At Trump Rally CBS s Mary Calvi reports,Donald Trump San Diego rally protest livestrea...,Donald Trump San Diego rally protest livestrea...,Trump gets warm welcome at veterans biker rall...,Trump speaks at Rolling Thunder biker rally in...,Donald Trump Protests at Anaheim Convention Ce...,Dallas police hold crowd management training i...
6,6,1,Are Thieves Stealing License Plates in a Carja...,2016-12-22,"['carjacking', 'thieves', 'stealing']","['carjacking', 'thieves', 'stealing']",Arrested In Intricate Torrance Car Theft Ring ...,Spider Man Movie Uncle Ben ;s Death Scene Movi...,Police warn of uptick in thefts while locked c...,GTA Online How To Set Up an Import Export Car ...,Video year old attempted carjacking suspect sh...,Grand Theft Auto Voice Actors Grand Theft Aut...,Paul Wilbur First Fridays I Believe In Miracle...,DEPUTY ASSAULT VEHICLE RECOVERED Montgomery C...,GTA Vice City Car Jacking GTA vice city,Public Safety Concerns A public safety survey...
7,7,1,Did MSNBC Reporter Say I Hope Coronavirus Kill...,2020-03-16,"['covid', 'msnbc', 'kills']","['covid', 'trump', 'covid', 'msnbc']",Tracking President Trump ;s Response To COVID ...,The Cult Of Trump During Coronavirus All In MS...,Trump Addresses The Nation On Coronavirus From...,Rebuffed Watch Trump s Own Medical Expert Fact...,Joe Biden vs Donald Trump on the COVID Respons...,Trump calls coronavirus ;the Chinese virus ; ...,Donald Trump ;s coronavirus timeline how the P...,Coronavirus crisis could cost Trump the electi...,U S Ill Served By Trump ;s Repeated Lying Abou...,Chris Hayes On Trump s Failing Response To Cor...
8,8,0,Rep. Tom Cole: Half of Oklahoma homes have a gun,2018-03-01,"['gun', 'oklahoma', 'tom']","['rep', 'gun', 'oklahoma']",Tulsa Project General Meeting March th Tulsa ...,Rep Tim Ryan We Need A Firmer Position On Guns...,Rep Josh West Capitol Update District,Golden State Warriors coach Steve Kerr speaks ...,Rep Cole quot;Let ;s just start doing the thin...,Unpacking America ;s perceptions about mass sh...,Punting on gun control is cowardly Andrew Feld...,Pa Church Holds Ceremony with AR Rifles Worsh...,Tennessee Capitol Report February This episod...,There Ought To Be A Law Overcriminalization R...
9,9,1,A claim has circulated on social media that a ...,2020-03-24,"['covid', 'hantavirus', 'virus']","['covid', 'china', 'covid', 'hantavirus']",After Coronavirus Hantavirus Wreaks Havoc In C...,Hantavirus China Here s Why Hantavirus Isn ;t ...,Hantavirus Appears In China Amid COVID Pandemi...,Coronavirus के बाद China में Hantavirus की आफत...,New Covid Like Hantavirus Emerged In China ka...,क्या होता है Hanta Virus जो Corona Virus के बा...,Y DESPUÉS DEL CORONAVIRUS ¡EL HANTAVIRUS Virus...,Hantavirus in China New deadly virus after cor...,Hantavirus after coronavirus in China More Fat...,After Coronavirus Hantavirus Wreaks In China T...


In [13]:
total = pd.concat([train,test])

In [14]:
total = total.reset_index(drop=True)

In [15]:
total['claim'][0]

'Did a Vermont Woman Post a Joke About Trump Bringing Back Slavery ?'

In [16]:
youtube_pool = []
for idx in range(len(total)):
  for i in range(10):
    col_name = 'youtube'+str(i)
    if len(total[col_name][idx])!=1:
      youtube_pool.append(total[col_name][idx])
len(youtube_pool)

68256

In [17]:
len(set(youtube_pool)) #겹치는 비디오 존재

59875

In [18]:
video_list = list(set(youtube_pool))

In [19]:
follow_dst = [] #news
follow_src = [] #video
for idx in range(len(total)):
  for i in range(10):
    col_name = 'youtube'+str(i)
    if len(total[col_name][idx])!=1:
      follow_dst.append(idx)
      follow_src.append(video_list.index(total[col_name][idx]))

In [20]:
len(follow_src), len(follow_dst)

(68256, 68256)

In [21]:
for row in range(len(total)):
  total['ner_keywords'][row] = total['ner_keywords'][row].replace("[","").replace("]","").replace("'","").replace(" ","").split(",")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total['ner_keywords'][row] = total['ner_keywords'][row].replace("[","").replace("]","").replace("'","").replace(" ","").split(",")


In [22]:
share_src = []
share_dst = []
for i, i_keywords in enumerate(total['ner_keywords']):
  for j, j_keywords in enumerate(total['ner_keywords']):
    intersection = list(set(i_keywords)&set(j_keywords))
    if len(intersection)!=0:
      share_src.append(i)
      share_dst.append(j)

In [23]:
len(share_src), len(share_dst)

(3319317, 3319317)

In [24]:
ew = []
for i, i_keywords in enumerate(total['ner_keywords']):
  for j, j_keywords in enumerate(total['ner_keywords']):
    intersection = list(set(i_keywords)&set(j_keywords))
    if len(intersection)>=3:
      ew.append(3)
    elif len(intersection)==2:
      ew.append(2)
    elif len(intersection)==1:
      ew.append(1)
len(ew)

3319317

In [25]:
hetero_graph = dgl.heterograph({
    ('video', 'follow', 'news'): (follow_src,follow_dst), 
    ('news', 'followed_by', 'video'): (follow_dst,follow_src),
    ('news', 'share_keyword_with', 'news'): (share_src, share_dst)
    })

In [26]:
hetero_graph = hetero_graph.to(device)

In [27]:
hetero_graph

Graph(num_nodes={'news': 7133, 'video': 59875},
      num_edges={('news', 'followed_by', 'video'): 68256, ('news', 'share_keyword_with', 'news'): 3319317, ('video', 'follow', 'news'): 68256},
      metagraph=[('news', 'video', 'followed_by'), ('news', 'news', 'share_keyword_with'), ('video', 'news', 'follow')])

In [28]:
pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [29]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, models

news_sentences = total['claim'].tolist()

se_model = SentenceTransformer('Pavankalyan/Sentence_embedding_fine-tuned')

news_embeddings = se_model.encode(news_sentences)
video_embeddings = se_model.encode(video_list)

Downloading (…)b926c/.gitattributes:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7938bb926c/README.md:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Downloading (…)38bb926c/config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)b926c/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading (…)7938bb926c/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)8bb926c/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [30]:
news_embeddings.shape, video_embeddings.shape

((7133, 768), (59875, 768))

In [31]:
hetero_graph.nodes['news'].data['feature'] = torch.FloatTensor(news_embeddings).to(device)
hetero_graph.nodes['video'].data['feature'] = torch.FloatTensor(video_embeddings).to(device)

In [32]:
total['label'] = total['label'].fillna(0)
hetero_graph.nodes['news'].data['label'] = torch.LongTensor(total['label'].tolist()).to(device)

In [33]:
#Adding weight features
hetero_graph.edges['share_keyword_with'].data['weight'] = torch.Tensor(ew).to(device)

In [34]:
randint = np.random.randint(0,len(train),int(0.8*len(train)))

In [35]:
len(randint)

4221

In [36]:
train_mask = []
val_mask = []
test_mask = []
for n in range(len(train)):
  if n in randint:
    train_mask.append(True)
    val_mask.append(False)
    test_mask.append(False)
  else:
    train_mask.append(False)
    val_mask.append(True)
    test_mask.append(False)
for t in range(len(train),len(total)):
  train_mask.append(False)
  val_mask.append(False)
  test_mask.append(True)

In [37]:
len(train_mask), len(val_mask), len(test_mask)

(7133, 7133, 7133)

In [38]:
hetero_graph.nodes['news'].data['train_mask'] = torch.BoolTensor(train_mask).to(device)
hetero_graph.nodes['news'].data['val_mask'] = torch.BoolTensor(val_mask).to(device)
hetero_graph.nodes['news'].data['test_mask'] = torch.BoolTensor(test_mask).to(device)

In [39]:
# Define a Heterograph Conv model
import torch.nn as nn
from dgl.nn import GraphConv, SAGEConv, GATConv, HeteroGraphConv

class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = HeteroGraphConv({
            rel: SAGEConv(in_feats, hid_feats,aggregator_type='mean')
            for rel in rel_names}, aggregate='mean')
        
        self.conv2 = HeteroGraphConv({
            rel: SAGEConv(hid_feats, out_feats, aggregator_type='mean')
            for rel in rel_names}, aggregate='mean')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [40]:
n_hetero_features = 768
hidden_feats = 64         
num_classes = 2
lr = 0.001
num_epochs = 200

In [41]:
model = RGCN(n_hetero_features, hidden_feats, num_classes, hetero_graph.etypes).to(device)

In [42]:
# 학습에 필요한 데이터 할당 (목적: 뉴스 클래스(허위) 분류)
news_feats = hetero_graph.nodes['news'].data['feature']
video_feats = hetero_graph.nodes['video'].data['feature']
labels = hetero_graph.nodes['news'].data['label']
edge_weight = hetero_graph.edges['share_keyword_with'].data['weight']
train_mask = hetero_graph.nodes['news'].data['train_mask']
val_mask = hetero_graph.nodes['news'].data['val_mask']
test_mask = hetero_graph.nodes['news'].data['test_mask']

In [43]:
hetero_graph.device, news_feats.device, video_feats.device

(device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0))

In [44]:
import torch.nn.functional as F

In [45]:
node_features = {'news': news_feats, 'video': video_feats}
h_dict = model(hetero_graph, {'news': news_feats, 'video': video_feats})
h_news = h_dict['news']
h_video = h_dict['video']

  assert input.numel() == input.storage().size(), "Cannot convert view " \


In [46]:
#logits = model(hetero_graph, node_features)['news']

In [47]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    # forward propagation by using all nodes and extracting the user embeddings
    logits = model(hetero_graph, node_features)['news']
    
    # compute loss
    train_loss = criterion(logits[train_mask], labels[train_mask])
    if epoch % 10 == 0:
        print(train_loss.item())
        
    # backward propagation
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # evaluation
    num_correct = 0
    num_tests = 0
    if epoch % 10 == 0:
      model.eval()
      with torch.no_grad():
        logits = model(hetero_graph, node_features)['news']
        val_loss = criterion(logits[val_mask], labels[val_mask])
        acc = (logits[val_mask].argmax(dim=1) == labels[val_mask]).float().mean().item()
        print( "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
        epoch, val_loss.item(), acc))

0.7912108302116394
Epoch 00000 | Loss 1.1095 | Accuracy 0.5044 
0.646106481552124
Epoch 00010 | Loss 0.5972 | Accuracy 0.6961 
0.5646876692771912
Epoch 00020 | Loss 0.5690 | Accuracy 0.7049 
0.511517345905304
Epoch 00030 | Loss 0.5450 | Accuracy 0.7244 
0.47302794456481934
Epoch 00040 | Loss 0.5332 | Accuracy 0.7343 
0.43584954738616943
Epoch 00050 | Loss 0.5265 | Accuracy 0.7422 
0.4001714289188385
Epoch 00060 | Loss 0.5203 | Accuracy 0.7468 
0.3650600016117096
Epoch 00070 | Loss 0.5192 | Accuracy 0.7480 
0.33039599657058716
Epoch 00080 | Loss 0.5188 | Accuracy 0.7493 
0.29660746455192566
Epoch 00090 | Loss 0.5210 | Accuracy 0.7455 
0.26406291127204895
Epoch 00100 | Loss 0.5250 | Accuracy 0.7430 
0.23326647281646729
Epoch 00110 | Loss 0.5310 | Accuracy 0.7372 
0.2048458307981491
Epoch 00120 | Loss 0.5393 | Accuracy 0.7385 
0.1792113482952118
Epoch 00130 | Loss 0.5488 | Accuracy 0.7410 
0.15643200278282166
Epoch 00140 | Loss 0.5593 | Accuracy 0.7426 
0.13648772239685059
Epoch 00150 | L

In [48]:
model.eval()
with torch.no_grad():
    logits = model(hetero_graph, node_features)['news']

In [49]:
logits[test_mask]

tensor([[ 3.3047,  1.8273],
        [ 2.4886,  3.2032],
        [ 2.4301,  2.3945],
        ...,
        [ 4.0169,  0.6004],
        [-0.3452,  7.1576],
        [ 0.6187,  5.1358]], device='cuda:0')

In [50]:
len(logits[test_mask])

1856

In [51]:
result = logits[test_mask].argmax(dim=1).cpu()

In [52]:
submission['label'] = result
submission['label'].value_counts()

0    932
1    924
Name: label, dtype: int64

In [53]:
submission.to_csv("s20.csv", index=False)