In [1]:
#---------------------------------------------------------------------------------
#   Author Name:        Camm Perera             
#   Create Date:        11-15-2022
#   Description:        DSE-203 - Group #5, NFL-CTE Knowledge Graph
#   System specs:       
#        MacOS Monterey   : 12.5.1 
#        Python           : 3.8.13 
#        IPython          : 8.4.0
#        ipykernel        : 6.15.2
#        ipywidgets       : 7.6.5
#        jupyter_client   : 6.1.12
#        jupyter_core     : 4.10.0
#        jupyter_server   : 1.18.1
#        jupyterlab       : 3.4.4
#        nbclient         : 0.5.13
#        nbconvert        : 6.4.4
#        nbformat         : 5.5.0
#        notebook         : 6.4.12
#        qtconsole        : 5.3.2
#        traitlets        : 5.1.1
# #---------------------------------------------------------------------------------

In [2]:
import sys
import py_stringmatching as sm 
import py_entitymatching as em
import pandas as pd
import numpy as np 
import re, string, math, time
import wikipedia
import stanza
import requests
import csv
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Load  Kaggle Dataset

In [3]:
basic_stats_df = em.read_csv_metadata("./Datasets/Basic_Stats.csv" ,key='Player Id')
# career_stats_defensive_df = pd.read_csv("./Datasets/Career_Stats_Defensive.csv", low_memory=False)
# career_stats_field_goal_kickers_df = pd.read_csv("./Datasets/Career_Stats_Field_Goal_Kickers.csv", low_memory=False)
# career_stats_fumbles_df = pd.read_csv("./Datasets/Career_Stats_Fumbles.csv", low_memory=False)
# career_stats_kick_return_df = pd.read_csv("./Datasets/Career_Stats_Kick_Return.csv", low_memory=False)
# career_stats_kickoff_df = pd.read_csv("./Datasets/Career_Stats_Kickoff.csv", low_memory=False)
# career_stats_offensive_line_df = pd.read_csv("./Datasets/Career_Stats_Offensive_Line.csv", low_memory=False)
# career_stats_passing_df = pd.read_csv("./Datasets/Career_Stats_Passing.csv", low_memory=False)
# career_stats_punt_return_df = pd.read_csv("./Datasets/Career_Stats_Punt_Return.csv", low_memory=False)
# career_stats_punting_df = pd.read_csv("./Datasets/Career_Stats_Punting.csv", low_memory=False)
# career_stats_receiving_df = pd.read_csv("./Datasets/Career_Stats_Receiving.csv", low_memory=False)
# career_stats_rushing_df = pd.read_csv("./Datasets/Career_Stats_Rushing.csv", low_memory=False)


Metadata file is not present in the given path; proceeding to read the csv file.


In [4]:
# len(basic_stats_df), len(career_stats_defensive_df), len(career_stats_field_goal_kickers_df), len(career_stats_fumbles_df) \
# ,len(career_stats_kick_return_df), len(career_stats_kickoff_df), len(career_stats_offensive_line_df) ,len(career_stats_passing_df) \
# ,len(career_stats_punt_return_df) ,len(career_stats_punting_df) ,len(career_stats_receiving_df) , len(career_stats_rushing_df)

In [5]:
# # Load Kaggle dataset stored in GitHub repo - DOESN'T WORK for pyentitymatching
# kaggle_data_url = r"https://raw.githubusercontent.com/mona-jandro-camm/dse203/main/Datasets/Basic_Stats.csv"
# basic_stats_df = em.read_csv_metadata(kaggle_data_url)
# basic_stats_df

### Extract Wikipedia Data

In [6]:
wiki_title = 'List of NFL players with chronic traumatic encephalopathy'
wiki_url = 'https://en.wikipedia.org/wiki/List_of_NFL_players_with_chronic_traumatic_encephalopathy'

# Python Wikipedia library
wiki_page_object     = wikipedia.page(wiki_title)

# Python Beautiful Soup
wiki_page = requests.get(wiki_url)
soup = BeautifulSoup(wiki_page.content, "lxml")

# print(soup.prettify())

#### Stanza - stanford NLP

In [7]:
nlp = stanza.Pipeline('en', processors='tokenize,mwt,ner', use_gpu=False, pos_batch_size=3000, download_method=None)  # This sets up a default neural pipeline in English

2022-12-04 11:07:12 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-12-04 11:07:12 INFO: Use device: cpu
2022-12-04 11:07:12 INFO: Loading: tokenize
2022-12-04 11:07:12 INFO: Loading: ner
2022-12-04 11:07:13 INFO: Done loading processors!


#### Process Players Affected Wiki Section

In [8]:
# Lists to store player names by category
players_affected_ls = []

# Wiki-Extract Players affected
players_affected_ls = soup.select('p')[4:8] 

# Set start time to calculate compute time
start_time = time.time()

# Create lists to hold Person lists
affected_players_ls = []

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
for index in players_affected_ls:
    doc = nlp(str(index))                                 

    # Extract PERSON & ORG entities
    for ent in doc.ents:
        if (ent.type =='PERSON'):
            clean_name = re.split('</a', ent.text)[0] 
            affected_players_ls.append(clean_name)
            
# Dedupe list contents
affected_players_ls = [*set(affected_players_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of person: {len(affected_players_ls)}')

Exec time --- 4.169011831283569 seconds ---
# of person: 9


#### Process Former Players affected with CTE Wiki Section

In [9]:
# Lists to store player names by category
former_players_post_mortem_ls = []
pm_former_players_ls = []

# Wiki-Extract Former players with CTE confirmed post-mortem
results = soup.select('ul')[1]
former_players_post_mortem_ls = results.find_all("a")

# Set start time to calculate compute time
start_time = time.time()

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
for index in former_players_post_mortem_ls:
    doc = nlp(str(index))                                 

    # Extract PERSON entities
    for ent in doc.ents:
        if (ent.type =='PERSON'):
            clean_name = re.split('</a', ent.text)[0] 
            pm_former_players_ls.append(clean_name)
            
# Dedupe list contents
pm_former_players_ls = [*set(pm_former_players_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of person: {len(pm_former_players_ls)}')

Exec time --- 19.32221007347107 seconds ---
# of person: 63


#### Process Deceased players suspected of having had CTE Wiki Sction

In [10]:
# Lists to store player names by category
deceased_players_ls = []
suspected_deceased_players_ls = []

# Wiki-Extract Former players with CTE confirmed post-mortem
results = soup.select('ul')[2]
deceased_players_ls = results.find_all("a")

# Set start time to calculate compute time
start_time = time.time()

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
for index in deceased_players_ls:
    doc = nlp(str(index))                                 

    # Extract PERSON entities
    for ent in doc.ents:
        if (ent.type =='PERSON'):
            clean_name = re.split('</a', ent.text)[0] 
            suspected_deceased_players_ls.append(clean_name)
            
# Dedupe list contents
suspected_deceased_players_ls = [*set(suspected_deceased_players_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of person: {len(suspected_deceased_players_ls)}')

Exec time --- 1.506394863128662 seconds ---
# of person: 5


#### Process Living former players diagnosed with CTE or ALS or reporting symptoms consistent with CTE or ALS Wiki Section

In [11]:
# Lists to store player names by category
former_cte_als_players_ls = []
cte_als_former_players_ls = []

# Wiki-Extract Former players with CTE confirmed post-mortem
results = soup.select('ul')[3]
former_cte_als_players_ls = results.find_all("a")

# Set start time to calculate compute time
start_time = time.time()

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
for index in former_cte_als_players_ls:
    doc = nlp(str(index))                                 

    # Extract PERSON entities
    for ent in doc.ents:
        if (ent.type =='PERSON'):
            clean_name = re.split('</a', ent.text)[0] 
            cte_als_former_players_ls.append(clean_name)
            
# Dedupe list contents
cte_als_former_players_ls = [*set(cte_als_former_players_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of person: {len(cte_als_former_players_ls)}')

Exec time --- 9.88235092163086 seconds ---
# of person: 34


#### Process Former players listed as plaintiffs in lawsuits against the NFL for concussion-related injuries received after Wiki playing Section

In [12]:
# Lists to store player names by category
players_lawsuits_nfl_ls = []
players_nfl_lawsuits_ls = []

# Wiki-Extract Former players with CTE confirmed post-mortem
results = soup.select('ul')[4]
players_lawsuits_nfl_ls = results.find_all("a")

# Set start time to calculate compute time
start_time = time.time()

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
for index in players_lawsuits_nfl_ls:
    doc = nlp(str(index))                                 

    # Extract PERSON entities
    for ent in doc.ents:
        if (ent.type =='PERSON'):
            clean_name = re.split('</a', ent.text)[0] 
            players_nfl_lawsuits_ls.append(clean_name)
            
# Dedupe list contents
players_nfl_lawsuits_ls = [*set(players_nfl_lawsuits_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of person: {len(players_nfl_lawsuits_ls)}')

Exec time --- 636.9087760448456 seconds ---
# of person: 1880


### Text Normalization & Preprocessing

In [13]:
# ---------------------------------------------------
# Normalize player "Name" in Kaggle basic stats
# ---------------------------------------------------
basic_stats_df['Clean_Name'] = basic_stats_df.Name.str.lower().map(lambda s: s.split()[1] + ' ' + s.split()[0]).replace('[^\w\s]',' ', regex=True).str.strip()

In [14]:
# ---------------------------------------------------
# Remove punctuations & lower name 
# ---------------------------------------------------
def remove_punc(name):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for ele in name:  
        if ele in punc:  
            names = name.replace(ele, " ") 
    return name.lower().strip()

affected_players_ls  = [remove_punc(i) for i in affected_players_ls]
pm_former_players_ls = [remove_punc(i) for i in pm_former_players_ls]
suspected_deceased_players_ls  = [remove_punc(i) for i in suspected_deceased_players_ls]
cte_als_former_players_ls  = [remove_punc(i) for i in cte_als_former_players_ls]
players_nfl_lawsuits_ls  = [remove_punc(i) for i in players_nfl_lawsuits_ls]

In [15]:
# --------------------------------------------------------
# Create combo dataframe for each list(above) category 
# --------------------------------------------------------
affected_players_df = pd.DataFrame(data=[['affected_players'] * len(affected_players_ls), affected_players_ls]).T
affected_players_df.columns = ['cte_category', 'Clean_Name']

pm_former_players_df = pd.DataFrame(data=[['pm_former_players'] * len(pm_former_players_ls), pm_former_players_ls]).T
pm_former_players_df.columns = ['cte_category', 'Clean_Name']

suspected_deceased_players_df = pd.DataFrame(data=[['suspected_deceased_players'] * len(suspected_deceased_players_ls), suspected_deceased_players_ls]).T
suspected_deceased_players_df.columns = ['cte_category', 'Clean_Name']

cte_als_former_players_df = pd.DataFrame(data=[['cte_als_former_players'] * len(cte_als_former_players_ls), cte_als_former_players_ls]).T
cte_als_former_players_df.columns = ['cte_category', 'Clean_Name']

players_nfl_lawsuits_df = pd.DataFrame(data=[['players_nfl_lawsuits'] * len(players_nfl_lawsuits_ls), players_nfl_lawsuits_ls]).T
players_nfl_lawsuits_df.columns = ['cte_category', 'Clean_Name']

# Combine dataframes
frames = [affected_players_df, pm_former_players_df, suspected_deceased_players_df, cte_als_former_players_df, players_nfl_lawsuits_df]
wiki_cte_players_df = pd.concat(frames)
wiki_cte_players_df

Unnamed: 0,cte_category,Clean_Name
0,affected_players,larry johnson
1,affected_players,stabler
2,affected_players,busm
3,affected_players,ken stabler
4,affected_players,johnson
...,...,...
1875,players_nfl_lawsuits,richard cash
1876,players_nfl_lawsuits,ricky siglar
1877,players_nfl_lawsuits,john turner
1878,players_nfl_lawsuits,roderick coleman


#### Create CSV file and em.DataFrame for Entity Matching 

In [16]:
# Create CSV & entity match dataframe for blocking
wiki_cte_players_df['rec_id'] = range(1, 1+len(wiki_cte_players_df))
wiki_cte_players_df.to_csv("./wiki_cte_players_df.csv")
wiki_person_df = em.read_csv_metadata("./wiki_cte_players_df.csv", key='rec_id')
wiki_person_df

Metadata file is not present in the given path; proceeding to read the csv file.


Unnamed: 0.1,Unnamed: 0,cte_category,Clean_Name,rec_id
0,0,affected_players,larry johnson,1
1,1,affected_players,stabler,2
2,2,affected_players,busm,3
3,3,affected_players,ken stabler,4
4,4,affected_players,johnson,5
...,...,...,...,...
1986,1875,players_nfl_lawsuits,richard cash,1987
1987,1876,players_nfl_lawsuits,ricky siglar,1988
1988,1877,players_nfl_lawsuits,john turner,1989
1989,1878,players_nfl_lawsuits,roderick coleman,1990


#### Block DataFrames to get Candidate set

In [17]:
#  Instantiate blocker objects:
# ------------------------------
# Create overlap blocker
ob = em.OverlapBlocker()

# Create attribute equivalence blocker
ab = em.AttrEquivalenceBlocker()

#### i. Overlap Block by 'player_name'

In [18]:
# # Block tables using full name
ob_fullname_cand = ob.block_tables(basic_stats_df, wiki_person_df, 'Clean_Name', 'Clean_Name', allow_missing=False,
                                l_output_attrs=['Player Id', 'Name',  'Age', 'Current Status', 'Birthday', 'College','High School', 'Clean_Name'],
                                r_output_attrs=['rec_id', 'Clean_Name', 'cte_category'],
                                overlap_size=2, verbose=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table[overlap_attr] = values
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [19]:
#  Matched candidates - Kaggle vs. Wiki page
ob_fullname_cand.to_csv('./OB_names_matched.csv')
ob_fullname_cand

Unnamed: 0,_id,ltable_Player Id,rtable_rec_id,ltable_Name,ltable_Age,ltable_Current Status,ltable_Birthday,ltable_College,ltable_High School,ltable_Clean_Name,rtable_Clean_Name,rtable_cte_category
0,0,larryjohnson/2505491,1,"Johnson, Larry",37.0,Retired,11/19/1979,Penn State,,larry johnson,larry johnson,affected_players
1,1,larryjohnson/2517701,1,"Johnson, Larry",108.0,Retired,3/28/1909,Haskell Indian,,larry johnson,larry johnson,affected_players
2,2,johnmackey/2519920,6,"Mackey, John",,Retired,9/24/1941,Syracuse,,john mackey,john mackey,affected_players
3,3,rayeasterling/2513421,7,"Easterling, Ray",,Retired,9/3/1949,Richmond,,ray easterling,ray easterling,affected_players
4,4,bojackson/2517329,9,"Jackson, Bo",54.0,Retired,11/30/1962,Auburn,,bo jackson,bo jackson,affected_players
...,...,...,...,...,...,...,...,...,...,...,...,...
859,859,byronchamberlain/2500044,1979,"Chamberlain, Byron",45.0,Retired,10/17/1971,Wayne State (Neb.),,byron chamberlain,byron chamberlain,players_nfl_lawsuits
860,860,kwameharris/2505578,1980,"Harris, Kwame",35.0,Retired,3/15/1982,Stanford,,kwame harris,kwame harris,players_nfl_lawsuits
861,861,gusotto/2522592,1982,"Otto, Gus",73.0,Retired,12/8/1943,Missouri,,gus otto,gus otto,players_nfl_lawsuits
862,862,zackwalz/2503529,1985,"Walz, Zack",41.0,Retired,2/13/1976,Dartmouth,,zack walz,zack walz,players_nfl_lawsuits


In [20]:
# # Debug blocker output : (FOR TESTING ONLY)
# #  Unmatched candidates - Kaggle vs. Wiki page
# corres = [('Clean_Name', 'Clean_Name')]
# ob_fullname_debug = em.debug_blocker(ob_fullname_cand, basic_stats_df, wiki_person_df, output_size=500, attr_corres=corres)

# # Display first few tuple pairs from the debug_blocker's output
# ob_fullname_debug  #.to_csv('./names_debug.csv')

#### ii. Attribute Block by 'player_name'

### <font color='red'> *** BETTER RESULTS THAN OVERLAP BLOCK ***</font>

In [21]:
# Block using 'full_name_dob' attribute
ab_fullname_cand = ab.block_tables(basic_stats_df, wiki_person_df, 'Clean_Name', 'Clean_Name', allow_missing=False,
                                l_output_attrs=['Player Id', 'Name',  'Age', 'Current Status', 'Birthday', 'College','High School', 'Clean_Name'],
                                r_output_attrs=['rec_id', 'Clean_Name', 'cte_category'], n_jobs=2)

In [22]:
#  Distinct matched candidates - Kaggle vs. Wiki page
ab_fullname_cand.groupby("ltable_Player Id").first().to_csv('./AB_names_matched.csv')
ab_fullname_cand.groupby("ltable_Player Id").first()

Unnamed: 0_level_0,_id,rtable_rec_id,ltable_Name,ltable_Age,ltable_Current Status,ltable_Birthday,ltable_College,ltable_High School,ltable_Clean_Name,rtable_Clean_Name,rtable_cte_category
ltable_Player Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
aaronbeasley/2499587,697,1731,"Beasley, Aaron",43.0,Retired,7/7/1973,West Virginia,,aaron beasley,aaron beasley,players_nfl_lawsuits
aaronjones/2558116,16,285,"Jones, Aaron",22.0,Active,12/2/1994,Texas-El Paso,Burges HS,aaron jones,aaron jones,players_nfl_lawsuits
adamhaayer/2504632,705,1353,"Haayer, Adam",40.0,Retired,2/22/1977,Minnesota,,adam haayer,adam haayer,players_nfl_lawsuits
adriandingle/2500398,491,1528,"Dingle, Adrian",39.0,Retired,6/25/1977,Clemson,,adrian dingle,adrian dingle,players_nfl_lawsuits
adrianyoung/2529565,260,519,"Young, Adrian",71.0,Retired,1/31/1946,USC,,adrian young,adrian young,players_nfl_lawsuits
...,...,...,...,...,...,...,...,...,...,...,...
willieoshodin/2502367,460,1050,"Oshodin, Willie",47.0,Retired,9/16/1969,Villanova,,willie oshodin,willie oshodin,players_nfl_lawsuits
willietaylor/2527127,347,300,"Taylor, Willie",61.0,Retired,12/9/1955,Pittsburgh,,willie taylor,willie taylor,players_nfl_lawsuits
woodythompson/2527305,305,214,"Thompson, Woody",64.0,Retired,8/20/1952,Miami (Fla.),,woody thompson,woody thompson,players_nfl_lawsuits
zackvalentine/2527833,468,1643,"Valentine, Zack",60.0,Retired,5/29/1957,East Carolina,,zack valentine,zack valentine,players_nfl_lawsuits


#### Process Organizations from Wiki Page

In [23]:
# Remove stop words
def RemoveStopWords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
             filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

# Remove numbers
def RemoveNumbers(text):
    return re.sub(r'\d+', '', text) 

# Remove Punctuations
def RemovePunctuations(text):
    # return re.sub(rf"[{string.punctuation}]", " ", text)
    return re.sub(r'[^\w\s]', ' ', text)
    
# Normalize text 
def NormalizeText(text):
    result = text
    # result = RemoveNumbers(result)      # Remove any numbers
    result = RemovePunctuations(result) # Remove any punctuations
    result = RemoveStopWords(result)    # Remove stop words
    return result

In [24]:
# Lists to store wiki orgaization names
wiki_org_ls = []

# Set start time to calculate compute time
start_time = time.time()

# PASS-1: Compute NER with wiki page links - Former players with CTE confirmed post-mortem
doc = nlp(NormalizeText(wiki_page_object.content))                                 

# Extract ORG entities
for ent in doc.ents:
    if (ent.type == 'ORG'):
        wiki_org_ls.append(ent.text)
            
# Dedupe list contents
wiki_org_ls = [*set(wiki_org_ls)]

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 
print(f'# of organizations: {len(wiki_org_ls)}')

Exec time --- 5.689316749572754 seconds ---
# of organizations: 14


In [25]:
# Display Wiki page Organizations
wiki_org_df = pd.DataFrame(data= wiki_org_ls, columns=['wiki_org_name'])
wiki_org_df['ord_id'] = wiki_org_df.index+1
wiki_org_df

Unnamed: 0,wiki_org_name,ord_id
0,Atlanta Falcons,1
1,Canadian Football League CFL,2
2,CTE The Brain Bank,3
3,Colts,4
4,The Boston University School Medicine BUSM,5
5,National Institute Occupational Safety Health NIOSH,6
6,NFL All Pro,7
7,NFL,8
8,National Football League,9
9,CTE,10


#### Assign Parent Node & Direction to Players DataFrame

In [None]:
ab_fullname_cand['parent']     =[wiki_org_df.query("wiki_org_name == 'NFL'")['wiki_org_name'].values[0]] * len(ab_fullname_cand)
ab_fullname_cand['direction']  =['parent_to_child'] * len(ab_fullname_cand)
ab_fullname_cand['ltable_Age'] = pd.to_numeric(ab_fullname_cand.ltable_Age, downcast='integer')
ab_fullname_cand

Unnamed: 0,_id,ltable_Player Id,rtable_rec_id,ltable_Name,ltable_Age,ltable_Current Status,ltable_Birthday,ltable_College,ltable_High School,ltable_Clean_Name,rtable_Clean_Name,rtable_cte_category,parent,direction
0,0,clarenceverdin/2503464,841,"Verdin, Clarence",53.0,Retired,6/14/1963,Louisiana-Lafayette,,clarence verdin,clarence verdin,players_nfl_lawsuits,NFL,parent_to_child
1,1,kevinlewis/2519456,406,"Lewis, Kevin",37.0,Retired,4/26/1980,Virginia Tech,,kevin lewis,kevin lewis,players_nfl_lawsuits,NFL,parent_to_child
2,2,kevinlewis/2519457,406,"Lewis, Kevin",50.0,Retired,11/14/1966,Northwestern State-Louisiana,,kevin lewis,kevin lewis,players_nfl_lawsuits,NFL,parent_to_child
3,3,kevinlewis/2503991,406,"Lewis, Kevin",38.0,Retired,10/6/1978,Duke,,kevin lewis,kevin lewis,players_nfl_lawsuits,NFL,parent_to_child
4,4,chrischambers/2504695,875,"Chambers, Chris",38.0,Retired,8/12/1978,Wisconsin,,chris chambers,chris chambers,players_nfl_lawsuits,NFL,parent_to_child
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,726,walterroberts/2524286,1000,"Roberts, Walter",75.0,Retired,2/15/1942,San Jose State,,walter roberts,walter roberts,players_nfl_lawsuits,NFL,parent_to_child
727,727,donnyanderson/2508485,1303,"Anderson, Donny",74.0,Retired,5/16/1943,Texas Tech,,donny anderson,donny anderson,players_nfl_lawsuits,NFL,parent_to_child
728,728,neilgraff/2515140,1288,"Graff, Neil",67.0,Retired,1/12/1950,Wisconsin,,neil graff,neil graff,players_nfl_lawsuits,NFL,parent_to_child
729,729,lamarcampbell/2499977,1642,"Campbell, Lamar",40.0,Retired,8/29/1976,Wisconsin,,lamar campbell,lamar campbell,players_nfl_lawsuits,NFL,parent_to_child


In [32]:
ab_fullname_cand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _id                    731 non-null    int64  
 1   ltable_Player Id       731 non-null    object 
 2   rtable_rec_id          731 non-null    int64  
 3   ltable_Name            731 non-null    object 
 4   ltable_Age             680 non-null    float64
 5   ltable_Current Status  731 non-null    object 
 6   ltable_Birthday        727 non-null    object 
 7   ltable_College         731 non-null    object 
 8   ltable_High School     13 non-null     object 
 9   ltable_Clean_Name      731 non-null    object 
 10  rtable_Clean_Name      731 non-null    object 
 11  rtable_cte_category    731 non-null    object 
 12  parent                 731 non-null    object 
 13  direction              731 non-null    object 
dtypes: float64(1), int64(2), object(11)
memory usage: 80.1+ KB

### Construct Neo4j Node CSV File

In [28]:
def processNodes(data, node_file):
    nodes = {}
    counter = 1
    node_header = [":ID", "Name", "PlayerID" ,"Age", "Birthday", "Status", "College", ":LABEL"]
            
    # Set start time to calculate compute time
    start_time = time.time()

     # Construct node map:
    for index, row in data.iterrows():           
        parent_node_id  = row.parent
        child_node_id   = row['ltable_Player Id']
        
        if parent_node_id is None or child_node_id is None:
            continue;
        
        # Check if parent node already mapped, otherwise add 
        if not bool([i for i in nodes if nodes[i][0] == parent_node_id]):       
            nodes[counter]   = [parent_node_id, parent_node_id,'','','','', parent_node_id]
            counter+=1       
        
        # Check if child node already mapped, otherwise add 
        if not bool([i for i in nodes if nodes[i][0] == child_node_id]):
            nodes[counter]  = [row['ltable_Clean_Name'] if child_node_id != 'NFL' else ''
                               , child_node_id if child_node_id != 'NFL' else ''
                               , row['ltable_Age'] if (child_node_id != 'NFL' and row['ltable_Age'] == row['ltable_Age']) else ''
                               , row['ltable_Birthday'] if (child_node_id != 'NFL' and row['ltable_Birthday'] == row['ltable_Birthday']) else ''
                               , row['ltable_Current Status'] if child_node_id != 'NFL' else ''
                               , row['ltable_College'] if child_node_id != 'NFL' else ''
                               , row['ltable_Clean_Name'] if child_node_id != 'NFL' else ''
                              ]
            counter+=1                   
            
    # write nodes CSV file 
    with open(node_file, 'w',  newline='') as f:
        writer = csv.writer(f)
        writer.writerow(node_header)
        for node in nodes:
            if (nodes[node][0] == 'NFL'):
                writer.writerow([node, nodes[node][0], nodes[node][0],'','','','', nodes[node][6]])
            else:
                writer.writerow([node, nodes[node][0], nodes[node][1],nodes[node][2],nodes[node][3],nodes[node][4],nodes[node][5],nodes[node][6]])
                
    # compute execution time        
    exec_time = time.time() - start_time
    
    return nodes, exec_time

In [29]:
# Set data dump path for Neo4j 
neo4j_data_path = "/Users/camm/Library/NEO4J_HOME/import"

# Construct Node CSV file
node_map, exec_time = processNodes(ab_fullname_cand.copy(), neo4j_data_path+"/CTE_Nodes.csv") 
print("Exec time --- %s seconds ---" % exec_time)

Exec time --- 0.08860325813293457 seconds ---


### Construct Neo4j Relations CSV File

In [30]:
def processRelations(data, nodes, rel_file):
    relation_header = [":START_ID",":END_ID",":TYPE"]
    relation_data = []
    
    # Set start time to calculate compute time
    start_time = time.time() 
    
     # Construct relation map:
    for index, row in data.iterrows(): 
        if (row.direction == 'parent_to_child'):                
            # relation_data.append([nodes[list(nodes.keys()) [list(nodes.values()).index(row.parent)]], 
            #                       nodes[list(nodes.keys()) [list(nodes.values()).index(row['ltable_Player Id'])]], 
            #                       row['rtable_cte_category']]) 
            
            relation_data.append([[i for i in nodes if nodes[i][1] == row.parent][0], 
                                  [i for i in nodes if nodes[i][1] == row['ltable_Player Id']][0], 
                                  row['rtable_cte_category']])             
        else:
            # relation_data.append([nodes[list(node_map.keys()) [list(nodes.values()).index(row['ltable_Player Id'])]], 
            #                       nodes[list(node_map.keys()) [list(nodes.values()).index(row.parent)]], 
            #                       row['rtable_cte_category']]) 

            relation_data.append([[i for i in nodes if nodes[i][0] == row['ltable_Player Id']][0],
                                 [i for i in nodes if nodes[i][0] == row.parent][0],
                                  row['rtable_cte_category']])    
            
    # wirte relation file 
    with open(rel_file, 'w',  newline='') as f:
        writer = csv.writer(f)
        writer.writerow(relation_header)
        writer.writerows(relation_data)         
            
    # compute execution time        
    exec_time = time.time() - start_time
    
    return exec_time

In [31]:
# Construct relation 'cites' & 'family-cites' CSV file
exec_time = processRelations(ab_fullname_cand.copy(), node_map, neo4j_data_path+"/CTE_Relations.csv")
print("Exec time --- %s seconds ---" % exec_time)

Exec time --- 3.757377862930298 seconds ---


## Text to Knowledge Graph - Demo Example for Group

In [11]:
import sqlalchemy, psycopg2
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

sentences = nltk.sent_tokenize(wiki_page_object.content)

In [10]:
# ----------------------------------------------------
# Change below creds/config per your own DB settings:
# ----------------------------------------------------
db_host     = 'awesome-hw.sdsc.edu'   # <- enter your DB host name
db_name     = 'postgres'              # <- enter your DB name
db_username = 'ag_class'              # <- enter your DB username
db_password = 'WUcgdfQ1'              # <- enter your DB password

In [12]:
# Create a postgresql engine instance
print('Connection string: postgresql://' + db_username +':' + db_password + '@' + db_host + '/' + db_name)
alchemyEngine  = sqlalchemy.create_engine('postgresql://' + db_username +':' + db_password + '@' + db_host + '/' + db_name)
%reload_ext sql
%sql $alchemyEngine.url

Connection string: postgresql://ag_class:WUcgdfQ1@awesome-hw.sdsc.edu/postgres


'Connected: ag_class@postgres'

In [38]:
# Set start time to calculate compute time
start_time = time.time()

# Connect to PostgreSQL server
dbConnection    = alchemyEngine.connect();

# SQL command
sql = """
        SELECT DISTINCT title, news, keywords from usnewspaper WHERE ARRAY['cte','lawsuit']::text[] <@ keywords AND news IS NOT NULL
        UNION
        /*
        SELECT DISTINCT title, news, keywords  from usnewspaper WHERE ARRAY['nfl', 'helmet']::text[] <@ keywords AND news IS NOT NULL
        UNION
        SELECT DISTINCT title, news, keywords  from usnewspaper WHERE ARRAY['nfl', 'brain']::text[] <@ keywords AND news IS NOT NULL
        UNION */
        SELECT DISTINCT title, news, keywords from usnewspaper WHERE ARRAY['encephalopathy']::text[] <@ keywords 
             AND news IS NOT NULL AND title ilike '%nfl%';
      """

# Read data from PostgreSQL database table and load into a DataFrame instance
news_data_df = pd.read_sql_query(sqlalchemy.text(sql), alchemyEngine)

# Close the database connection
dbConnection.close();

pd.set_option('display.expand_frame_repr', False);

print("Exec time --- %s seconds ---" % (time.time() - start_time)) 

# Display the DataFrame
display(news_data_df.head())

Exec time --- 8.779508113861084 seconds ---


Unnamed: 0,title,news,keywords
0,Q&A: Here’s what jury must consider in USC’s Matt Gee llandmark CTE lawsuit against the NCAA,"Following a month of wildly disparate testimony, the landmark wrongful death suit brought by the widow of former USC linebacker Matthew Gee against the NCAA was sent to the jury Monday following c...","[matt, jury, lawsuit, testified, gee, uscs, gees, qa, heres, cte, death, usc, ncaa, football, llandmark, case]"
1,Former NFL WR Demaryius Thomas had stage 2 CTE at time of death: ‘It was horrible to see him struggle’,"Demaryius Thomas had stage 2 CTE at the time of his death.\n\nThe former NFL receiver, who was found dead in Georgia home on Dec. 9, was posthumously diagnosed, his family and the Concussion Legac...","[wr, struggle, cte, family, stage, traumatic, dont, encephalopathy, thomas, mckee, death, symptoms, nfl, demaryius, horrible]"
2,Brian Urlacher says some ex-NFL players claim to have CTE to ‘be in the f–king lawsuit’,"Hall of Fame linebacker Brian Urlacher made glaring comments during a recent podcast appearance, in which he appeared to suggest that some former NFL players have falsely claimed to have CTE.\n\n“...","[cte, lawsuit, theres, brian, players, fking, claim, urlacher, exnfl, nfl, brain, thomas, feel, know, guys]"
3,"Former NFL receiver, 33, had stage 2 CTE at time of death, tests reveal","Demaryius Thomas had stage 2 CTE at the time of his death.\n\nThe former NFL receiver, who was found dead in his Georgia home on Dec. 9, was posthumously diagnosed, his family and the Concussion L...","[encephalopathy, mckee, dont, death, symptoms, traumatic, cte, tests, thomas, family, stage, reveal, nfl, 33, receiver]"
4,Ashley Massaro wanted to donate her brain to CTE research. Here`s what we know about head trauma in pro wrestling,"(CNN) After former WWE star Ashley Massaro died last month, her lawyer announced that she wanted her brain donated to CTE research, rekindling the conversation about the risk of head trauma in pro...","[wrestler, donate, concussion, wrestling, brain, head, wrestlers, wwe, pro, kyros, trauma, cte, lawsuit, heres, wanted, research, know, massaro]"


In [201]:
def GetEntities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""

  #############################################################
  
    for tok in nlp(sent):
    ## chunk 2
        # # if token is a punctuation mark then move on to the next token
        # if tok.dep_ != "punct":
        
        # check: token is a compound word or not
        if tok.dep_ == "compound":
            prefix = tok.text
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " "+ tok.text
      
        # check: token is a modifier or not
        if tok.dep_.endswith("mod") == True:
            modifier = tok.text
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " "+ tok.text
      
        ## chunk 3
        if tok.dep_.find("subj") == True:
            ent1 = modifier +" "+ prefix + " "+ tok.text
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""      

        ## chunk 4
        if tok.dep_.find("obj") == True:
            ent2 = modifier +" "+ prefix +" "+ tok.text
        
        ## chunk 5  
        # update variables
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text
  #############################################################

    return [ent1.strip(), ent2.strip()]


def GetRelation(sent):

    doc = nlp(sent)

    # Matcher class object 
    matcher = Matcher(nlp.vocab)

    #define the pattern 
    pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}]
        
    matcher.add("matching_1", [pattern],  greedy='LONGEST') 

    matches = matcher(doc)
    k = len(matches) - 1

    span = doc[matches[k][1]:matches[k][2]] 

    return(span.text)

In [211]:
sentences = nltk.sent_tokenize(RemoveStopWords(re.sub("\n|\r", " ", news_data_df['news'].replace(r'[^\W\s]+(?<!.)', '', regex=True)[1].replace("“","").replace("”","").replace("’","").replace("'","").lower())))
sentences[1]

'former nfl receiver , found dead georgia home dec. 9 , posthumously diagnosed , family concussion legacy foundation announced tuesday .'

In [208]:
for i in range(0,len(sentences)):
    if(GetEntities(str(sentences[i]))[0] != '' and GetEntities(str(sentences[i]))[1] !=''):
        print(f"Ents: {GetEntities(str(sentences[i]))} \t\t\tRels: {GetRelation(str(sentences[i]))}")

Ents: ['posthumously concussion legacy foundation', 'dead georgia home'] 			Rels: announced
Ents: ['cte traumatic center', 'associated paranoia depression'] 			Rels: determined
Ents: ['katina statement', 'changes'] 			Rels: said
Ents: ['families', 'football'] 			Rels: hope
Ents: ['parents', 'children'] 			Rels: want
Ents: ['coroners office', 'yet  death'] 			Rels: believed
Ents: ['university research thomas', 'associated  cte'] 			Rels: said
Ents: ['cte', 'death'] 			Rels: cause
Ents: ['mckee', 'abc news'] 			Rels: told
Ents: ['cte', 'behavior personality'] 			Rels: changes
