In [3]:
#!pip install tldextract
import tldextract as tld
import pandas as pd
import matplotlib as plt

In [4]:
def extract_domain(link):
    """Return the domain label of ``link`` as reported by ``tldextract``.

    Example: ``'http://www.anu.edu.au/'`` yields ``'anu'``.
    """
    return tld.extract(link).domain

def extract_suffix(link):
    """Return the public suffix of ``link`` as reported by ``tldextract``.

    Example: ``'http://www.anu.edu.au/'`` yields ``'edu.au'``.
    """
    return tld.extract(link).suffix

def extract_subdomain(link):
    """Return the subdomain part of ``link`` as reported by ``tldextract``.

    Example: ``'http://www.anu.edu.au/'`` yields ``'www'``; a link without a
    subdomain such as ``'http://mq.edu.au/'`` yields an empty string.
    """
    return tld.extract(link).subdomain

def get_host(link):
    """Return the host of ``link`` in the form ``domain.suffix``.

    The link is parsed once with ``tldextract`` and the ``domain`` and
    ``suffix`` parts are joined with a dot, e.g. ``'http://www.anu.edu.au/'``
    becomes ``'anu.edu.au'`` and ``'wp.pl'`` stays ``'wp.pl'``.

    Parameters
    ----------
    link : str
        A URL or bare host name.

    Returns
    -------
    str
        ``domain.suffix``. If one of the two parts is empty, only the other
        part is returned (no leading or trailing dot); if both are empty the
        result is an empty string.
    """
    # Parse once; going through extract_domain and extract_suffix would run
    # the same extraction twice for every link.
    parts = tld.extract(link)
    # Skip empty parts so a missing suffix or domain does not leave a
    # dangling '.' in the merge key.
    return '.'.join(label for label in (parts.domain, parts.suffix) if label)

In [5]:
# Load the input tables. Only the columns used later are read from the
# institutes and relationships files.
bearbeitet = pd.read_csv(
    'guete_bearbeitet.csv',
    index_col=0,  # first CSV column becomes the index
)
institutes = pd.read_csv(
    'institutes.csv',
    usecols=['grid_id', 'name'],
)
links = pd.read_csv('links.csv')
relationships = pd.read_csv(
    'relationships.csv',
    usecols=['relationship_type', 'related_grid_id'],
)

In [6]:
# Attach each institute's link(s) to its name via the shared grid_id
# (default inner join: institutes without a link row are dropped).
name_and_link = pd.merge(institutes, links, on='grid_id')

In [7]:
# Split every institute link into its parts and build the merge key 'host'.
name_and_link['subdomain'] = name_and_link['link'].map(extract_subdomain)
name_and_link['domain'] = name_and_link['link'].map(extract_domain)
name_and_link['suffix'] = name_and_link['link'].map(extract_suffix)
name_and_link['host'] = name_and_link['link'].map(get_host)

# Normalise the e-mail column the same way so both sides share one key format.
bearbeitet['mail_host'] = bearbeitet['email'].map(get_host)

In [8]:
# Relationship rows of type 'Child' only.
child = relationships.loc[relationships['relationship_type'] == 'Child']

# Left-join them onto the institutes: an institute that appears as
# related_grid_id of a 'Child' row gets relationship_type == 'Child',
# every other institute gets NaN in the two relationship columns.
child = pd.merge(
    name_and_link,
    child,
    how='left',
    left_on='grid_id',
    right_on='related_grid_id',
)

# Keep the rows that did not pick up a 'Child' relationship.
parent = child.loc[child['relationship_type'] != 'Child']

# NOTE: this rebinds name_and_link; re-running the cell on its own would
# merge onto a frame that already carries the relationship columns.
name_and_link = parent

In [9]:
# Preview the first rows of the filtered institute/link table.
print(name_and_link.head())

       grid_id                            name                      link  \
0  grid.1001.0  Australian National University    http://www.anu.edu.au/   
1  grid.1002.3               Monash University    http://www.monash.edu/   
2  grid.1003.2        University of Queensland     http://www.uq.edu.au/   
3  grid.1004.5            Macquarie University         http://mq.edu.au/   
4  grid.1005.4                  UNSW Australia  https://www.unsw.edu.au/   

  subdomain  domain  suffix         host relationship_type related_grid_id  
0       www     anu  edu.au   anu.edu.au               NaN             NaN  
1       www  monash     edu   monash.edu               NaN             NaN  
2       www      uq  edu.au    uq.edu.au               NaN             NaN  
3                mq  edu.au    mq.edu.au               NaN             NaN  
4       www    unsw  edu.au  unsw.edu.au               NaN             NaN  


In [10]:
# Write the filtered institute/link table to disk (index included).
name_and_link.to_csv('name_and_link_test.csv')

In [11]:
# Look up the institute(s) for every e-mail host. A left join keeps all rows
# of bearbeitet; a host shared by several name_and_link rows yields one
# output row per match.
join_host_and_bearbeitet = bearbeitet.merge(
    name_and_link,
    how='left',
    left_on='mail_host',
    right_on='host',
)

In [12]:
# Preview the first rows of the merged table.
print(join_host_and_bearbeitet.head())

              email                                            tdm_url  \
0  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
1  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
2  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
3  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
4        bruker.com  http://link.springer.com/article/10.1007/s1183...   

                time_stamp    mine_method                          doi  \
0  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
1  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
2  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
3  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
4  2018-12-18 08:00:29 UTC  Springer HTML    10.1007/s11837-018-2752-0   

   recall  precision comment         mail_host        grid_id  \
0    True       True     NaN  meduniwien.ac.a

In [13]:
# Order the merged rows by e-mail host and renumber the index from 0.
join_host_and_bearbeitet = (
    join_host_and_bearbeitet
    .sort_values(by='email')
    .reset_index(drop=True)
)

In [14]:
# Persist the sorted merge result (index included as first CSV column).
join_host_and_bearbeitet.to_csv('merge_v2.csv')

In [24]:
# Re-read only the columns needed for inspection; 'email' is the first of the
# selected columns and serves as the index.
show_result = pd.read_csv(
    'merge_v2.csv',
    usecols=['email', 'name', 'grid_id'],
    index_col='email',
)

In [25]:
# Print the complete e-mail -> institute table without truncation.
print(show_result.to_string())

                                  grid_id                                               name
email                                                                                       
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
126.com                               NaN                                                NaN
Outlook.com                           NaN                                                NaN
aau.in                      grid.411373.3                      Anand Agricultural University
adm.cgmh.org.tw             grid.454209.e               Keelung Chang 

In [17]:
# Spot check: a bare host without scheme or subdomain.
print(get_host('wp.pl'))

wp.pl


In [18]:
# Number of merged rows for which no grid_id was found.
join_host_and_bearbeitet['grid_id'].isna().sum()

64

In [19]:
# Number of rows whose e-mail entry is a single space.
join_host_and_bearbeitet['email'].eq(' ').sum()

5

In [20]:
# Total number of rows in the merged table.
print(join_host_and_bearbeitet.shape[0])

325


In [21]:
# Share of merged rows without a grid_id, in percent.
print(join_host_and_bearbeitet['grid_id'].isna().sum() / join_host_and_bearbeitet.shape[0] * 100)

19.692307692307693


In [22]:
# Number of distinct institute names in the merge result (NaN counts as one value).
print(join_host_and_bearbeitet['name'].nunique(dropna=False))

209


In [23]:
# Number of distinct e-mail hosts in the merge result (NaN counts as one value).
print(join_host_and_bearbeitet['email'].nunique(dropna=False))

173


In [None]:
# TODO: the more frequent missing e-mail matches can probably be explained by
#       the child institutes being absent -> add them back
#       plot
#       string comparison of the subdomain by length