In [117]:
#!pip install tldextract
import tldextract as tld
import pandas as pd
import matplotlib as plt
import numpy as np

In [118]:
def extract_domain(link):
    """Return the domain part of ``link`` as split by tldextract.

    Example: ``http://www.anu.edu.au/`` yields ``anu``.
    """
    return tld.extract(link).domain

def extract_suffix(link):
    """Return the public suffix of ``link`` as split by tldextract.

    Example: ``http://www.anu.edu.au/`` yields ``edu.au``.
    """
    return tld.extract(link).suffix

def extract_subdomain(link):
    """Return the subdomain part of ``link`` as split by tldextract.

    Example: ``http://www.anu.edu.au/`` yields ``www``.
    """
    return tld.extract(link).subdomain

def get_host(link):
    """Return ``"<domain>.<suffix>"`` for a URL or a bare host name.

    The input is parsed once with tldextract and the subdomain is discarded,
    e.g. ``http://www.anu.edu.au/`` becomes ``anu.edu.au`` and ``wp.pl`` stays
    ``wp.pl``.

    The separating dot is always inserted, even when one of the two parts is
    empty; this keeps the values comparable with hosts that were produced by
    earlier runs of this function.
    """
    # One extraction is enough: domain and suffix come from the same result.
    parts = tld.extract(link)
    return str(parts.domain) + '.' + str(parts.suffix)

In [119]:
# Input tables. `bearbeitet` uses its first CSV column as index; for the
# institute and relationship tables only the columns needed below are loaded.
bearbeitet = pd.read_csv('guete_bearbeitet.csv', index_col=0)
institutes = pd.read_csv(
    'institutes.csv',
    usecols=['grid_id', 'name'],
)
links = pd.read_csv('links.csv')
relationships = pd.read_csv(
    'relationships.csv',
    usecols=['relationship_type', 'related_grid_id'],
)

In [120]:
# Attach each institute's link(s) to its name via the shared grid_id (inner join).
name_and_link = pd.merge(institutes, links, on='grid_id')

In [121]:
# Parse every institute link exactly once and derive all four columns from
# that single result (previously each link was parsed five times: once per
# helper and twice inside get_host).
link_parts = [tld.extract(link) for link in name_and_link['link']]
name_and_link['subdomain'] = [parts.subdomain for parts in link_parts]
name_and_link['domain'] = [parts.domain for parts in link_parts]
name_and_link['suffix'] = [parts.suffix for parts in link_parts]
# Same "<domain>.<suffix>" format that get_host produces, so that `host` and
# `mail_host` can be joined on equality.
name_and_link['host'] = [
    str(parts.domain) + '.' + str(parts.suffix) for parts in link_parts
]

# Reduce the e-mail host of every evaluated record to the same format.
bearbeitet['mail_host'] = bearbeitet['email'].apply(get_host)

In [122]:
# Relationship rows of type 'Child'; their related_grid_id is the id that is
# matched against the institutes below.
child = relationships.loc[relationships['relationship_type'].eq('Child')]
# Left join: institutes whose grid_id appears as related_grid_id of a 'Child'
# relationship get relationship_type == 'Child', all others get NaN there.
child = pd.merge(
    name_and_link,
    child,
    how='left',
    left_on='grid_id',
    right_on='related_grid_id',
)
# Keep only the rows that did not match a 'Child' relationship.
parent = child.loc[child['relationship_type'].ne('Child')]
name_and_link = parent

In [123]:
# Preview the first five remaining institute rows.
print(name_and_link.head(n=5))

       grid_id                            name                      link  \
0  grid.1001.0  Australian National University    http://www.anu.edu.au/   
1  grid.1002.3               Monash University    http://www.monash.edu/   
2  grid.1003.2        University of Queensland     http://www.uq.edu.au/   
3  grid.1004.5            Macquarie University         http://mq.edu.au/   
4  grid.1005.4                  UNSW Australia  https://www.unsw.edu.au/   

  subdomain  domain  suffix         host relationship_type related_grid_id  
0       www     anu  edu.au   anu.edu.au               NaN             NaN  
1       www  monash     edu   monash.edu               NaN             NaN  
2       www      uq  edu.au    uq.edu.au               NaN             NaN  
3                mq  edu.au    mq.edu.au               NaN             NaN  
4       www    unsw  edu.au  unsw.edu.au               NaN             NaN  


In [124]:
# Write the filtered institute/link table (including its index) to disk for inspection.
name_and_link.to_csv('name_and_link_test.csv')

In [125]:
# Left join: every evaluated record is kept; institute columns stay NaN when
# no institute link has the same host as the record's mail_host.
join_host_and_bearbeitet = bearbeitet.merge(
    name_and_link,
    how='left',
    left_on='mail_host',
    right_on='host',
)

In [126]:
# Preview the first five joined rows.
print(join_host_and_bearbeitet.head(n=5))

              email                                            tdm_url  \
0  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
1  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
2  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
3  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
4        bruker.com  http://link.springer.com/article/10.1007/s1183...   

                time_stamp    mine_method                          doi  \
0  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
1  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
2  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
3  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ddtec.2014.03.009   
4  2018-12-18 08:00:29 UTC  Springer HTML    10.1007/s11837-018-2752-0   

   recall  precision comment         mail_host        grid_id  \
0    True       True     NaN  meduniwien.ac.a

In [127]:
# Order the joined rows by e-mail host and renumber them from zero.
join_host_and_bearbeitet = (
    join_host_and_bearbeitet
    .sort_values(by='email')
    .reset_index(drop=True)
)

In [128]:
# Records for which the join found no institute (grid_id is missing).
add_missings = join_host_and_bearbeitet.loc[join_host_and_bearbeitet['grid_id'].isna()]

In [129]:
# Unfiltered institute/link table (no relationship-based filtering applied),
# used as a second lookup for the records that are still unmatched.
name_and_link_missings = institutes.merge(links, on='grid_id')
# Parse every link exactly once and derive all four columns from that single
# result (previously each link was parsed five times: once per helper and
# twice inside get_host).
missing_link_parts = [tld.extract(link) for link in name_and_link_missings['link']]
name_and_link_missings['subdomain'] = [parts.subdomain for parts in missing_link_parts]
name_and_link_missings['domain'] = [parts.domain for parts in missing_link_parts]
name_and_link_missings['suffix'] = [parts.suffix for parts in missing_link_parts]
# Same "<domain>.<suffix>" format that get_host produces for mail_host.
name_and_link_missings['host'] = [
    str(parts.domain) + '.' + str(parts.suffix) for parts in missing_link_parts
]

In [130]:
# Strip the (empty) institute columns left over from the first join ...
add_missings = add_missings.drop(
    columns=[
        'subdomain', 'domain', 'suffix', 'host', 'grid_id',
        'related_grid_id', 'name', 'link', 'relationship_type',
    ]
)
# ... and look the hosts up again in the unfiltered institute table.
add_missings = pd.merge(
    add_missings,
    name_and_link_missings,
    how='left',
    left_on='mail_host',
    right_on='host',
)

In [131]:
# Remove rows that are identical in every column, then renumber from zero.
add_missings = add_missings.drop_duplicates()
add_missings = add_missings.reset_index(drop=True)

In [132]:
# Keep only the records that were matched in the first join.
join_host_and_bearbeitet = join_host_and_bearbeitet.loc[
    join_host_and_bearbeitet['grid_id'].notna()
]

In [133]:
# Stack the first-pass matches on top of the re-matched records, then order
# everything by e-mail host and renumber from zero.
join_host_and_bearbeitet = pd.concat([join_host_and_bearbeitet, add_missings])
join_host_and_bearbeitet = (
    join_host_and_bearbeitet
    .sort_values(by='email')
    .reset_index(drop=True)
)

In [134]:
# Print e-mail host and matched institute name for every row, untruncated.
print(join_host_and_bearbeitet.loc[:, ['email', 'name']].to_string())

                          email                                               name
0                                                                              NaN
1                                                                              NaN
2                                                                              NaN
3                                                                              NaN
4                                                                              NaN
5                       126.com                                                NaN
6                   Outlook.com                                                NaN
7                        aau.in                      Anand Agricultural University
8               adm.cgmh.org.tw                Linkou Chang Gung Memorial Hospital
9               adm.cgmh.org.tw               Keelung Chang Gung Memorial Hospital
10              adm.cgmh.org.tw               Taoyuan Chang Gung Memorial Hospital
11  

In [135]:
# Persist the combined match table (including its index) as CSV.
join_host_and_bearbeitet.to_csv('merge_v2.csv')

In [136]:
# Reload the exported table, restricted to three columns, for inspection.
# NOTE(review): together with `usecols`, `index_col = 0` picks the first of
# the kept columns, so `email` ends up as the index (visible in the printed
# result of the next cell) — confirm this is intended rather than the unnamed
# index column written by to_csv.
show_result = pd.read_csv('merge_v2.csv', index_col = 0, usecols = ['email','name', 'grid_id'])

In [137]:
# Print the complete reloaded table as plain text.
print(show_result.to_string())

                                  grid_id                                               name
email                                                                                       
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
                                      NaN                                                NaN
126.com                               NaN                                                NaN
Outlook.com                           NaN                                                NaN
aau.in                      grid.411373.3                      Anand Agricultural University
adm.cgmh.org.tw             grid.454211.7                Linkou Chang 

In [138]:
# Sanity check: a bare host without scheme or subdomain comes back unchanged.
print(get_host('wp.pl'))

wp.pl


In [139]:
# Number of rows that still have no institute assigned.
join_host_and_bearbeitet['grid_id'].isna().sum()

45

In [140]:
# Number of rows whose e-mail host is a single space character.
join_host_and_bearbeitet['email'].eq(' ').sum()

5

In [141]:
# Total number of rows in the combined match table.
print(join_host_and_bearbeitet.shape[0])

504


In [151]:
# Share of rows without an assigned institute, in percent.
n_unmatched = join_host_and_bearbeitet['grid_id'].isna().sum()
n_rows = len(join_host_and_bearbeitet['doi'])
print((n_unmatched / n_rows) * 100)

8.928571428571429


In [143]:
# Number of distinct institute names; a missing name counts as one value.
print(join_host_and_bearbeitet['name'].nunique(dropna=False))

400


In [144]:
# Number of distinct e-mail hosts; a missing value counts as one value.
print(join_host_and_bearbeitet['email'].nunique(dropna=False))

173


In [145]:
# TODO: the more frequent missing e-mail matches can be explained by the missing child institutes -> add them
#       plot
#       compare the subdomain strings by length

In [146]:
# Number of distinct DOIs in the combined match table; a missing value counts as one value.
print(join_host_and_bearbeitet['doi'].nunique(dropna=False))

200


In [147]:
# For every matched host keep only the rows whose institute link has the
# shortest subdomain; rows without a host (no institute found) are left out.
#
# Changes compared with the previous loop:
#   * each host is processed once (groupby) instead of once per row,
#   * the shortest subdomain is found with min() rather than a hard-coded
#     starting length of 100, which selected '' when every subdomain had
#     100 or more characters,
#   * an input without any matched host no longer makes pd.concat raise.
matched_rows = join_host_and_bearbeitet[join_host_and_bearbeitet['host'].notna()]

dataframes = []
# sort=False keeps the hosts in order of first appearance; rows keep their
# original order inside each group.
for host, filter_host in matched_rows.groupby('host', sort=False):
    # min() returns the first of several equally short subdomains, so ties
    # are resolved in row order.
    min_subdomain = min(filter_host['subdomain'], key=len)
    dataframes.append(filter_host[filter_host['subdomain'] == min_subdomain])

if dataframes:
    solution = pd.concat(dataframes)
else:
    # No matched host at all: empty result with the same columns.
    solution = join_host_and_bearbeitet.iloc[0:0]

# Rows that are identical in every column are collapsed into one.
solution = solution.drop_duplicates().reset_index(drop=True)
print(solution[['email', 'name']])

                   email                                               name
0                 aau.in                      Anand Agricultural University
1        adm.cgmh.org.tw               Keelung Chang Gung Memorial Hospital
2        adm.cgmh.org.tw               Taoyuan Chang Gung Memorial Hospital
3        adm.cgmh.org.tw             Kaohsiung Chang Gung Memorial Hospital
4        adm.cgmh.org.tw                Chiayi Chang Gung Memorial Hospital
5        adm.cgmh.org.tw                       Chang Gung Memorial Hospital
6        adm.cgmh.org.tw                     Chang Gung Children's Hospital
7             agh.edu.pl           AGH University of Science and Technology
8                 amc.nl                            Academic Medical Center
9             amc.uva.nl                            University of Amsterdam
10            amc.uva.nl                            University of Amsterdam
11            amc.uva.nl                            University of Amsterdam
12          

In [148]:
# Number of distinct DOIs that remain in the final selection; a missing value counts as one value.
print(solution['doi'].nunique(dropna=False))

155


In [149]:
# Number of rows in the final selection without an assigned institute.
solution['grid_id'].isna().sum()

0

In [150]:
# Persist the final selection (including its index) as CSV.
solution.to_csv('solution.csv')