In [106]:
import pandas as pd

In [107]:
hybrid_publications = pd.read_csv('hybrid_publications.csv', usecols=['host'])
bearbeitet = pd.read_csv('guete_bearbeitet.csv', index_col = 0)
institutes = pd.read_csv('institutes.csv', usecols=['grid_id','name'])
links = pd.read_csv('links.csv')

In [108]:
print(hybrid_publications.head())

             host
0  bradford.ac.uk
1         upol.cz
2       uwe.ac.uk
3        sina.com
4     dlmu.edu.cn


In [109]:
print(bearbeitet.head())

              email                                            tdm_url  \
0  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
1        bruker.com  http://link.springer.com/article/10.1007/s1183...   
2          ohsu.edu  https://api.elsevier.com/content/article/PII:S...   
3           cox.net                                                      
4        amc.uva.nl  http://link.springer.com/article/10.1007/s1176...   

                time_stamp    mine_method                           doi  \
0  2018-12-17 16:06:12 UTC   Elsevier XML   10.1016/j.ddtec.2014.03.009   
1  2018-12-18 08:00:29 UTC  Springer HTML     10.1007/s11837-018-2752-0   
2  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ocemod.2015.12.007   
3  2018-12-16 12:29:19 UTC  PMC OAI (NIH)     10.1007/s13752-016-0244-4   
4  2019-01-04 14:40:57 UTC  Springer HTML     10.1007/s11764-018-0690-z   

   recall  precision comment  
0    True       True     NaN  
1    True       True     NaN  
2    True  

In [110]:
print(institutes.head())

       grid_id                            name
0  grid.1001.0  Australian National University
1  grid.1002.3               Monash University
2  grid.1003.2        University of Queensland
3  grid.1004.5            Macquarie University
4  grid.1005.4                  UNSW Australia


In [111]:
print(links.head())

       grid_id                      link
0  grid.1001.0    http://www.anu.edu.au/
1  grid.1002.3    http://www.monash.edu/
2  grid.1003.2     http://www.uq.edu.au/
3  grid.1004.5         http://mq.edu.au/
4  grid.1005.4  https://www.unsw.edu.au/


In [112]:
name_and_link = institutes.merge(links, on='grid_id')

In [113]:
print(name_and_link.head())

       grid_id                            name                      link
0  grid.1001.0  Australian National University    http://www.anu.edu.au/
1  grid.1002.3               Monash University    http://www.monash.edu/
2  grid.1003.2        University of Queensland     http://www.uq.edu.au/
3  grid.1004.5            Macquarie University         http://mq.edu.au/
4  grid.1005.4                  UNSW Australia  https://www.unsw.edu.au/


In [114]:
def extract_host(link):
    link = link.replace('http://', '')
    first_position = link.find('www')
    last_position = link.find('/')
    return link[first_position+4:last_position]

In [115]:
print(extract_host('http://www.kek.de/'))

kek.de


In [116]:
name_and_link['link'] = name_and_link['link'].apply(extract_host)

In [117]:
merge_link_and_host = hybrid_publications.merge(name_and_link, left_on='host', right_on='link')

In [118]:
print(merge_link_and_host.head())

             host      grid_id                    name            link
0  bradford.ac.uk  grid.6268.a  University of Bradford  bradford.ac.uk
1  bradford.ac.uk  grid.6268.a  University of Bradford  bradford.ac.uk
2  bradford.ac.uk  grid.6268.a  University of Bradford  bradford.ac.uk
3  bradford.ac.uk  grid.6268.a  University of Bradford  bradford.ac.uk
4  bradford.ac.uk  grid.6268.a  University of Bradford  bradford.ac.uk


In [119]:
merge_link_and_host = merge_link_and_host.drop_duplicates().reset_index()

In [120]:
print(merge_link_and_host.head())

   index            host        grid_id  \
0      0  bradford.ac.uk    grid.6268.a   
1     41         upol.cz  grid.10979.36   
2     57       uwe.ac.uk    grid.6518.a   
3    138      agh.edu.pl    grid.9922.0   
4    280          jku.at    grid.9970.7   

                                       name            link  
0                    University of Bradford  bradford.ac.uk  
1               Palacký University, Olomouc         upol.cz  
2         University of the West of England       uwe.ac.uk  
3  AGH University of Science and Technology      agh.edu.pl  
4        Johannes Kepler University of Linz          jku.at  


In [121]:
join_host_and_bearbeitet = pd.merge(bearbeitet, merge_link_and_host, how='left', left_on='email', right_on='link')

In [122]:
print(join_host_and_bearbeitet.head())

              email                                            tdm_url  \
0  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
1  meduniwien.ac.at  https://api.elsevier.com/content/article/PII:S...   
2        bruker.com  http://link.springer.com/article/10.1007/s1183...   
3          ohsu.edu  https://api.elsevier.com/content/article/PII:S...   
4          ohsu.edu  https://api.elsevier.com/content/article/PII:S...   

                time_stamp    mine_method                           doi  \
0  2018-12-17 16:06:12 UTC   Elsevier XML   10.1016/j.ddtec.2014.03.009   
1  2018-12-17 16:06:12 UTC   Elsevier XML   10.1016/j.ddtec.2014.03.009   
2  2018-12-18 08:00:29 UTC  Springer HTML     10.1007/s11837-018-2752-0   
3  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ocemod.2015.12.007   
4  2018-12-17 16:06:12 UTC   Elsevier XML  10.1016/j.ocemod.2015.12.007   

   recall  precision comment    index              host        grid_id  \
0    True       True     NaN  

In [123]:
join_host_and_bearbeitet = join_host_and_bearbeitet.sort_values(by=['email'])

In [124]:
join_host_and_bearbeitet.to_csv('merge.csv')

In [128]:
show_result = pd.read_csv('merge.csv', index_col = 0, usecols = ['email','name', 'grid_id'])

In [129]:
print(show_result)

                           grid_id                                      name
email                                                                       
                               NaN                                       NaN
                               NaN                                       NaN
                               NaN                                       NaN
                               NaN                                       NaN
                               NaN                                       NaN
126.com                        NaN                                       NaN
Outlook.com                    NaN                                       NaN
aau.in               grid.411373.3             Anand Agricultural University
adm.cgmh.org.tw                NaN                                       NaN
agh.edu.pl             grid.9922.0  AGH University of Science and Technology
amc.nl                         NaN                                       NaN

In [127]:
name_and_link.to_csv('insti_and_link.csv')

In [138]:
big_merge = pd.merge(join_host_and_bearbeitet, hybrid_publications, left_on='email', right_on='host')

In [139]:
big_merge.to_csv('test.csv')