-
Notifications
You must be signed in to change notification settings - Fork 3
/
OCMeta_coverage_classes.py
104 lines (77 loc) · 4.45 KB
/
OCMeta_coverage_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import concurrent.futures
import functools
import glob
import os

import pandas as pd
from tqdm import tqdm

from process import process_meta_csv, process_file_wrapper
from utils import *
# ============ PROCESSOR CLASSES ============= #
class Processor(object):
    """Base class holding the parallel-processing configuration.

    Keeps two tunables used by subclasses when fanning work out to a
    process pool: how many files go into each batch, and how many
    worker processes may run at once.
    """

    def __init__(self, batch_size, max_workers):
        """Record the batch size and the worker-pool limit."""
        self.batch_size = batch_size
        self.max_workers = max_workers

    def get_batch_size(self):
        """Return the number of files processed per batch."""
        return self.batch_size

    def set_batch_size(self, new_batch_size):
        """Replace the batch size; always returns True."""
        self.batch_size = new_batch_size
        return True

    def get_max_workers(self):
        """Return the maximum number of worker processes."""
        return self.max_workers

    def set_max_workers(self, new_max_workers):
        """Replace the worker-pool limit; always returns True."""
        self.max_workers = new_max_workers
        return True
class PlayaristsProcessor(Processor):
    """Merges OpenCitations Meta CSV dumps with the ERIH PLUS journal
    registry and DOAJ open-access data.

    The heavy lifting happens in :meth:`process_files`, which fans the
    Meta CSV files out to a process pool in batches, aggregates the
    per-file results, deduplicates venues on their ERIH PLUS id, and
    prints/saves coverage statistics.
    """

    def __init__(self, batch_size, max_workers, meta_path : str, erih_path : str, doaj_path : str):
        """Collect the Meta CSV file list and load the ERIH/DOAJ tables.

        meta_path: directory containing the OC Meta ``*.csv`` dump files.
        erih_path / doaj_path: paths handed to ``load_data`` (from the
        ``utils`` star import); presumably each yields a pandas
        DataFrame — TODO confirm against utils.load_data.
        """
        # Despite the name, meta_path ends up as a LIST of CSV paths.
        self.meta_path = glob.glob(os.path.join(meta_path, "*.csv"))
        self.erih_df = load_data(erih_path)
        self.doaj_df = load_data(doaj_path)
        super().__init__(batch_size, max_workers)

    def get_erih_df(self):
        """Return the loaded ERIH PLUS table."""
        return self.erih_df

    def get_doaj_df(self):
        """Return the loaded DOAJ table."""
        return self.doaj_df

    def process_files(self):
        """Process every Meta CSV, merge with ERIH PLUS and DOAJ, and
        return the final per-venue DataFrame.

        Side effects: writes ``duplicate_omids.csv`` and
        ``SSH_Publications_in_OC_Meta_and_Open_Access_status.csv`` via
        ``save_to_results`` and prints coverage statistics to stdout.
        """
        # Lookup dict built from the ERIH PLUS table; semantics defined
        # in utils.get_erih_plus_dict — NOTE(review): assumed to map
        # ISSNs to ERIH ids, verify there.
        erih_dict = get_erih_plus_dict(self.erih_df)
        all_results = []
        totalOCMpublications = 0
        # Fan the CSV files out to worker processes, batch_size at a time;
        # a fresh pool is created per batch.
        with tqdm(total=len(self.meta_path), desc="Batches") as pbar:
            for i in range(0, len(self.meta_path), self.batch_size):
                batch_files = self.meta_path[i:i+self.batch_size]
                with concurrent.futures.ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                    # Each worker returns (filename, result_df, publications_count).
                    results = executor.map(process_file_wrapper, [(f, erih_dict) for f in batch_files])
                    all_results.extend(results)
                pbar.update(len(batch_files))
        results_dict = {}
        for filename, result, publications_count in all_results:
            results_dict[filename] = result
            totalOCMpublications += publications_count
        # Meta-ERIH merged table: one row per (omid, issn, ERIH id),
        # summing publication counts across files.
        final_df = pd.concat(list(results_dict.values()), ignore_index=True)
        final_df = final_df.groupby(['OC_omid', 'issn', 'EP_id']).agg({'Publications_in_venue': 'sum'}).reset_index()
        # Meta-ERIH-DOAJ merged table (adds open-access status).
        new_final_df = process_doaj_file(self.doaj_df, final_df)
        # Fix double entries by summing Publications_in_venue and
        # unifying on EP_id (11/08).
        # Series indexed by EP_id with the corrected per-venue totals.
        right_pub_count = new_final_df.groupby(by='EP_id')['Publications_in_venue'].sum()
        double_ep_list = list(new_final_df.duplicated(subset="EP_id", keep=False)) # boolean per row: True for EVERY member of a duplicated EP_id group
        new_final_df["doubles"] = double_ep_list# flag column marking duplicated EP_ids
        double_omid = new_final_df.query("doubles == True") # rows belonging to duplicated venues only
        double_omid = double_omid[["OC_omid", "issn"]] # keep just the identifying columns
        save_to_results(double_omid, "duplicate_omids.csv") # persist duplicates for inspection
        new_final_df = new_final_df.drop_duplicates(subset="EP_id")
        # Merge the corrected totals back in. NOTE(review): right_pub_count
        # is a Series whose INDEX level is named EP_id; pandas resolves
        # left_on against that index level name.
        new_final_df = pd.merge(right_pub_count, new_final_df, left_on="EP_id", right_on="EP_id", how="left")
        new_final_df = new_final_df.sort_values("Publications_in_venue_x", ascending=False)
        # _x is the corrected sum; _y is the stale pre-dedup count.
        new_final_df = new_final_df.drop(columns=["Publications_in_venue_y", "doubles"])
        new_final_df = new_final_df.rename(columns={"Publications_in_venue_x": "Publications_in_venue"})
        print("len new final df")
        print(len(new_final_df)) # should be 8583
        save_to_results(new_final_df, "SSH_Publications_in_OC_Meta_and_Open_Access_status.csv")
        # Coverage: share of all OC Meta publications that sit in ERIH PLUS venues.
        publications = new_final_df['Publications_in_venue'].sum()
        OCMeta_coverage = publications / totalOCMpublications
        OCMeta_coverage_percent = OCMeta_coverage * 100
        print("##### OpenCitations Meta publications in ERIH PLUS: count = ", str(totalOCMpublications), "; ratio = ", str(OCMeta_coverage), "; % = ", str(OCMeta_coverage_percent))
        # SSH journals Open Access Status
        df_true_values = new_final_df.query("Open_Access == True")
        access_count = len(df_true_values)
        percentages = (access_count * 100) / len(new_final_df)
        print("##### SSH journals Open Access Status : counts ", str(access_count), "; pecentage = ", str(percentages))
        return new_final_df