In [1]:
from os import listdir
from os.path import isfile, join
import os
import pandas
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing
import fos_reader
# Worker pool with the default number of processes; it is used further down
# to read the part files in parallel.
# NOTE(review): nothing in the visible notebook closes or joins this pool.
pool = multiprocessing.Pool()

# Names of the regular files (directories are skipped) found directly in the
# current working directory.
onlyfiles = [
    entry
    for entry in os.listdir(os.getcwd())
    if os.path.isfile(os.path.join(os.getcwd(), entry))
]

In [2]:
def reader_fos(fileName):
    """Read a JSON-lines file and extract the field-of-study names per record.

    Each line of the file is parsed as one JSON record. Records whose ``fos``
    value is a list contribute one output entry: the list of the ``name``
    values of its elements. Records without a usable ``fos`` list (missing
    key, null, or any other non-list value) are skipped.

    Parameters
    ----------
    fileName : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    pandas.Series
        One list of names per kept record, indexed by the record's position
        in the file. Empty (dtype ``object``) when no record carries ``fos``.
    """
    df = pandas.read_json(fileName, lines=True)

    # A file in which no record has "fos" produces no such column at all;
    # return an empty result instead of failing with KeyError.
    if "fos" not in df.columns:
        return pandas.Series([], dtype=object, name="fos")

    fos_column = df["fos"]
    # Keep only real lists. Missing values arrive as float NaN, but testing
    # for the type we can iterate is safer than excluding one scalar type.
    has_fos_list = [isinstance(entry, list) for entry in fos_column]
    fos_lists = fos_column.loc[has_fos_list]

    return fos_lists.apply(lambda fields: [field["name"] for field in fields])

In [3]:
import re
# Select the part files by their name prefix. This is a regular expression,
# not a shell glob: a trailing "_*" would mean "zero or more underscores" and
# would also accept names such as "author_indexed_fosX". `match` anchors at
# the start of the name only, so the bare prefix is sufficient.
# NOTE(review): presumably every part file name starts with
# "author_indexed_fos_" (underscore included) - confirm against the data dir.
dblp_part_regex = re.compile(r"author_indexed_fos_")

# Lazy, single-use iterator over the matching file names.
part_files_filter_obj = filter(dblp_part_regex.match, onlyfiles)

In [4]:
# Materialise the filter iterator; it can only be consumed once.
part_files = [*part_files_filter_obj]

In [5]:
# Read every part file in a worker process. Pool.map blocks until all files
# are done and returns the results as a list in the order of `part_files`.
list_of_dataframes = pool.map(fos_reader.reader_fos, part_files)

In [6]:
# Stack the per-file results end to end and renumber the index from zero.
df = pandas.concat(objs=list_of_dataframes, ignore_index=True)

In [7]:
# Turn each element into a NumPy array so the arrays can be concatenated.
lists = df.map(np.array)

In [8]:
# Flatten: join all per-record arrays into one long one-dimensional array.
np_with_fos = np.concatenate(list(lists))

In [9]:
# Wrap the flat array in a Series so pandas grouping can be applied to it.
df_with_fos = pandas.Series(data=np_with_fos)

In [10]:
# Group the values by themselves and count each group: the result is indexed
# by distinct value (in sorted key order) and holds its number of occurrences.
grouped_by_value = df_with_fos.groupby(by=df_with_fos)
counted_fos = grouped_by_value.count()

In [11]:
# Display the per-value occurrence counts (pandas abbreviates long output).
counted_fos

0            689
1        1027249
10        168098
100        43831
1000       11027
          ...   
99995          1
99996          1
99997          1
99998          2
99999          2
Length: 107084, dtype: int64

In [12]:
# Order the counts from most to least frequent.
sorted_fos = counted_fos.sort_values(ascending=False)

In [13]:
# Display the counts in descending order of frequency.
sorted_fos

17       2606697
1        1027249
12        904733
31        482419
3         420260
          ...   
47073          1
89490          1
89488          1
89487          1
32542          1
Length: 107084, dtype: int64

In [14]:
# Keep the first ten rows of the descending ranking, i.e. the ten most
# frequent values.
hot_topics = sorted_fos.iloc[:10]

In [15]:
# Display the ten most frequent values together with their counts.
hot_topics

17     2606697
1      1027249
12      904733
31      482419
3       420260
195     377590
122     371681
107     364725
9       346928
52      323070
dtype: int64

In [16]:
import json

# Load the mapping from field-of-study name to its integer id. JSON text is
# UTF-8, so the encoding is pinned rather than left to the platform default
# (which would corrupt non-ASCII names on some systems).
with open("fos_index_map.json", encoding="utf-8") as map_file:
    map_fos_to_int = json.load(map_file)

# Inverse lookup: integer id -> field-of-study name. Should two names share
# an id, the one that appears later in the file wins.
map_int_to_fos = {value: key for key, value in map_fos_to_int.items()}

In [17]:
# reset_index() exposes the unnamed index of `hot_topics` as a column called
# "index"; each id in it is cast to int and looked up in the id -> name map.
hot_topic_ids = hot_topics.reset_index()["index"]
fos_index = hot_topic_ids.apply(lambda raw_id: map_int_to_fos[int(raw_id)])

In [18]:
# Rebuild the id -> name lookup by inverting the name -> id map.
# NOTE(review): the same inversion is already performed in the cell that
# loads "fos_index_map.json", so this cell looks redundant.
map_int_to_fos = {fos_id: fos_name for fos_name, fos_id in map_fos_to_int.items()}

In [19]:
# Spot-check the inverted map: show the name stored under id 52.
map_int_to_fos[52]

'Discrete mathematics'

In [20]:
# Turn the ranking into a two-column frame: the former index becomes a
# regular column next to the counts.
ht = hot_topics.reset_index(drop=False)

In [21]:
# Overwrite the id column with the translated names (aligned on row index).
ht["index"] = fos_index

In [22]:
# Give the two columns human-readable headers for the final table.
column_titles = ["Name of field of study", "Number of occurences"]
ht.columns = column_titles

In [23]:
# Render the table with its caption (Polish for "Table 1. Summary of the most
# popular keywords"); the Styler returned by set_caption is the cell output.
caption_text = "Tabela 1. Zestawienie najbardziej popularnych słów kluczowych"
ht.style.set_caption(caption_text)

Unnamed: 0,Name of field of study,Number of occurences
0,Computer science,2606697
1,Artificial intelligence,1027249
2,Mathematics,904733
3,Machine learning,482419
4,Mathematical optimization,420260
5,Real-time computing,377590
6,Distributed computing,371681
7,Computer vision,364725
8,Pattern recognition,346928
9,Discrete mathematics,323070


In [24]:
# Number of distinct ids in the id -> name map.
len(map_int_to_fos)

107084