In [1]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion layer before any R object is loaded.
pandas2ri.activate()

In [4]:
# Look up R's readRDS, read the serialized query results, and pass the
# returned R object through rpy2's R-to-Python conversion.
readRDS = robjects.r['readRDS']
rds_path = '../data/pt_ngrams_severstal_wind_2006_2015_/8_severstal_wind_2006_2015__query_results.RDS'
df = pandas2ri.ri2py(readRDS(rds_path))

In [48]:
import os

# Write every top-level entry of the loaded result that is an R data.frame
# to its own CSV file, named after the entry, inside `folder`.
folder = 'severstal'
# exist_ok=True replaces the check-then-create pair: no race between the
# existence test and the mkdir, and the cell can be re-run safely.
os.makedirs(folder, exist_ok=True)

for i, name in enumerate(df.names):
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of the rpy2 DataFrame; entries of any other type are skipped.
    if isinstance(df[i], robjects.vectors.DataFrame):
        df[i].to_csvfile('./%s/%s.csv' % (folder, name))

In [5]:
# Print a two-level outline of the loaded result: each top-level entry with
# its position, followed by the indexed names of its members (or NULL when
# the entry has no names).
for outer_idx, outer_name in enumerate(df.names):
    print('[%d] %s' % (outer_idx, outer_name))
    inner_names = df[outer_idx].names
    if inner_names:
        for inner_idx, inner_name in enumerate(inner_names):
            print('    [%d] %s' % (inner_idx, inner_name))
    else:
        print('    NULL')
    print()

[0] specificity_statistics
    [0] id
    [1] ngram
    [2] ngramids
    [3] corpus_sum
    [4] ngram_length
    [5] specificity
    [6] in_degree
    [7] out_degree
    [8] cluster
    [9] n_types

[1] occurrence_by_year
    [0] ngram
    [1] pubyear
    [2] count
    [3] rel_freq

[2] cooccurrence_by_year
    [0] ngram1
    [1] ngram2
    [2] pubyear
    [3] count

[3] corpus_occurrence_by_year
    [0] pubyear
    [1] freq

[4] technology_cooccurrence
    NULL



In [9]:
import pandas as pd

# Columns 0 and 1 of entry 2 (cooccurrence_by_year: ngram1, ngram2) as pandas
# Series, so they can be filtered with boolean masks.
source, target = (pd.Series(list(df[2][column])) for column in (0, 1))

In [8]:
from collections import Counter
def connections(ngram):
    """Tally the n-grams paired with `ngram` in the co-occurrence table.

    Reads the module-level `source` and `target` Series. A pairing is
    counted once per matching row, regardless of which side `ngram` is on.

    Returns a Counter mapping each partner n-gram to its number of rows.
    """
    partners_when_source = list(target[source == ngram])
    partners_when_target = list(source[target == ngram])
    return Counter(partners_when_source + partners_when_target)

In [10]:
# Spot check: tally the n-grams that share a co-occurrence row with 'Adipic acid'.
connections('Adipic acid')

Counter({'Biodegradation': 1,
         'Casting (metalworking)': 1,
         'Chemical structure': 1,
         'Corrosion inhibitor': 2,
         'Glutaric acid': 2,
         'Malonic acid': 2,
         'Oxalic acid': 2,
         'Property damage': 1,
         'Sebacic acid': 2,
         'Succinic acid': 2})

In [11]:
# Number of distinct n-grams paired with 'Casting (metalworking)'.
len(connections('Casting (metalworking)'))

1

In [12]:
# Collect every n-gram that appears in any of the three tables:
#   df[0][1] - specificity_statistics.ngram
#   df[1][0] - occurrence_by_year.ngram
#   df[2][0], df[2][1] - cooccurrence_by_year.ngram1 / ngram2
# Sorting makes the id assigned to each n-gram reproducible across kernel
# restarts (plain set iteration order depends on string hashing) and mirrors
# how `years` is built. key=str keeps the sort well-defined even if a
# non-string value (e.g. an R NA) is present -- NOTE(review): confirm whether
# such values can occur in these columns.
ngrams = sorted(set().union(df[0][1], df[1][0], df[2][0], df[2][1]), key=str)
# Position of each n-gram in `ngrams`, used as its node id.
ngrams_ids = {ngram: pos for pos, ngram in enumerate(ngrams)}
len(ngrams)

2877

In [13]:
# Every publication year seen in occurrence_by_year (df[1][1]),
# cooccurrence_by_year (df[2][2]) or corpus_occurrence_by_year (df[3][0]),
# de-duplicated and in ascending order.
years = sorted(set().union(df[1][1], df[2][2], df[3][0]))
# Position of each year in the sorted list.
years_ids = dict((year, pos) for pos, year in enumerate(years))
min_year = min(years)
max_year = max(years)
min_year, max_year

(2005, 2015)

In [14]:
# Accumulator for the export: the following cells add column-style lists to it
# and the final cell dumps it to JSON.
res = {}

In [15]:
# One edge per row of cooccurrence_by_year (df[2]): both n-grams are replaced
# by their node ids, year and count are coerced to plain ints.
edge_rows = list(zip(df[2][0], df[2][1], df[2][2], df[2][3]))
res['edge_source'] = [ngrams_ids[first] for first, second, when, weight in edge_rows]
res['edge_target'] = [ngrams_ids[second] for first, second, when, weight in edge_rows]
res['edge_date'] = [int(when) for first, second, when, weight in edge_rows]
res['edge_weight'] = [int(weight) for first, second, when, weight in edge_rows]

In [16]:
# Node labels in id order: the label at index i is the n-gram whose id is i.
res['node_label'] = list(ngrams)

In [17]:
# Per-node, per-year occurrence counts taken from occurrence_by_year (df[1]):
# column 0 is the n-gram, column 1 the publication year, column 2 the count.
res['nodeDate_node'] = []
res['nodeDate_date'] = []
res['nodeDate_count'] = []
for ngram, year, count in zip(df[1][0], df[1][1], df[1][2]):
    res['nodeDate_node'].append(ngrams_ids[ngram])
    # Coerce to a plain int, exactly as 'edge_date' does, so both date columns
    # have the same type in the exported JSON and the value is always
    # JSON-serializable.
    res['nodeDate_date'].append(int(year))
    res['nodeDate_count'].append(int(count))

In [11]:
import json

# Serialize the accumulated columns to JSON. The with-block guarantees the
# file is flushed and closed even if serialization raises; the previous bare
# open() left the handle for the garbage collector to close.
with open('../data/pt_ngrams_severstal_wind_2006_2015_.json', 'w') as out_file:
    json.dump(res, out_file)