
Commit
Merge pull request #62 from phyng/dev
feat(corr)
phyng committed Jan 14, 2024
2 parents 2f4c73d + a701618 commit 5037815
Showing 13 changed files with 1,859 additions and 3,333 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/githubpages.yaml
@@ -30,8 +30,6 @@ jobs:
           touch build/html/.nojekyll
       - name: Deploy
-        uses: JamesIves/github-pages-deploy-action@releases/v3
+        uses: JamesIves/github-pages-deploy-action@releases/v4
         with:
           ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
           BRANCH: gh-pages # The branch the action should deploy to.
-          FOLDER: build/html # The folder the action should deploy.
+          folder: build/html # The folder the action should deploy.
4,857 changes: 1,543 additions & 3,314 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions pyproject.toml
@@ -21,7 +21,6 @@ scilib-gender-benchmark = "scilib.gender.benchmark.benchmark:run"
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
 pandas = "^1.5.3"
-jupyter = "^1.0.0"
 numpy = "^1.24.3"
 xlrd = "^2.0.1"
 matplotlib = "^3.7.1"
@@ -34,7 +33,6 @@ pypinyin = "^0.48"
 tabulate = "^0.9.0"
 pyquery = "^2.0.0"
 libs = "^0.0.10"
-jupyterlab = "^3.6.3"
 nameparser = "^1.1.2"
 Unidecode = "^1.3.6"
 aiohttp = "^3.8.4"
@@ -64,5 +62,5 @@ db = ["redis", "psycopg2", "SQLAlchemy", "pandas_access"]
 gender = ["agefromname", "gender-guesser", "genderizer", "naiveBayesClassifier"]

 [build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
3 changes: 2 additions & 1 deletion scilib/antv/sankey.py
@@ -3,6 +3,7 @@
 from __future__ import unicode_literals, absolute_import, print_function, division

 import json
+from typing import List
 import pandas as pd
 from optparse import OptionParser
 from collections import Counter
@@ -35,7 +36,7 @@ def make_matrix_csv(nodes, edges):
     return '\n'.join(lines)


-def make_sankey_data(nodes, edges):
+def make_sankey_data(nodes: List[str], edges):
     data_nodes = [dict(name=node) for node in nodes]
     counter = Counter(edges)
     data_links = []
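
A usage sketch for the newly typed make_sankey_data. The edge format (2-tuples of node names) follows the Counter(edges) call in the visible body; the remainder of the function is not shown in this hunk, so the comments about the result are assumptions:

    nodes = ['A', 'B', 'C']
    edges = [('A', 'B'), ('A', 'B'), ('B', 'C')]  # duplicate edges become link weights
    data = make_sankey_data(nodes, edges)
    # data_nodes -> [{'name': 'A'}, {'name': 'B'}, {'name': 'C'}]
    # Counter(edges) collapses the repeated ('A', 'B') pair into a count of 2,
    # which the (not shown) rest of the function presumably turns into weighted links.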
11 changes: 9 additions & 2 deletions scilib/cnki/importer.py
@@ -161,9 +161,16 @@ def parse_txt_file(file_path):
     return articles.values()


-def read_text_format_dir(from_dir):
+def read_text_format_dir(from_dir, keyword_replace_map=None):
     for file in Path(from_dir).glob('**/*.txt'):
-        yield from parse_txt_file(file)
+        for item in parse_txt_file(file):
+            item['fu_tokens'] = parse_fu_tokens(item)
+            item['keyword_tokens'] = parse_keyword_tokens(item, keyword_replace_map=keyword_replace_map)
+            item['parsed_year'] = parse_year(item)
+            item['clc_tokens'] = parse_clc_tokens(item)
+            item['clc_level1_tokens'] = parse_clc_level1_tokens(item)
+            item['clc_level2_tokens'] = parse_clc_level2_tokens(item)
+            yield item


 def read_spider_format(file_path, *, fields=None, keyword_replace_map=None):
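
A usage sketch for the enriched reader (the directory path and printed fields are placeholders; the parse_* helpers are defined elsewhere in importer.py):

    # Every yielded item now carries the derived token fields added above.
    for item in read_text_format_dir('cnki_txt_exports/', keyword_replace_map=None):
        print(item['parsed_year'], item['keyword_tokens'], item['clc_level1_tokens'])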
25 changes: 22 additions & 3 deletions scilib/cnki/report.py
@@ -107,8 +107,11 @@ def report_cnki_keywords(
     cnki_items,
     *,
     outpur_dir,
+    top_size=50,
 ):
-    counter, corrs = get_corrs([item["keyword_tokens"] for item in cnki_items])
+    counter, corrs = get_corrs([item["keyword_tokens"] for item in cnki_items], top_size=top_size)
+    top_n = [token for token, _ in counter.most_common(top_size)]
+
     corrs_csv_string = corrs_to_csv_string(corrs)
     with open(os.path.join(outpur_dir, "keywords.corrs.csv"), "w") as f:
         f.write(corrs_csv_string)
@@ -125,16 +128,27 @@ def report_cnki_keywords(
     with open(os.path.join(outpur_dir, "keywords.cortext.csv"), "w") as f:
         f.write(cortext_network_to_csv_string(cortext_network))

+    # keywords_year_extend
+    top_keywords_year_extend = []
+
     # cortext network with year
     networks = {}
     for year in sorted(set([item["parsed_year"] for item in cnki_items if item["parsed_year"]])):
         year_items = [item for item in cnki_items if item["parsed_year"] == year]
-        _, year_corrs = get_corrs([item["keyword_tokens"] for item in year_items])
+        year_counter, year_corrs = get_corrs([item["keyword_tokens"] for item in year_items])
         networks[int(year)] = corrs_to_cortext_network(year_corrs)
+        for k, v in year_counter.most_common():
+            if k in top_n:
+                top_keywords_year_extend.extend(dict(year=year, keyword=k) for i in range(v))

     year_cortext_networks = merge_year_cortext_networks(networks)
     with open(os.path.join(outpur_dir, "keywords.cortext_with_year.csv"), "w") as f:
         f.write(cortext_network_to_csv_string(year_cortext_networks))

+    pd.DataFrame.from_records(top_keywords_year_extend).to_csv(
+        os.path.join(outpur_dir, "keywords.top.year_extend.csv"), index=False
+    )
+
     # pnetview txt format
     pnetview_text = '\n'.join(
         [','.join(uniqify([t.replace(',', '-') for t in item["keyword_tokens"]])) for item in cnki_items]
@@ -147,7 +161,12 @@ def report_cnki_all(
     cnki_items,
     *,
     outpur_dir,
+    keywords_top_size=50,
 ):
     pd.DataFrame.from_records(cnki_items).to_csv(os.path.join(outpur_dir, "items.csv"), index=False)
-    report_cnki_keywords(cnki_items, outpur_dir=outpur_dir)
+    report_cnki_keywords(
+        cnki_items,
+        outpur_dir=outpur_dir,
+        top_size=keywords_top_size
+    )
     report_cnki_org(cnki_items, outpur_dir=outpur_dir)
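
A sketch of driving the report with the new knob (paths are placeholders; outpur_dir is spelled as in the source):

    items = list(read_text_format_dir('cnki_txt_exports/'))
    # Keep the 30 most frequent keywords instead of the default 50.
    report_cnki_all(items, outpur_dir='out/', keywords_top_size=30)
    # out/keywords.top.year_extend.csv ends up with one (year, keyword) row per
    # occurrence of a top keyword, convenient for year-by-year frequency plots.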
61 changes: 60 additions & 1 deletion scilib/corrs/corrs_utils.py
@@ -2,9 +2,10 @@

 from __future__ import unicode_literals, absolute_import, print_function, division
 from collections import Counter
+import pandas as pd


-def get_corrs(tokens_list, top_size=50):
+def get_corrs_slow(tokens_list, top_size=50):
     counter = Counter([token for tokens in tokens_list for token in tokens])
     top_n = [token for token, _ in counter.most_common(top_size)]

@@ -18,6 +19,24 @@ def get_corrs(tokens_list, top_size=50):
     return [counter, corrs]


+def get_corrs(tokens_list, top_size=50):
+    counter = Counter([token for tokens in tokens_list for token in tokens])
+    top_n = [token for token, _ in counter.most_common(top_size)]
+
+    # Initialize a dataframe to store the co-occurrence matrix
+    co_occurrence_matrix = pd.DataFrame(data=0, index=top_n, columns=top_n)
+
+    for tokens in tokens_list:
+        # Only consider the tokens that are in top_n
+        tokens = set(tokens) & set(top_n)
+        for token1 in tokens:
+            for token2 in tokens:
+                co_occurrence_matrix.at[token1, token2] += 1
+
+    corrs = co_occurrence_matrix.reset_index().values.tolist()
+    return [counter, corrs]
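
A minimal sketch of the output shape of the rewritten get_corrs (values traced by hand from the loop above):

    counter, corrs = get_corrs([['a', 'b'], ['a', 'b'], ['a']], top_size=2)
    # counter -> Counter({'a': 3, 'b': 2})
    # corrs   -> [['a', 3, 2], ['b', 2, 2]]
    # Each row is a token followed by its co-occurrence counts with every
    # top-N token; the diagonal holds the token's own document frequency.

The rewrite accumulates per document with pandas .at lookups over only the tokens present in top_n, rather than rescanning tokens_list for every token pair, which is presumably why the old implementation was renamed get_corrs_slow.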


 def corrs_to_csv_string(corrs, name_map=None):
     name_map = name_map or (lambda x: x)
     lines = []
@@ -26,6 +45,46 @@ def corrs_to_csv_string(corrs, name_map=None):
     return '\n'.join(lines)


+def corrs_to_pnetview_json(corrs):
+    top_value = max([max(row[1:]) for row in corrs])
+    names = [row[0] for row in corrs]
+    nodes = []
+    edges = []
+    for index, row in enumerate(corrs):
+        name = row[0]
+        count = row[index + 1]
+        node = {
+            "id": name,
+            "style": {
+                "keyshape": {
+                    "size": 100 * count / top_value
+                },
+                "label": {
+                    "value": name
+                }
+            }
+        }
+        nodes.append(node)
+        for other_name in names[index + 1:]:
+            value = row[names.index(other_name) + 1]
+            edge = {
+                "source": name,
+                "target": other_name,
+                "value": value,
+                "style": {
+                    "keyshape": {
+                        "lineWidth": 20 * value / top_value
+                    }
+                }
+            }
+            edges.append(edge)
+
+    return {
+        "nodes": nodes,
+        "edges": edges
+    }


 def corrs_to_cortext_network(corrs, header=None, name_map=None):
     header = header or ['keyword1', 'keyword2']
     name_map = name_map or (lambda x: x)
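
Tracing corrs_to_pnetview_json on the tiny corrs from the get_corrs sketch above:

    graph = corrs_to_pnetview_json([['a', 3, 2], ['b', 2, 2]])
    # top_value is 3 (the largest matrix cell), so node 'a' gets keyshape
    # size 100 * 3 / 3 = 100, node 'b' gets 100 * 2 / 3 ≈ 66.7, and the
    # single a-b edge gets lineWidth 20 * 2 / 3 ≈ 13.3.
    # Only names[index + 1:] is walked, so each unordered pair yields one edge.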
15 changes: 14 additions & 1 deletion scilib/stata/action.py
@@ -3,7 +3,7 @@
 from __future__ import unicode_literals, absolute_import, print_function, division

 from itertools import combinations
-from .plugin import summary, reg, nbreg, psm, margins, label
+from .plugin import summary, reg, nbreg, psm, margins, label, teteffects, did


 def use_actions(actions, config_actions):
@@ -21,6 +21,19 @@ def use_actions(actions, config_actions):
             actions.append(margins(action["vars"], title=action.get('title'), xtitle=action.get('xtitle'), ytitle=action.get('ytitle')))  # noqa
         elif action['type'] == 'psm':
             actions.append(psm(action["treatVar"], action["vars"], action["depVar"]))
+        elif action['type'] == 'teteffects':
+            actions.append(teteffects(action["treatVar"], action["vars"], action["depVar"]))
+        elif action['type'] == 'did':
+            """
+            {
+                "type": "did",
+                "time": "time",
+                "treated": "treated",
+                "y": "y",
+                "cov": "x1 x2 x3"
+            }
+            """
+            actions.append(did(action["time"], action["treated"], action["y"], action["cov"]))
         elif action['type'] == 'combinations':
             for var_list in combinations(action['vars'].split(), action['count']):
                 for _action in action['actions']:
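
A sketch of a config_actions list exercising the two new action types (variable names are placeholders; the "did" dict mirrors the docstring embedded in the branch above):

    actions = []
    use_actions(actions, [
        {"type": "teteffects", "treatVar": "treated", "vars": "x1 x2 x3", "depVar": "y"},
        {"type": "did", "time": "time", "treated": "treated", "y": "y", "cov": "x1 x2 x3"},
    ])
    # actions now holds the batched Stata calls produced by teteffects() and did().

Note that the "did" branch reads action["cov"] unconditionally, so a config omitting cov raises KeyError even though did() itself defaults cov to ''.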
22 changes: 22 additions & 0 deletions scilib/stata/plugin.py
@@ -85,6 +85,28 @@ def margins(var_list, title=None, xtitle=None, ytitle=None):
     )


+def teteffects(var_treat, var_deps, var_result):
+    return call_batch(
+        call('* teteffects ipw'),
+        call(f'teffects ipw ({var_result}) ({var_treat} {var_deps})'),
+
+        call('* teteffects aipw'),
+        call(f'teffects aipw ({var_result}) ({var_treat} {var_deps})'),
+    )
+
+
+def did(time, treated, y, cov=''):
+    return call_batch(
+        call('* DID 分析'),
+        call(f'reg {y} {time}##{treated} {cov}, r'),
+        call(f'diff {y}, t({treated}) p({time}) {f"cov({cov})" if cov else ""}'),
+        call(f'diff {y}, t({treated}) p({time}) {f"cov({cov})" if cov else ""} test'),
+        call(f'collapse (mean) {y}, by({time} {treated})'),
+        call(f'twoway (line {y} {time} if {treated}==1) (line {y} {time} if {treated}==0), legend(label(1 Treated) label(2 Control))'),  # noqa
+        call('graph export did.pdf, replace'),
+    )


 def psm(var_treat, var_deps, var_result, word_file='mytable.docx'):

     def graph(suffix):
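
Assuming call() wraps a single Stata command string and call_batch() collects them in order (neither helper is shown in this hunk), a did() invocation expands per the f-strings above:

    batch = did('post', 'treated', 'outcome', 'x1 x2')
    # Emits roughly:
    #   * DID 分析
    #   reg outcome post##treated x1 x2, r
    #   diff outcome, t(treated) p(post) cov(x1 x2)
    #   diff outcome, t(treated) p(post) cov(x1 x2) test
    #   collapse (mean) outcome, by(post treated)
    #   twoway (line outcome post if treated==1) (line outcome post if treated==0), legend(label(1 Treated) label(2 Control))
    #   graph export did.pdf, replace
    # `diff` here appears to be the user-written Stata package (ssc install diff),
    # not a base command (an assumption about this environment).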
2 changes: 1 addition & 1 deletion scilib/stata/run.py
@@ -25,7 +25,7 @@
     filemode='a',
 )

-STATA_ENTRY = os.environ.get('STATA_ENTRY', '/Applications/Stata/StataSE.app/Contents/MacOS/StataSE')
+STATA_ENTRY = os.environ.get('STATA_ENTRY', '/Applications/Stata/StataMP.app/Contents/MacOS/StataMP')
 logger = logging.getLogger('stata')
 logger.addHandler(logging.StreamHandler())
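
Since run.py resolves the binary via os.environ.get, a machine with a different Stata flavor can override the new StataMP default without touching the code (the path shown is the old default):

    import os
    os.environ['STATA_ENTRY'] = '/Applications/Stata/StataSE.app/Contents/MacOS/StataSE'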
(Diffs for the remaining 3 changed files are not rendered.)