# Reorganize files

I ran this notebook from inside the neurosynth_folder and moved it afterward, which is why all of the paths are relative to that folder.

Also, I skipped version 0.2 because the files were formatted differently and I didn't think it was worth the effort to reorganize them.

In [1]:
import os.path as op

import numpy as np
import pandas as pd

from scipy import sparse

In [2]:
# Original database dump (relative path) -> name of the reorganized output file.
# Each entry pairs a release folder with the version number used in the output name.
database_files = {
    f"{folder}/database.txt": f"data-neurosynth_version-{version}_database.tsv.gz"
    for folder, version in [
        ("current_data", 7),
        ("data_0.6.July_2015", 6),
        ("data_0.5.February_2015", 5),
        ("data_0.4.September_2014", 4),
        ("data_0.3.April_2014", 3),
        # ("data_0.2.May_2013", 2),  # skipped: files are formatted differently
    ]
}
# Original term-feature table (relative path) -> the files involved in splitting it:
#   "database": reorganized database file for the same version (read, not written)
#   "vocab":    output list of feature names
#   "ids":      list of study IDs
#   "features": output sparse feature matrix
feature_files = {
    f"{folder}/features.txt": {
        "database": f"data-neurosynth_version-{version}_database.tsv.gz",
        "vocab": f"data-neurosynth_vocab-terms_version-{version}_vocabulary.txt",
        "ids": f"data-neurosynth_version-{version}_ids.txt",
        "features": (
            "data-neurosynth_source-abstract_vocab-terms_type-tfidf_"
            f"version-{version}_features.npz"
        ),
    }
    for folder, version in [
        ("current_data", 7),
        ("data_0.6.July_2015", 6),
        ("data_0.5.February_2015", 5),
        ("data_0.4.September_2014", 4),
        ("data_0.3.April_2014", 3),
        # ("data_0.2.May_2013", 2),  # skipped; its draft entry used ".tsv" for the
        #                            # vocab and ids files instead of ".txt"
    ]
}
# Original LDA topic-weight table (relative path) -> the files involved in splitting it.
# The "v5-topics" tables are paired with version-7 files and the "v4-topics" tables with
# version-6 files; each release has 50-, 100-, 200-, and 400-topic tables.
topic_feature_files = {
    f"topics/{release}/analyses/{release}-{n_topics}.txt": {
        "database": f"data-neurosynth_version-{version}_database.tsv.gz",
        "vocab": f"data-neurosynth_vocab-LDA{n_topics}_version-{version}_vocabulary.txt",
        "ids": f"data-neurosynth_version-{version}_ids.txt",
        "features": (
            f"data-neurosynth_source-abstract_vocab-LDA{n_topics}_type-weight_"
            f"version-{version}_features.npz"
        ),
    }
    for release, version in [("v5-topics", 7), ("v4-topics", 6)]
    for n_topics in (50, 100, 200, 400)
}

In [3]:
def reorganize_database_file(in_file, out_file):
    """Sort a tab-delimited database file by its "id" column and write it out.

    Parameters
    ----------
    in_file : str
        Path to the original tab-delimited database file. It must contain
        an "id" column.
    out_file : str
        Path of the tab-delimited file to write. If this file already exists
        the function prints a "Skipping" notice and does nothing else.
        The output is written with "\\n" line endings and without the
        DataFrame index.

    Returns
    -------
    None
    """
    print(f"Processing {in_file}")
    if op.isfile(out_file):
        print("\tSkipping")
        return

    sorted_df = pd.read_table(in_file).sort_values(by="id")

    # pandas renamed ``line_terminator`` to ``lineterminator`` (1.5) and later
    # removed the old spelling (2.0). Try the current keyword first and fall
    # back to the old one so the function runs on either side of the rename.
    try:
        sorted_df.to_csv(out_file, sep="\t", lineterminator="\n", index=False)
    except TypeError:
        sorted_df.to_csv(out_file, sep="\t", line_terminator="\n", index=False)
    
def reorganize_feature_file(in_file, dict_):
    """Split a feature table into vocabulary, ID, and sparse-matrix files.

    The input is a tab-delimited table with one row per ID and one column per
    feature. It is reordered to match the IDs file (or sorted by ID when no
    IDs file exists yet) and written out as three pieces.

    Parameters
    ----------
    in_file : str
        Path to the tab-delimited feature table. The ID column must be named
        "pmid" or, failing that, "id".
    dict_ : dict
        Paths of the related files, with keys:

        - "database": existing tab-delimited file with an "id" column (read only).
        - "vocab": text file to write, one feature (column) name per line.
        - "ids": text file with one ID per line. Read if it exists,
          otherwise written.
        - "features": ``.npz`` file to write, holding the feature values as a
          compressed CSC sparse matrix. If it already exists the function
          prints a "Skipping" notice and does nothing else.

    Raises
    ------
    AssertionError
        If the vocabulary file already exists while the features file does not.
    ValueError
        If the IDs in the feature table, the IDs file, and the database file
        are not exactly the same set.
    """
    print(f"Processing {in_file}")
    database_file = dict_["database"]
    vocab_file = dict_["vocab"]
    ids_file = dict_["ids"]
    features_file = dict_["features"]

    # Decide whether there is anything to do before loading any data.
    if op.isfile(features_file):
        print("\tSkipping")
        return

    assert not op.isfile(vocab_file), (
        f"{vocab_file} already exists although {features_file} does not"
    )

    database = pd.read_table(database_file)
    database_id_set = set(database["id"].astype(str))

    # Fall back to an "id" column when the table has no "pmid" column.
    try:
        original_df = pd.read_table(in_file, index_col="pmid")
    except ValueError:
        original_df = pd.read_table(in_file, index_col="id")

    original_df.index = original_df.index.astype(str)
    feature_ids = original_df.index.tolist()

    ids_file_exists = op.isfile(ids_file)
    if ids_file_exists:
        # genfromtxt returns a 0-d array for a one-line file, whose .tolist()
        # is a bare string; atleast_1d guarantees a list of IDs.
        ids_file_ids = np.atleast_1d(np.genfromtxt(ids_file, dtype=str)).tolist()
    else:
        ids_file_ids = sorted(feature_ids)

    feature_id_set = set(feature_ids)
    ids_file_id_set = set(ids_file_ids)

    # All three sources must describe exactly the same set of IDs.
    mismatches = (
        (ids_file_id_set - feature_id_set, f"IDs file but not {in_file}"),
        (feature_id_set - ids_file_id_set, f"{in_file} but not IDs file"),
        (database_id_set - feature_id_set, f"DB file but not {in_file}"),
        (feature_id_set - database_id_set, f"{in_file} but not DB file"),
    )
    for unmatched, description in mismatches:
        if unmatched:
            raise ValueError(f"{len(unmatched)} found in {description}")

    # Ensure same order
    original_df = original_df.loc[ids_file_ids]

    # Now split into data, vocab, and ids
    feature_data = original_df.to_numpy()
    feature_vocab = original_df.columns.tolist()
    feature_ids = original_df.index.tolist()

    # Output vocab
    with open(vocab_file, "w") as fo:
        fo.write("\n".join(feature_vocab))

    # Convert to Compressed Sparse Column format sparse matrix
    # and save to file
    feature_data_sparse = sparse.csc_matrix(feature_data)
    sparse.save_npz(features_file, feature_data_sparse, compressed=True)

    # Output IDS
    if not ids_file_exists:
        with open(ids_file, "w") as fo:
            fo.write("\n".join(feature_ids))

In [4]:
# Sort each original database file and write it under its new name.
for original_path, reorganized_path in database_files.items():
    reorganize_database_file(original_path, reorganized_path)

Processing current_data/database.txt
	Skipping
Processing data_0.6.July_2015/database.txt
	Skipping
Processing data_0.5.February_2015/database.txt
	Skipping
Processing data_0.4.September_2014/database.txt
	Skipping
Processing data_0.3.April_2014/database.txt
	Skipping


In [5]:
# Split each term-feature table into its vocabulary, IDs, and sparse-matrix files.
for original_path, related_files in feature_files.items():
    reorganize_feature_file(original_path, related_files)

Processing current_data/features.txt
Processing data_0.6.July_2015/features.txt
Processing data_0.5.February_2015/features.txt
Processing data_0.4.September_2014/features.txt
Processing data_0.3.April_2014/features.txt


In [6]:
# Split each topic-weight table the same way as the term-feature tables.
for original_path, related_files in topic_feature_files.items():
    reorganize_feature_file(original_path, related_files)

Processing topics/v5-topics/analyses/v5-topics-50.txt
Processing topics/v5-topics/analyses/v5-topics-100.txt
Processing topics/v5-topics/analyses/v5-topics-200.txt
Processing topics/v5-topics/analyses/v5-topics-400.txt
Processing topics/v4-topics/analyses/v4-topics-50.txt
Processing topics/v4-topics/analyses/v4-topics-100.txt
Processing topics/v4-topics/analyses/v4-topics-200.txt
Processing topics/v4-topics/analyses/v4-topics-400.txt
