In [8]:
import numpy as np
import torch
import random

from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import warnings
warnings.filterwarnings("ignore")

In [7]:
#!/usr/bin/env python3
"""
Script to download the Cora dataset from available sources
"""
import os
import requests
import zipfile
import tarfile
import shutil
from pathlib import Path

def download_file(url, filename):
    """Download a file from URL"""
    print(f"Downloading from {url}...")
    response = requests.get(url, stream=True)
    response.raise_for_status()

    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded {filename}")

def setup_cora_planetoid():
    """Download Cora from Planetoid (primary source)"""
    print("\nAttempting to download Cora dataset from Planetoid format...")

    # Create directories
    os.makedirs("datasets/cora_orig", exist_ok=True)

    try:
        # Download from GitHub mirror of Planetoid datasets
        base_url = "https://raw.githubusercontent.com/kimiyoung/planetoid/master/data"

        files_to_download = [
            ("ind.cora.x", "cora.x"),
            ("ind.cora.y", "cora.y"),
            ("ind.cora.tx", "cora.tx"),
            ("ind.cora.ty", "cora.ty"),
            ("ind.cora.allx", "cora.allx"),
            ("ind.cora.ally", "cora.ally"),
            ("ind.cora.graph", "cora.graph"),
            ("ind.cora.test.index", "cora.test.index")
        ]

        temp_dir = "temp_cora_download"
        os.makedirs(temp_dir, exist_ok=True)

        # Download all files
        for remote_file, local_file in files_to_download:
            url = f"{base_url}/{remote_file}"
            download_file(url, os.path.join(temp_dir, local_file))

        print("\nConverting Planetoid format to standard Cora format...")

        # Convert to standard format
        convert_planetoid_to_standard(temp_dir, "datasets/cora_orig")

        # Clean up
        shutil.rmtree(temp_dir)

        print("✓ Successfully downloaded and converted Cora dataset!")
        return True

    except Exception as e:
        print(f"Failed to download from Planetoid: {e}")
        return False

def convert_planetoid_to_standard(input_dir, output_dir):
    """Convert Planetoid format to standard Cora format"""
    import pickle
    import numpy as np

    # Load data
    with open(os.path.join(input_dir, "cora.x"), 'rb') as f:
        x = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.y"), 'rb') as f:
        y = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.tx"), 'rb') as f:
        tx = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.ty"), 'rb') as f:
        ty = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.allx"), 'rb') as f:
        allx = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.ally"), 'rb') as f:
        ally = pickle.load(f, encoding='latin1')
    with open(os.path.join(input_dir, "cora.graph"), 'rb') as f:
        graph = pickle.load(f, encoding='latin1')

    # Read test indices
    test_idx_reorder = []
    with open(os.path.join(input_dir, "cora.test.index"), 'r') as f:
        for line in f:
            test_idx_reorder.append(int(line.strip()))

    # Process features
    features = np.vstack((allx, tx))
    features[test_idx_reorder, :] = features[len(allx):, :]
    features = features[:len(allx), :]

    # Process labels
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[len(ally):, :]
    labels = labels[:len(ally), :]
    labels = np.argmax(labels, axis=1)

    # Class names
    class_names = ['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                   'Probabilistic_Methods', 'Reinforcement_Learning',
                   'Rule_Learning', 'Theory']

    # Create content file
    content_file = os.path.join(output_dir, "cora.content")
    with open(content_file, 'w') as f:
        for i in range(features.shape[0]):
            # Node ID
            f.write(f"{i}\t")
            # Features
            feature_str = '\t'.join([str(int(x)) for x in features[i]])
            f.write(f"{feature_str}\t")
            # Class label
            f.write(f"{class_names[labels[i]]}\n")

    # Create cites file
    cites_file = os.path.join(output_dir, "cora.cites")
    edges = set()
    for node, neighbors in graph.items():
        for neighbor in neighbors:
            edges.add((node, neighbor))

    with open(cites_file, 'w') as f:
        for src, dst in sorted(edges):
            f.write(f"{src}\t{dst}\n")

    print(f"Created {content_file} and {cites_file}")

def download_cora_alternative():
    """Alternative: Download from other sources"""
    print("\nTrying alternative download source...")

    try:
        # Alternative URL (from LINQS)
        url = "https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz"

        os.makedirs("datasets", exist_ok=True)
        temp_file = "cora.tgz"

        download_file(url, temp_file)

        # Extract
        print("Extracting files...")
        with tarfile.open(temp_file, 'r:gz') as tar:
            tar.extractall("datasets")

        # Rename directory if needed
        if os.path.exists("datasets/cora"):
            os.rename("datasets/cora", "datasets/cora_orig")

        # Clean up
        os.remove(temp_file)

        print("✓ Successfully downloaded Cora dataset from alternative source!")
        return True

    except Exception as e:
        print(f"Failed to download from alternative source: {e}")
        return False

def verify_dataset():
    """Verify that the dataset was downloaded correctly"""
    required_files = [
        "datasets/cora_orig/cora.content",
        "datasets/cora_orig/cora.cites"
    ]

    all_present = True
    for file in required_files:
        if os.path.exists(file):
            print(f"✓ {file} exists")
        else:
            print(f"✗ {file} is missing")
            all_present = False

    if all_present:
        # Check file contents
        with open("datasets/cora_orig/cora.content", 'r') as f:
            num_nodes = sum(1 for _ in f)
        with open("datasets/cora_orig/cora.cites", 'r') as f:
            num_edges = sum(1 for _ in f)

        print(f"\nDataset statistics:")
        print(f"- Number of nodes: {num_nodes}")
        print(f"- Number of edges: {num_edges}")

    return all_present

def main():
    print("Cora Dataset Downloader")
    print("=" * 50)

    # Check if dataset already exists
    if os.path.exists("datasets/cora_orig/cora.content") and \
       os.path.exists("datasets/cora_orig/cora.cites"):
        print("Dataset already exists!")
        verify_dataset()
        return

    # Try downloading from different sources
    success = setup_cora_planetoid()

    if not success:
        success = download_cora_alternative()

    if success:
        print("\n" + "=" * 50)
        print("Verifying dataset...")
        verify_dataset()
    else:
        print("\n" + "=" * 50)
        print("Failed to download Cora dataset from all sources.")
        print("\nManual download instructions:")
        print("1. Visit: https://linqs.soe.ucsc.edu/data")
        print("2. Download the Cora dataset")
        print("3. Extract to ./datasets/cora_orig/")
        print("4. Ensure you have cora.content and cora.cites files")

if __name__ == "__main__":
    main()

Cora Dataset Downloader

Attempting to download Cora dataset from Planetoid format...
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.x...
Downloaded temp_cora_download/cora.x
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.y...
Downloaded temp_cora_download/cora.y
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.tx...
Downloaded temp_cora_download/cora.tx
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.ty...
Downloaded temp_cora_download/cora.ty
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.allx...
Downloaded temp_cora_download/cora.allx
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.ally...
Downloaded temp_cora_download/cora.ally
Downloading from https://raw.githubusercontent.com/kimiyoung/planetoid/master/data/ind.cora.gr

  x = pickle.load(f, encoding='latin1')
  tx = pickle.load(f, encoding='latin1')
  allx = pickle.load(f, encoding='latin1')


Downloaded cora.tgz
Extracting files...
✓ Successfully downloaded Cora dataset from alternative source!

Verifying dataset...
✓ datasets/cora_orig/cora.content exists
✓ datasets/cora_orig/cora.cites exists

Dataset statistics:
- Number of nodes: 2708
- Number of edges: 5429


In [3]:
torch.cuda.is_available()

True

In [4]:
dataset = Planetoid('./datasets', 'cora',
                        transform=T.NormalizeFeatures())
data = dataset[0]

In [5]:
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [6]:
def parse_cora():
    path = './datasets/cora_orig/cora'
    idx_features_labels = np.genfromtxt(
        "{}.content".format(path), dtype=np.dtype(str))
    data_X = idx_features_labels[:, 1:-1].astype(np.float32)
    labels = idx_features_labels[:, -1]
    class_map = {x: i for i, x in enumerate(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                                            'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning', 'Theory'])}
    data_Y = np.array([class_map[l] for l in labels])
    data_citeid = idx_features_labels[:, 0]
    idx = np.array(data_citeid, dtype=np.dtype(str))
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(
        "{}.cites".format(path), dtype=np.dtype(str))
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape(
        edges_unordered.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype='int')
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    return data_X, data_Y, data_citeid, np.unique(data_edges, axis=0).transpose()

parse_cora()


FileNotFoundError: ./datasets/cora_orig/cora.content not found.

In [4]:



# return cora dataset as pytorch geometric Data object together with 60/20/20 split, and list of cora IDs


def get_cora_casestudy(SEED=0):
    data_X, data_Y, data_citeid, data_edges = parse_cora()
    # data_X = sklearn.preprocessing.normalize(data_X, norm="l1")

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data_name = 'cora'
    # path = osp.join(osp.dirname(osp.realpath(__file__)), 'dataset')
    dataset = Planetoid('./datasets', data_name,
                        transform=T.NormalizeFeatures())
    data = dataset[0]

    data.x = torch.tensor(data_X).float()
    data.edge_index = torch.tensor(data_edges).long()
    data.y = torch.tensor(data_Y).long()
    data.num_nodes = len(data_Y)

    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    # data.train_id = np.sort(node_id[:int(data.num_nodes * 0.1)])
    # data.val_id = np.sort(
    #     node_id[int(data.num_nodes * 0.1):int(data.num_nodes * 0.2)])
    # data.test_id = np.sort(node_id[int(data.num_nodes * 0.2):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])

    return data, data_citeid

# credit: https://github.com/tkipf/pygcn/issues/27, xuhaiyun


def parse_cora():
    path = './datasets/cora_orig/cora'
    idx_features_labels = np.genfromtxt(
        "{}.content".format(path), dtype=np.dtype(str))
    data_X = idx_features_labels[:, 1:-1].astype(np.float32)
    labels = idx_features_labels[:, -1]
    class_map = {x: i for i, x in enumerate(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                                            'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning', 'Theory'])}
    data_Y = np.array([class_map[l] for l in labels])
    data_citeid = idx_features_labels[:, 0]
    idx = np.array(data_citeid, dtype=np.dtype(str))
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(
        "{}.cites".format(path), dtype=np.dtype(str))
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape(
        edges_unordered.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype='int')
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    return data_X, data_Y, data_citeid, np.unique(data_edges, axis=0).transpose()


def get_raw_text_cora(use_text=False, seed=0):
    data, data_citeid = get_cora_casestudy(seed)
    if not use_text:
        return data, None

    with open('./datasets/cora_orig/mccallum/cora/papers')as f:
        lines = f.readlines()
    pid_filename = {}
    for line in lines:
        pid = line.split('\t')[0]
        fn = line.split('\t')[1]
        pid_filename[pid] = fn

    path = './datasets/cora_orig/mccallum/cora/extractions/'
    text = []
    for pid in data_citeid:
        fn = pid_filename[pid]
        with open(path+fn) as f:
            lines = f.read().splitlines()

        for line in lines:
            if 'Title:' in line:
                ti = line
            if 'Abstract:' in line:
                ab = line
        text.append(ti+'\n'+ab)
    return data, text


FileNotFoundError: ./datasets/cora_orig/cora.content not found.