Skip to content

Commit

Permalink
Improve performance of load_libsvm
Browse files Browse the repository at this point in the history
  • Loading branch information
mwydmuch committed Feb 6, 2021
1 parent 28ad9d2 commit 552b965
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 12 deletions.
1 change: 0 additions & 1 deletion python/tests/test_compare_with_xclib_measures.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import sys
import os
from time import time
import numpy as np
Expand Down
34 changes: 34 additions & 0 deletions python/tests/test_load_libsvm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from time import time
from sklearn.datasets import load_svmlight_file
from napkinxc.datasets import download_dataset, load_libsvm_file
import numpy as np


def test_load_libsvm():
    """Compare napkinXC's load_libsvm_file against sklearn's load_svmlight_file.

    For every dataset listed below, the train split is downloaded and the same
    file is read with both loaders. The elapsed time of each loader is printed,
    and then the indptr / indices / data arrays of the two feature matrices and
    the per-row label collections are asserted to match.
    """
    # Dataset name -> path of the train file and the keyword arguments that
    # are forwarded to sklearn's loader for that file.
    datasets = {
        "eurlex-4k": {
            "file": "data/Eurlex/eurlex_train.txt",
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 5000, "offset": 1},
        },
        "amazonCat-13k": {
            "file": "data/AmazonCat/amazonCat_train.txt",
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 203882, "offset": 1},
        },
        "wiki10-31k": {
            "file": "data/Wiki10/wiki10_train.txt",
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 101938, "offset": 1},
        },
    }

    for name, spec in datasets.items():
        download_dataset(name, "train")
        print("\n{} time comparison:".format(name))
        path = spec["file"]

        # Reference loader (sklearn).
        started = time()
        sk_X, sk_Y = load_svmlight_file(path, **spec["sklearn_args"])
        print("\tsklearn.datasets.load_svmlight_file time: {}s".format(time() - started))

        # Loader under test (napkinXC).
        started = time()
        nxc_X, nxc_Y = load_libsvm_file(path)
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - started))

        # Feature matrices: structure must be identical, values close.
        assert np.array_equal(nxc_X.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X.indices, sk_X.indices)
        assert np.allclose(nxc_X.data, sk_X.data)

        # Labels: same number of rows, and each row holds the same labels
        # in the same order.
        assert len(nxc_Y) == len(sk_Y)
        for nxc_row, sk_row in zip(nxc_Y, sk_Y):
            assert len(nxc_row) == len(sk_row)
            for nxc_label, sk_label in zip(nxc_row, sk_row):
                assert nxc_label == sk_label
19 changes: 8 additions & 11 deletions src/read_data.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2019-2020 by Marek Wydmuch
Copyright (c) 2019-2021 by Marek Wydmuch
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -108,7 +108,6 @@ void readData(SRMatrix<Label>& labels, SRMatrix<Feature>& features, Args& args)
}

// Reads line in LibSvm format label,label,... feature(:value) feature(:value) ...
// TODO: rewrite this using split?
void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Feature>& lFeatures) {
// Trim leading spaces
size_t nextPos, pos = line.find_first_not_of(' ');
Expand All @@ -117,17 +116,15 @@ void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Featur
// Label
if ((pos == 0 || line[pos - 1] == ',') &&
(line[nextPos] == ',' || line[nextPos] == ' ')) // || nextPos == std::string::npos))
lLabels.push_back(std::stoi(line.substr(pos, nextPos - pos)));
lLabels.emplace_back(std::strtol(&line[pos], NULL, 10));

// Feature index
else if (line[pos - 1] == ' ' && line[nextPos] == ':') {
int index = std::stoi(line.substr(pos, nextPos - pos));
lFeatures.push_back({index, 1.0});
}
else if (line[pos - 1] == ' ' && line[nextPos] == ':')
lFeatures.emplace_back(std::strtol(&line[pos], NULL, 10), 1.0);

// Feature value
else if (line[pos - 1] == ':' && (line[nextPos] == ' ' || nextPos == std::string::npos))
lFeatures.back().value = std::stof(line.substr(pos, nextPos - pos));
lFeatures.back().value = std::strtod(&line[pos], NULL);

if (nextPos == std::string::npos) break;
pos = nextPos + 1;
Expand All @@ -136,12 +133,12 @@ void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Featur

void prepareFeaturesVector(std::vector<Feature> &lFeatures, double bias) {
// Add bias feature (bias feature has index 1)
lFeatures.push_back({1, bias});
lFeatures.emplace_back(1, bias);
}

void processFeaturesVector(std::vector<Feature> &lFeatures, bool norm, int hashSize, double featuresThreshold) {
// Shift index by 2 because LibLinear ignores feature 0 and feature 1 is reserved for bias
assert(lFeatures.size() >= 1);
assert(!lFeatures.empty());
shift(lFeatures.begin() + 1, lFeatures.end(), 2);

// Hash features
Expand All @@ -151,7 +148,7 @@ void processFeaturesVector(std::vector<Feature> &lFeatures, bool norm, int hashS
lHashed[hash(lFeatures[j].index) % hashSize] += lFeatures[j].value;

lFeatures.erase (lFeatures.begin() + 1, lFeatures.end()); // Keep bias feature
for (const auto& f : lHashed) lFeatures.push_back({f.first + 2, f.second});
for (const auto& f : lHashed) lFeatures.emplace_back(f.first + 2, f.second);
}

// Norm row
Expand Down

0 comments on commit 552b965

Please sign in to comment.