Improve performance of load_libsvm

mwydmuch · Feb 6, 2021 · 552b965 · 552b965
1 parent 28ad9d2
commit 552b965
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 12 deletions.
diff --git a/python/tests/test_compare_with_xclib_measures.py b/python/tests/test_compare_with_xclib_measures.py
@@ -1,4 +1,3 @@
-import sys
 import os
 from time import time
 import numpy as np

diff --git a/python/tests/test_load_libsvm.py b/python/tests/test_load_libsvm.py
@@ -0,0 +1,34 @@
+from time import time
+from sklearn.datasets import load_svmlight_file
+from napkinxc.datasets import download_dataset, load_libsvm_file
+import numpy as np
+
+
+def test_load_libsvm():
+
+    datasets = {
+        "eurlex-4k": {"file": "data/Eurlex/eurlex_train.txt", "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 5000, "offset": 1}},
+        "amazonCat-13k": {"file": "data/AmazonCat/amazonCat_train.txt", "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 203882, "offset": 1}},
+        "wiki10-31k": {"file": "data/Wiki10/wiki10_train.txt", "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 101938, "offset": 1}}
+    }
+
+    for d, v in datasets.items():
+        download_dataset(d, "train")
+        print("\n{} time comparison:".format(d))
+
+        t_start = time()
+        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
+        print("\tsklearn.datasets.load_svmlight_file time: {}s".format(time() - t_start))
+
+        t_start = time()
+        nxc_X, nxc_Y = load_libsvm_file(v["file"])
+        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - t_start))
+
+        assert np.array_equal(nxc_X.indptr, sk_X.indptr)
+        assert np.array_equal(nxc_X.indices, sk_X.indices)
+        assert np.allclose(nxc_X.data, sk_X.data)
+
+        assert len(nxc_Y) == len(sk_Y)
+        for nxc_y, sk_y in zip(nxc_Y, sk_Y):
+            assert len(nxc_y) == len(sk_y)
+            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
diff --git a/src/read_data.cpp b/src/read_data.cpp
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2019-2020 by Marek Wydmuch
+ Copyright (c) 2019-2021 by Marek Wydmuch
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -108,7 +108,6 @@ void readData(SRMatrix<Label>& labels, SRMatrix<Feature>& features, Args& args)
 }
 
 // Reads a single line in LibSVM format: label,label,... feature(:value) feature(:value) ...
-// TODO: rewrite this using split?
 void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Feature>& lFeatures) {
     // Trim leading spaces
     size_t nextPos, pos = line.find_first_not_of(' ');
@@ -117,17 +116,15 @@ void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Featur
         // Label
         if ((pos == 0 || line[pos - 1] == ',') &&
             (line[nextPos] == ',' || line[nextPos] == ' ')) // || nextPos == std::string::npos))
-            lLabels.push_back(std::stoi(line.substr(pos, nextPos - pos)));
+            lLabels.emplace_back(std::strtol(&line[pos], NULL, 10));
 
         // Feature index
-        else if (line[pos - 1] == ' ' && line[nextPos] == ':') {
-            int index = std::stoi(line.substr(pos, nextPos - pos));
-            lFeatures.push_back({index, 1.0});
-        }
+        else if (line[pos - 1] == ' ' && line[nextPos] == ':')
+            lFeatures.emplace_back(std::strtol(&line[pos], NULL, 10), 1.0);
 
         // Feature value
         else if (line[pos - 1] == ':' && (line[nextPos] == ' ' || nextPos == std::string::npos))
-            lFeatures.back().value = std::stof(line.substr(pos, nextPos - pos));
+            lFeatures.back().value = std::strtod(&line[pos], NULL);
 
         if (nextPos == std::string::npos) break;
         pos = nextPos + 1;
@@ -136,12 +133,12 @@ void readLine(std::string& line, std::vector<Label>& lLabels, std::vector<Featur
 
 void prepareFeaturesVector(std::vector<Feature> &lFeatures, double bias) {
     // Add bias feature (bias feature has index 1)
-    lFeatures.push_back({1, bias});
+    lFeatures.emplace_back(1, bias);
 }
 
 void processFeaturesVector(std::vector<Feature> &lFeatures, bool norm, int hashSize, double featuresThreshold) {
     // Shift indices by 2 because LibLinear ignores feature 0 and feature 1 is reserved for bias
-    assert(lFeatures.size() >= 1);
+    assert(!lFeatures.empty());
     shift(lFeatures.begin() + 1, lFeatures.end(), 2);
 
     // Hash features
@@ -151,7 +148,7 @@ void processFeaturesVector(std::vector<Feature> &lFeatures, bool norm, int hashS
             lHashed[hash(lFeatures[j].index) % hashSize] += lFeatures[j].value;
 
         lFeatures.erase (lFeatures.begin() + 1, lFeatures.end()); // Keep bias feature
-        for (const auto& f : lHashed) lFeatures.push_back({f.first + 2, f.second});
+        for (const auto& f : lHashed) lFeatures.emplace_back(f.first + 2, f.second);
     }
 
     // Norm row