Skip to content

Commit

Permalink
Fix the build break in the release pipeline and some C++ warnings (#372)
Browse files Browse the repository at this point in the history
* fix the break in release pipeline

* code cleanup and warning fixes.

* Update ci.yml for Azure Pipelines

* Update ci.yml for Azure Pipelines

* fix linux build

* one more fixing

* again?

* fixing for macOS
  • Loading branch information
wenbingl committed Mar 1, 2023
1 parent 2521dab commit 0a9a3a9
Show file tree
Hide file tree
Showing 28 changed files with 123 additions and 267 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ out/
.scb/
onnxruntime_extensions/_version.py
onnxruntime-*-*-*/
temp_*.onnx
temp_*onnx*
# Java specific ignores
*/.gradle
java/hs_*.log
Expand Down
16 changes: 16 additions & 0 deletions .pipelines/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -478,3 +478,19 @@ jobs:
echo "Exception propogation was not enabled correctly."
exit 1
fi
##############################
# Linux for selected_ops build
##############################
- job: Linux_SelectedOpsBuild
pool:
vmImage: 'ubuntu-latest'

steps:
# Build with only one operator selected.
- bash: |
set -e -x -u
echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
displayName: Build ort-extensions with only one operator was selected
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ endif()

if(NOT OCOS_ENABLE_CPP_EXCEPTIONS)
add_compile_definitions(OCOS_NO_EXCEPTIONS ORT_NO_EXCEPTIONS)
if (NOT _ONNXRUNTIME_EMBEDDED)
add_compile_definitions(_HAS_EXCEPTIONS=0)
endif()
endif()

include(FetchContent)
Expand Down Expand Up @@ -254,7 +257,7 @@ if(OCOS_ENABLE_RE2_REGEX)
endif()

# ### scan all source files
set(TARGET_SRC_NOEXCEPTION)
file(GLOB TARGET_SRC_NOEXCEPTION "base/*.h" "base/*.cc")
file(GLOB TARGET_SRC "operators/*.cc" "operators/*.h" "includes/*.h*")

if(OCOS_ENABLE_TF_STRING)
Expand Down Expand Up @@ -402,11 +405,13 @@ standardize_output_folder(ocos_operators)
target_include_directories(noexcep_operators PUBLIC
${ONNXRUNTIME_INCLUDE_DIR}
${PROJECT_SOURCE_DIR}/includes
${PROJECT_SOURCE_DIR}/base
${PROJECT_SOURCE_DIR}/operators)

target_include_directories(ocos_operators PUBLIC
${ONNXRUNTIME_INCLUDE_DIR}
${PROJECT_SOURCE_DIR}/includes
${PROJECT_SOURCE_DIR}/base
${PROJECT_SOURCE_DIR}/operators
${PROJECT_SOURCE_DIR}/operators/tokenizer)

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion operators/string_tensor.cc → base/string_tensor.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "string_tensor.h"
#include "string_utils.h"
#include "ustring.h"
#include "string_tensor.h"

void GetTensorMutableDataString(const OrtApi& api, OrtW::CustomOpApi& ort, OrtKernelContext* context,
const OrtValue* value, std::vector<std::string>& output) {
Expand Down
2 changes: 1 addition & 1 deletion operators/string_tensor.h → base/string_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#pragma once

#include <string>
#include "ocos.h"
#include <string>


// Retrieves a vector of strings if the input type is std::string.
Expand Down
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion operators/ustring.cc → base/ustring.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <iostream>
#include "ustring.h"
#include <iostream>


ustring::ustring() : std::u32string() {
}
Expand Down
9 changes: 3 additions & 6 deletions operators/ustring.h → base/ustring.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <string>
#include <locale>
#include <functional>

#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 1
#include <codecvt>

#include "ocos.h"
#include <locale>
#include <codecvt>

// ustring needs a new implementation, due to the std::codecvt deprecation.
// Wrap u32string with ustring, in case we switch to another implementation in the future
class ustring : public std::u32string {
public:
Expand Down
2 changes: 2 additions & 0 deletions includes/ocos.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#pragma once

#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#include <string>
#include <algorithm>
#include <functional>
#include <iterator>
Expand Down
3 changes: 1 addition & 2 deletions operators/tokenizer/basic_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "string_utils.h"
#include "basic_tokenizer.hpp"
#include "string_utils.h"
#include "string_tensor.h"
#include <vector>
#include <locale>
#include <codecvt>
#include <algorithm>

BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents,
Expand Down
5 changes: 3 additions & 2 deletions operators/tokenizer/bert_tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@

#pragma once

#include <unordered_map>
#include <vector>
#include "ocos.h"
#include "ustring.h"
#include "string_utils.h"
#include "string_tensor.h"
#include "basic_tokenizer.hpp"

#include <unordered_map>


class BertTokenizerVocab final {
public:
explicit BertTokenizerVocab(std::string_view vocab);
Expand Down
2 changes: 0 additions & 2 deletions operators/tokenizer/bert_tokenizer_decoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

#pragma once

#include <unordered_map>
#include <vector>
#include "ocos.h"
#include "ustring.h"
#include "string_utils.h"
Expand Down
65 changes: 47 additions & 18 deletions operators/tokenizer/bpetokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,28 +1,58 @@
// Licensed under the MIT License.
// Part of this code comes from another Microsoft employee.
#pragma once
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <list>
#include <memory>
#include "ocos.h"
#include "ustring.h"

#include <regex>
#include <sstream>
#include <stdexcept>
#include <list>
#include <unordered_map>
#include <functional>
#include <codecvt>
#include <mutex>

#include "unicode.h"
#include "nlohmann/json.hpp"
#include "clip_tokenizer.hpp"
#include "gpt2_tokenizer.hpp"
#include "roberta_tokenizer.hpp"
#include "string_utils.h"
#include "string_tensor.h"
#include "unicode.h"

// Whitespace test for a single UTF-32 code point.
// Returns true only for the fixed set of code points checked below; every other
// value yields false.
// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
inline bool IsUnicodeSpace(char32_t ch) {
  // The table contains three contiguous runs; test those as ranges.
  const bool in_contiguous_run = (ch >= 0x0009 && ch <= 0x000D) ||
                                 (ch >= 0x001C && ch <= 0x0020) ||
                                 (ch >= 0x2000 && ch <= 0x200A);
  if (in_contiguous_run) {
    return true;
  }

  // The remaining entries are isolated code points.
  return ch == 0x0085 || ch == 0x00A0 || ch == 0x1680 || ch == 0x2028 ||
         ch == 0x2029 || ch == 0x202F || ch == 0x205F || ch == 0x3000;
}

inline bool IsEmptyUString(const ustring& str) {
return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
}

class SpecialTokenMap {
public:
Expand Down Expand Up @@ -117,7 +147,6 @@ class VocabData {
} else {
int id = static_cast<int>(vocab_map_.size());
vocab_map_[unk_token] = id;
std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;
}

std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;
Expand Down
66 changes: 3 additions & 63 deletions operators/tokenizer/clip_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -1,68 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// Partial code comes from other Microsoft employee.

#include <string>
#include <vector>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <list>
#include <memory>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include <functional>
#include <codecvt>
#include <mutex>

#include "nlohmann/json.hpp"
#include "bpetokenizer.hpp"
#include "string_tensor.h"
#include "unicode.h"

// Reports whether `ch` is treated as a whitespace code point.
// Returns true for exactly the code points listed as case labels below and
// false for every other value.
// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
bool IsInUnicodeSpace(char32_t ch) {
switch (ch) {
// Contiguous run 0x0009..0x000D.
case 0x0009:
case 0x000A:
case 0x000B:
case 0x000C:
case 0x000D:
// Contiguous run 0x001C..0x0020.
case 0x001C:
case 0x001D:
case 0x001E:
case 0x001F:
case 0x0020:
// Isolated code points.
case 0x0085:
case 0x00A0:
case 0x1680:
// Contiguous run 0x2000..0x200A.
case 0x2000:
case 0x2001:
case 0x2002:
case 0x2003:
case 0x2004:
case 0x2005:
case 0x2006:
case 0x2007:
case 0x2008:
case 0x2009:
case 0x200A:
// Isolated code points.
case 0x2028:
case 0x2029:
case 0x202F:
case 0x205F:
case 0x3000:
// Every label above falls through to this single return.
return true;
}
// Not one of the listed code points.
return false;
}

// Reports whether every code point of `str` is accepted by IsInUnicodeSpace.
// std::all_of is true for an empty range, so a zero-length string is also
// reported as empty.
bool IsEmptyUstring(const ustring& str) {
return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsInUnicodeSpace(ch); });
}
#include "clip_tokenizer.hpp"
#include "string_utils.h"

KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
: BaseKernel(api, info) {
Expand Down Expand Up @@ -93,7 +33,7 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length) {
std::vector<int64_t> res;

if (IsEmptyUstring(input)) {
if (IsEmptyUString(input)) {
return res;
}
// Add <|startoftext|> token to result
Expand Down
9 changes: 4 additions & 5 deletions operators/tokenizer/clip_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#include <list>
#include "ocos.h"
#include "ustring.h"
#include "string_utils.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

class VocabData;
#pragma once
#include "bpetokenizer.hpp"

struct KernelClipBpeTokenizer : BaseKernel {
KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);
Expand Down

0 comments on commit 0a9a3a9

Please sign in to comment.