|
30 | 30 | import java.lang.constant.ConstantDesc;
|
31 | 31 | import java.util.Optional;
|
32 | 32 |
|
| 33 | +import jdk.internal.math.FloatConsts; |
33 | 34 | import jdk.internal.math.FloatingDecimal;
|
34 | 35 | import jdk.internal.math.FloatToDecimal;
|
35 | 36 | import jdk.internal.vm.annotation.IntrinsicCandidate;
|
@@ -975,6 +976,198 @@ public static int floatToIntBits(float value) {
|
975 | 976 | @IntrinsicCandidate
|
976 | 977 | public static native float intBitsToFloat(int bits);
|
977 | 978 |
|
| 979 | + /** |
| 980 | + * {@return the {@code float} value closest to the numerical value |
| 981 | + * of the argument, a floating-point binary16 value encoded in a |
| 982 | + * {@code short}} The conversion is exact; all binary16 values can |
| 983 | + * be exactly represented in {@code float}. |
| 984 | + * |
| 985 | + * Special cases: |
| 986 | + * <ul> |
| 987 | + * <li> If the argument is zero, the result is a zero with the |
| 988 | + * same sign as the argument. |
| 989 | + * <li> If the argument is infinite, the result is an infinity |
| 990 | + * with the same sign as the argument. |
| 991 | + * <li> If the argument is a NaN, the result is a NaN. |
| 992 | + * </ul> |
| 993 | + * |
| 994 | + * <h4><a id=binary16Format>IEEE 754 binary16 format</a></h4> |
| 995 | + * The IEEE 754 standard defines binary16 as a 16-bit format, along |
| 996 | + * with the 32-bit binary32 format (corresponding to the {@code |
| 997 | + * float} type) and the 64-bit binary64 format (corresponding to |
| 998 | + * the {@code double} type). The binary16 format is similar to the |
| 999 | + * other IEEE 754 formats, except smaller, having all the usual |
| 1000 | + * IEEE 754 values such as NaN, signed infinities, signed zeros, |
| 1001 | + * and subnormals. The parameters (JLS {@jls 4.2.3}) for the |
| 1002 | + * binary16 format are N = 11 precision bits, K = 5 exponent bits, |
| 1003 | + * <i>E</i><sub><i>max</i></sub> = 15, and |
| 1004 | + * <i>E</i><sub><i>min</i></sub> = -14. |
| 1005 | + * |
| 1006 | + * @apiNote |
| 1007 | + * This method corresponds to the convertFormat operation defined |
| 1008 | + * in IEEE 754 from the binary16 format to the binary32 format. |
| 1009 | + * The operation of this method is analogous to a primitive |
| 1010 | + * widening conversion (JLS {@jls 5.1.2}). |
| 1011 | + * |
| 1012 | + * @param floatBinary16 the binary16 value to convert to {@code float} |
| 1013 | + * @since 20 |
| 1014 | + */ |
| 1015 | + // @IntrinsicCandidate |
| 1016 | + public static float float16ToFloat(short floatBinary16) { |
| 1017 | + /* |
| 1018 | + * The binary16 format has 1 sign bit, 5 exponent bits, and 10 |
| 1019 | + * significand bits. The exponent bias is 15. |
| 1020 | + */ |
| 1021 | + int bin16arg = (int)floatBinary16; |
| 1022 | + int bin16SignBit = 0x8000 & bin16arg; |
| 1023 | + int bin16ExpBits = 0x7c00 & bin16arg; |
| 1024 | + int bin16SignifBits = 0x03FF & bin16arg; |
| 1025 | + |
| 1026 | + // Shift left difference in the number of significand bits in |
| 1027 | + // the float and binary16 formats |
| 1028 | + final int SIGNIF_SHIFT = (FloatConsts.SIGNIFICAND_WIDTH - 11); |
| 1029 | + |
| 1030 | + float sign = (bin16SignBit != 0) ? -1.0f : 1.0f; |
| 1031 | + |
| 1032 | + // Extract binary16 exponent, remove its bias, add in the bias |
| 1033 | + // of a float exponent and shift to correct bit location |
| 1034 | + // (significand width includes the implicit bit so shift one |
| 1035 | + // less). |
| 1036 | + int bin16Exp = (bin16ExpBits >> 10) - 15; |
| 1037 | + if (bin16Exp == -15) { |
| 1038 | + // For subnormal binary16 values and 0, the numerical |
| 1039 | + // value is 2^24 * the significand as an integer (no |
| 1040 | + // implicit bit). |
| 1041 | + return sign * (0x1p-24f * bin16SignifBits); |
| 1042 | + } else if (bin16Exp == 16) { |
| 1043 | + return (bin16SignifBits == 0) ? |
| 1044 | + sign * Float.POSITIVE_INFINITY : |
| 1045 | + Float.intBitsToFloat((bin16SignBit << 16) | |
| 1046 | + 0x7f80_0000 | |
| 1047 | + // Preserve NaN signif bits |
| 1048 | + ( bin16SignifBits << SIGNIF_SHIFT )); |
| 1049 | + } |
| 1050 | + |
| 1051 | + assert -15 < bin16Exp && bin16Exp < 16; |
| 1052 | + |
| 1053 | + int floatExpBits = (bin16Exp + FloatConsts.EXP_BIAS) |
| 1054 | + << (FloatConsts.SIGNIFICAND_WIDTH - 1); |
| 1055 | + |
| 1056 | + // Compute and combine result sign, exponent, and significand bits. |
| 1057 | + return Float.intBitsToFloat((bin16SignBit << 16) | |
| 1058 | + floatExpBits | |
| 1059 | + (bin16SignifBits << SIGNIF_SHIFT)); |
| 1060 | + } |
| 1061 | + |
| 1062 | + /** |
| 1063 | + * {@return the floating-point binary16 value, encoded in a {@code |
| 1064 | + * short}, closest in value to the argument} |
| 1065 | + * The conversion is computed under the {@linkplain |
| 1066 | + * java.math.RoundingMode#HALF_EVEN round to nearest even rounding |
| 1067 | + * mode}. |
| 1068 | + * |
| 1069 | + * Special cases: |
| 1070 | + * <ul> |
| 1071 | + * <li> If the argument is zero, the result is a zero with the |
| 1072 | + * same sign as the argument. |
| 1073 | + * <li> If the argument is infinite, the result is an infinity |
| 1074 | + * with the same sign as the argument. |
| 1075 | + * <li> If the argument is a NaN, the result is a NaN. |
| 1076 | + * </ul> |
| 1077 | + * |
| 1078 | + * The <a href="#binary16Format">binary16 format</a> is discussed in |
| 1079 | + * more detail in the {@link #float16ToFloat} method. |
| 1080 | + * |
| 1081 | + * @apiNote |
| 1082 | + * This method corresponds to the convertFormat operation defined |
| 1083 | + * in IEEE 754 from the binary32 format to the binary16 format. |
| 1084 | + * The operation of this method is analogous to a primitive |
| 1085 | + * narrowing conversion (JLS {@jls 5.1.3}). |
| 1086 | + * |
| 1087 | + * @param f the {@code float} value to convert to binary16 |
| 1088 | + * @since 20 |
| 1089 | + */ |
| 1090 | + // @IntrinsicCandidate |
| 1091 | + public static short floatToFloat16(float f) { |
| 1092 | + int doppel = Float.floatToRawIntBits(f); |
| 1093 | + short sign_bit = (short)((doppel & 0x8000_0000) >> 16); |
| 1094 | + |
| 1095 | + if (Float.isNaN(f)) { |
| 1096 | + // Preserve sign and attempt to preserve significand bits |
| 1097 | + return (short)(sign_bit |
| 1098 | + | 0x7c00 // max exponent + 1 |
| 1099 | + // Preserve high order bit of float NaN in the |
| 1100 | + // binary16 result NaN (tenth bit); OR in remaining |
| 1101 | + // bits into lower 9 bits of binary 16 significand. |
| 1102 | + | (doppel & 0x007f_e000) >> 13 // 10 bits |
| 1103 | + | (doppel & 0x0000_1ff0) >> 4 // 9 bits |
| 1104 | + | (doppel & 0x0000_000f)); // 4 bits |
| 1105 | + } |
| 1106 | + |
| 1107 | + float abs_f = Math.abs(f); |
| 1108 | + |
| 1109 | + // The overflow threshold is binary16 MAX_VALUE + 1/2 ulp |
| 1110 | + if (abs_f >= (0x1.ffcp15f + 0x0.002p15f) ) { |
| 1111 | + return (short)(sign_bit | 0x7c00); // Positive or negative infinity |
| 1112 | + } |
| 1113 | + |
| 1114 | + // Smallest magnitude nonzero representable binary16 value |
| 1115 | + // is equal to 0x1.0p-24; half-way and smaller rounds to zero. |
| 1116 | + if (abs_f <= 0x1.0p-24f * 0.5f) { // Covers float zeros and subnormals. |
| 1117 | + return sign_bit; // Positive or negative zero |
| 1118 | + } |
| 1119 | + |
| 1120 | + // Dealing with finite values in exponent range of binary16 |
| 1121 | + // (when rounding is done, could still round up) |
| 1122 | + int exp = Math.getExponent(f); |
| 1123 | + assert -25 <= exp && exp <= 15; |
| 1124 | + |
| 1125 | + // For binary16 subnormals, beside forcing exp to -15, retain |
| 1126 | + // the difference expdelta = E_min - exp. This is the excess |
| 1127 | + // shift value, in addition to 13, to be used in the |
| 1128 | + // computations below. Further the (hidden) msb with value 1 |
| 1129 | + // in f must be involved as well. |
| 1130 | + int expdelta = 0; |
| 1131 | + int msb = 0x0000_0000; |
| 1132 | + if (exp < -14) { |
| 1133 | + expdelta = -14 - exp; |
| 1134 | + exp = -15; |
| 1135 | + msb = 0x0080_0000; |
| 1136 | + } |
| 1137 | + int f_signif_bits = doppel & 0x007f_ffff | msb; |
| 1138 | + |
| 1139 | + // Significand bits as if using rounding to zero (truncation). |
| 1140 | + short signif_bits = (short)(f_signif_bits >> (13 + expdelta)); |
| 1141 | + |
| 1142 | + // For round to nearest even, determining whether or not to |
| 1143 | + // round up (in magnitude) is a function of the least |
| 1144 | + // significant bit (LSB), the next bit position (the round |
| 1145 | + // position), and the sticky bit (whether there are any |
| 1146 | + // nonzero bits in the exact result to the right of the round |
| 1147 | + // digit). An increment occurs in three cases: |
| 1148 | + // |
| 1149 | + // LSB Round Sticky |
| 1150 | + // 0 1 1 |
| 1151 | + // 1 1 0 |
| 1152 | + // 1 1 1 |
| 1153 | + // See "Computer Arithmetic Algorithms," Koren, Table 4.9 |
| 1154 | + |
| 1155 | + int lsb = f_signif_bits & (1 << 13 + expdelta); |
| 1156 | + int round = f_signif_bits & (1 << 12 + expdelta); |
| 1157 | + int sticky = f_signif_bits & ((1 << 12 + expdelta) - 1); |
| 1158 | + |
| 1159 | + if (round != 0 && ((lsb | sticky) != 0 )) { |
| 1160 | + signif_bits++; |
| 1161 | + } |
| 1162 | + |
| 1163 | + // No bits set in significand beyond the *first* exponent bit, |
| 1164 | + // not just the sigificand; quantity is added to the exponent |
| 1165 | + // to implement a carry out from rounding the significand. |
| 1166 | + assert (0xf800 & signif_bits) == 0x0; |
| 1167 | + |
| 1168 | + return (short)(sign_bit | ( ((exp + 15) << 10) + signif_bits ) ); |
| 1169 | + } |
| 1170 | + |
978 | 1171 | /**
|
979 | 1172 | * Compares two {@code Float} objects numerically.
|
980 | 1173 | *
|
|
0 commit comments