In [1]:
#include <iostream>
#include <cstdio>
#include <cassert>
#include <cmath>



In [2]:
#define MY_ASSERT(expr) \
  if (!expr) { \
    printf("Assertion failed on line %d: %s\n", __LINE__, #expr); \
    assert(expr); \
  }



In [3]:
#define STRUCT_FLOAT_EXP_BITS 3
#define STRUCT_FLOAT_MANT_BITS 5

struct Float {
  unsigned int sign:1;
  unsigned int exp:STRUCT_FLOAT_EXP_BITS;
  unsigned int mant:STRUCT_FLOAT_MANT_BITS;
};

#define FLOAT_EXP_BIAS ((1 << (STRUCT_FLOAT_EXP_BITS - 1)) - 1)
#define FLOAT_EXP_MAX  ((1 << STRUCT_FLOAT_EXP_BITS) - 1)
#define FLOAT_MANT_MAX ((1 << STRUCT_FLOAT_MANT_BITS) - 1)
const unsigned int Float_exp_bias = FLOAT_EXP_BIAS;
const unsigned int Float_exp_max = FLOAT_EXP_MAX;
const unsigned int Float_mant_max = FLOAT_MANT_MAX;
const Float Float_zero = {0, 0, 0};
const Float Float_minus_zero = {1, 0, 0};
const Float Float_infty = {0, FLOAT_EXP_MAX, 0};
const Float Float_minus_infty = {1, FLOAT_EXP_MAX, 0};
const Float Float_nan = {0, FLOAT_EXP_MAX, 1};
const Float Float_one = {0, FLOAT_EXP_BIAS, 0};
//const Float Float_eps = Float_subtract(Float_one, Float_next(Float_one));



In [4]:
/// Returns true if a represents NaN, or Not-A-Number
bool Float_is_nan(const Float a) {
  return a.exp == Float_exp_max && a.mant != 0;
}

MY_ASSERT(!Float_is_nan(Float_zero));
MY_ASSERT(!Float_is_nan(Float_minus_zero));
MY_ASSERT(!Float_is_nan(Float_infty));
MY_ASSERT(!Float_is_nan(Float_minus_infty));
MY_ASSERT(Float_is_nan(Float_nan));
MY_ASSERT(Float_is_nan({0, FLOAT_EXP_MAX, 3}));



In [5]:
/// Returns true for a == +\infty or a == -\infty
bool Float_is_inf(const Float a) {
  return a.exp == Float_exp_max && a.mant == 0;
}

MY_ASSERT(!Float_is_inf(Float_zero));
MY_ASSERT(!Float_is_inf(Float_minus_zero));
MY_ASSERT(Float_is_inf(Float_infty));
MY_ASSERT(Float_is_inf(Float_minus_infty));
MY_ASSERT(!Float_is_inf(Float_nan));
MY_ASSERT(!Float_is_inf({0, FLOAT_EXP_MAX, 3}));



In [6]:
/// Returns true for a == +\infty
bool Float_is_pinf(const Float a) {
  return a.sign == 0 && Float_is_inf(a);
}

MY_ASSERT(!Float_is_pinf(Float_zero));
MY_ASSERT(!Float_is_pinf(Float_minus_zero));
MY_ASSERT(Float_is_pinf(Float_infty));
MY_ASSERT(!Float_is_pinf(Float_minus_infty));
MY_ASSERT(!Float_is_pinf(Float_nan));
MY_ASSERT(!Float_is_pinf({0, FLOAT_EXP_MAX, 3}));



In [7]:
/// Returns true for a == -\infty
bool Float_is_ninf(const Float a) {
  return a.sign == 1 && Float_is_inf(a);
}

MY_ASSERT(!Float_is_ninf(Float_zero));
MY_ASSERT(!Float_is_ninf(Float_minus_zero));
MY_ASSERT(!Float_is_ninf(Float_infty));
MY_ASSERT(Float_is_ninf(Float_minus_infty));
MY_ASSERT(!Float_is_ninf(Float_nan));
MY_ASSERT(!Float_is_ninf({0, FLOAT_EXP_MAX, 3}));



In [8]:
/// Returns true if a is in the subnormal range
bool Float_is_subnormal(const Float a) {
  return a.exp == 0;
}

MY_ASSERT(Float_is_subnormal(Float_zero));
MY_ASSERT(Float_is_subnormal(Float_minus_zero));
MY_ASSERT(!Float_is_subnormal(Float_infty));
MY_ASSERT(!Float_is_subnormal(Float_minus_infty));
MY_ASSERT(!Float_is_subnormal(Float_nan));
MY_ASSERT(!Float_is_subnormal({0, FLOAT_EXP_MAX, 3}));
MY_ASSERT(!Float_is_subnormal({0, FLOAT_EXP_MAX, 3}));



In [9]:
/// Returns true if and only if the bits match exactly
bool Float_binary_equal(const Float a, const Float b) {
  return a.sign == b.sign && a.exp == b.exp && a.mant == b.mant;
}



In [10]:
/// Returns true for a == b
/// Note: zero == minus zero
/// Note: nan != nan, no matter what
/// Note: +\infty == +\infty and -\infty = -\infty
bool Float_equal(const Float a, const Float b) {
  // first line checks:  zero == minus_zero
  // second line checks: a == b for each field
  // second line: nan != nan, even if bits match
  return (a.exp == 0 && b.exp == 0 && a.mant == 0 && b.mant == 0)
         || (Float_binary_equal(a, b) && !Float_is_nan(a));
}



In [11]:
/// Prints the float to the console
void Float_print(const Float a) {
  printf("Float(%d, %d, %d) = ", a.sign, a.exp, a.mant);
  if (Float_is_nan(a)) {
    fputs("NaN", stdout);
  } else if (Float_is_pinf(a)) {
    fputs("+infty", stdout);
  } else if (Float_is_ninf(a)) {
    fputs("-infty", stdout);
  } else {
    // Only print to 4 decimal places
    unsigned long mant_section = (a.mant * 5000) >> (STRUCT_FLOAT_MANT_BITS - 1);
    int sign = (a.sign == 1 ? -1 : 1);
    float mant_value =
      sign * (Float_is_subnormal(a) ? 0 : 1)
      + sign * ((float)mant_section) / 10000;
    int exponent = a.exp - Float_exp_bias + (Float_is_subnormal(a) ? 1 : 0);
    float value = mant_value *
      (exponent >= 0 ? (1 << exponent) : (1 / ((float)(1 << -exponent))));

    printf("%s%d.%04ld * 2^(%d) = %g",
      a.sign == 1 ? "-" : "",
      !Float_is_subnormal(a),
      mant_section,
      exponent,
      value
      );
  }
}



In [12]:
// Convert Float -> float
float Float_to_float(const Float a) {
  unsigned int mant_section = (a.mant * 500) >> (STRUCT_FLOAT_MANT_BITS - 1);
  int sign = (a.sign == 1 ? -1 : 1);
  float mant_value =
    sign * (Float_is_subnormal(a) ? 0 : 1)
    + sign * ((float)mant_section) / 1000;
  int exponent = a.exp - Float_exp_bias + (Float_is_subnormal(a) ? 1 : 0);
  float value = mant_value *
    (exponent >= 0 ? (1 << exponent) : (1 / ((float)(1 << -exponent))));
  return value;
}



In [13]:
/// Returns the ordinal away from zero in FP steps
int Float_ord(const Float a) {
  int sign = (a.sign == 1 ? -1 : 1);
  int as_int = (a.exp << STRUCT_FLOAT_MANT_BITS) + a.mant;
  return sign * as_int;
}



In [14]:
Float Float_from_ord(const int ord) {
  // TODO: implement
  return (Float){0, 0, 0};
}



In [15]:
/// Gives the next Float value away from zero
Float Float_next(const Float a) {
  if (a.mant == Float_mant_max) {
    if (a.exp != Float_exp_max) {
      return {a.sign, a.exp + 1u, 0};
    }
    // If we are in NaN territory, stay in NaN territory
    return a;
  } else {
    return {a.sign, a.exp, a.mant + 1u};
  }
}



In [16]:
/// Adds two floats and returns the result
Float Float_add(const Float a, const Float b) {
  // TODO: implement
  return Float_zero;
}



In [17]:
Float Float_subtract(const Float a, const Float b) {
  Float neg_b = {b.sign ^ 0x1u, b.exp, b.mant};
  return Float_add(a, neg_b);
}



In [18]:
/// Multiplies two floats and returns the result
Float Float_mult(const Float a, const Float b) {
  // TODO: implement
  return Float_zero;
}



In [19]:
void print_named_Float(const char* name, const Float a) {
  fputs(name, stdout);
  Float_print(a);
  fputs("\n", stdout);
}



In [20]:
fputs("Sign Bits:         1\n", stdout);
printf("Exponent Bits:     %d\n", STRUCT_FLOAT_EXP_BITS);
printf("Mantissa Bits:     %d\n", STRUCT_FLOAT_MANT_BITS);
printf("Exponent Bias:     %d\n", Float_exp_bias);
printf("Exponent Max:      %d\n", Float_exp_max);
printf("Mantissa Max:      %d\n", Float_mant_max);

Sign Bits:         1
Exponent Bits:     3
Mantissa Bits:     5
Exponent Bias:     3
Exponent Max:      7
Mantissa Max:      31


(int) 22


In [21]:
print_named_Float("Float_zero:        ", Float_zero);
print_named_Float("Float_minus_zero:  ", Float_minus_zero);
print_named_Float("Float_infty:       ", Float_infty);
print_named_Float("Float_minus_infty: ", Float_minus_infty);
print_named_Float("Float_nan:         ", Float_nan);
print_named_Float("Float_one:         ", Float_one);

Float_zero:        Float(0, 0, 0) = 0.0000 * 2^(-2) = 0
Float_minus_zero:  Float(1, 0, 0) = -0.0000 * 2^(-2) = 0
Float_infty:       Float(0, 7, 0) = +infty
Float_minus_infty: Float(1, 7, 0) = -infty
Float_nan:         Float(0, 7, 1) = NaN
Float_one:         Float(0, 3, 0) = 1.0000 * 2^(0) = 1


(void) @0x7f3d1e7fae08


In [22]:
puts("Positive Floats:");
Float current = Float_zero;
Float stop = {0, Float_exp_max, Float_mant_max};
while (!Float_binary_equal(current, stop)) {
  print_named_Float("  ", current);
  current = Float_next(current);
}
print_named_Float("  ", stop);

puts("Negative Floats:");
current = Float_minus_zero;
stop.sign = 1;
while (!Float_binary_equal(current, stop)) {
  print_named_Float("  ", current);
  current = Float_next(current);
}
print_named_Float("  ", stop);

Positive Floats:
  Float(0, 0, 0) = 0.0000 * 2^(-2) = 0
  Float(0, 0, 1) = 0.0312 * 2^(-2) = 0.0078
  Float(0, 0, 2) = 0.0625 * 2^(-2) = 0.015625
  Float(0, 0, 3) = 0.0937 * 2^(-2) = 0.023425
  Float(0, 0, 4) = 0.1250 * 2^(-2) = 0.03125
  Float(0, 0, 5) = 0.1562 * 2^(-2) = 0.03905
  Float(0, 0, 6) = 0.1875 * 2^(-2) = 0.046875
  Float(0, 0, 7) = 0.2187 * 2^(-2) = 0.054675
  Float(0, 0, 8) = 0.2500 * 2^(-2) = 0.0625
  Float(0, 0, 9) = 0.2812 * 2^(-2) = 0.0703
  Float(0, 0, 10) = 0.3125 * 2^(-2) = 0.078125
  Float(0, 0, 11) = 0.3437 * 2^(-2) = 0.085925
  Float(0, 0, 12) = 0.3750 * 2^(-2) = 0.09375
  Float(0, 0, 13) = 0.4062 * 2^(-2) = 0.10155
  Float(0, 0, 14) = 0.4375 * 2^(-2) = 0.109375
  Float(0, 0, 15) = 0.4687 * 2^(-2) = 0.117175
  Float(0, 0, 16) = 0.5000 * 2^(-2) = 0.125
  Float(0, 0, 17) = 0.5312 * 2^(-2) = 0.1328
  Float(0, 0, 18) = 0.5625 * 2^(-2) = 0.140625
  Float(0, 0, 19) = 0.5937 * 2^(-2) = 0.148425
  Float(0, 0, 20) = 0.6250 * 2^(-2) = 0.15625
  Float(0, 0, 21) = 0.6562 * 

  Float(0, 6, 17) = 1.5312 * 2^(3) = 12.2496
  Float(0, 6, 18) = 1.5625 * 2^(3) = 12.5
  Float(0, 6, 19) = 1.5937 * 2^(3) = 12.7496
  Float(0, 6, 20) = 1.6250 * 2^(3) = 13
  Float(0, 6, 21) = 1.6562 * 2^(3) = 13.2496
  Float(0, 6, 22) = 1.6875 * 2^(3) = 13.5
  Float(0, 6, 23) = 1.7187 * 2^(3) = 13.7496
  Float(0, 6, 24) = 1.7500 * 2^(3) = 14
  Float(0, 6, 25) = 1.7812 * 2^(3) = 14.2496
  Float(0, 6, 26) = 1.8125 * 2^(3) = 14.5
  Float(0, 6, 27) = 1.8437 * 2^(3) = 14.7496
  Float(0, 6, 28) = 1.8750 * 2^(3) = 15
  Float(0, 6, 29) = 1.9062 * 2^(3) = 15.2496
  Float(0, 6, 30) = 1.9375 * 2^(3) = 15.5
  Float(0, 6, 31) = 1.9687 * 2^(3) = 15.7496
  Float(0, 7, 0) = +infty
  Float(0, 7, 1) = NaN
  Float(0, 7, 2) = NaN
  Float(0, 7, 3) = NaN
  Float(0, 7, 4) = NaN
  Float(0, 7, 5) = NaN
  Float(0, 7, 6) = NaN
  Float(0, 7, 7) = NaN
  Float(0, 7, 8) = NaN
  Float(0, 7, 9) = NaN
  Float(0, 7, 10) = NaN
  Float(0, 7, 11) = NaN
  Float(0, 7, 12) = NaN
  Float(0, 7, 13) = NaN
  Float(0, 7, 14) = NaN

  Float(1, 4, 17) = -1.5312 * 2^(1) = -3.0624
  Float(1, 4, 18) = -1.5625 * 2^(1) = -3.125
  Float(1, 4, 19) = -1.5937 * 2^(1) = -3.1874
  Float(1, 4, 20) = -1.6250 * 2^(1) = -3.25
  Float(1, 4, 21) = -1.6562 * 2^(1) = -3.3124
  Float(1, 4, 22) = -1.6875 * 2^(1) = -3.375
  Float(1, 4, 23) = -1.7187 * 2^(1) = -3.4374
  Float(1, 4, 24) = -1.7500 * 2^(1) = -3.5
  Float(1, 4, 25) = -1.7812 * 2^(1) = -3.5624
  Float(1, 4, 26) = -1.8125 * 2^(1) = -3.625
  Float(1, 4, 27) = -1.8437 * 2^(1) = -3.6874
  Float(1, 4, 28) = -1.8750 * 2^(1) = -3.75
  Float(1, 4, 29) = -1.9062 * 2^(1) = -3.8124
  Float(1, 4, 30) = -1.9375 * 2^(1) = -3.875
  Float(1, 4, 31) = -1.9687 * 2^(1) = -3.9374
  Float(1, 5, 0) = -1.0000 * 2^(2) = -4
  Float(1, 5, 1) = -1.0312 * 2^(2) = -4.1248
  Float(1, 5, 2) = -1.0625 * 2^(2) = -4.25
  Float(1, 5, 3) = -1.0937 * 2^(2) = -4.3748
  Float(1, 5, 4) = -1.1250 * 2^(2) = -4.5
  Float(1, 5, 5) = -1.1562 * 2^(2) = -4.6248
  Float(1, 5, 6) = -1.1875 * 2^(2) = -4.75
  Float(1, 5, 7) =

(void) @0x7f3d1e7fae08


In [23]:
const Float smallest_positive = {0, 0, 1};
MY_ASSERT(Float_equal(Float_next(Float_zero), smallest_positive));

const Float smallest_negative = {1, 0, 1};
MY_ASSERT(Float_equal(Float_next(Float_minus_zero), smallest_negative));

const Float next_after_one = {0, Float_exp_bias, 1};
MY_ASSERT(Float_equal(Float_next(Float_one), next_after_one));

Float max_positive_subnormal = {0, 0, Float_mant_max};
Float min_positive_normal = {0, 1, 0};
MY_ASSERT(Float_equal(Float_next(max_positive_subnormal), min_positive_normal));

