-
Notifications
You must be signed in to change notification settings - Fork 0
/
Unit_Testing_for_Data_Science_in_Python.py
471 lines (356 loc) · 16.8 KB
/
Unit_Testing_for_Data_Science_in_Python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#######################
# Unit testing basics #
#######################
### Your first unit test using pytest
# Import the pytest package
import pytest
# Import the function convert_to_int()
from preprocessing_helpers import convert_to_int
# Complete the unit test name by adding a prefix
def test_on_string_with_one_comma():
    """convert_to_int() is expected to turn the string "2,081" into the int 2081."""
    result = convert_to_int("2,081")
    assert result == 2081
### Spotting and fixing bugs
def convert_to_int(string_with_comma):
    """Return the int obtained after dropping every comma from the string.

    Raises:
        ValueError: propagated from int() when the remaining text is not
            a valid integer literal.
    """
    # Splitting on "," and re-joining removes all commas in one pass.
    digits_only = "".join(string_with_comma.split(","))
    return int(digits_only)
#############################
# Intermediate unit testing #
#############################
### Write an informative test failure message
import pytest
from preprocessing_helpers import convert_to_int
def test_on_string_with_one_comma():
    """Check convert_to_int("2,081") and report the returned value on failure."""
    returned = convert_to_int("2,081")
    # The failure message embeds whatever the function actually returned.
    failure_text = "convert_to_int('2,081') should return the int 2081, but it actually returned {0}".format(returned)
    assert returned == 2081, failure_text
### Testing float return values
import numpy as np
import pytest
from as_numpy import get_data_as_numpy_array
def test_on_clean_file():
    """Compare the array loaded from example_clean_data.txt with the expected values."""
    expected_rows = [
        [2081.0, 314942.0],
        [1059.0, 186606.0],
        [1148.0, 206186.0],
    ]
    expected = np.array(expected_rows)
    actual = get_data_as_numpy_array("example_clean_data.txt", num_columns=2)
    message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
    # pytest.approx gives a tolerance-based comparison suitable for floats.
    assert actual == pytest.approx(expected), message
### Testing with multiple assert statements
def test_on_six_rows():
    """A six-row array should be split into 4 training rows and 2 testing rows.

    The function under test returns a (training, testing) pair, so both
    elements are checked for their number of rows (``shape[0]``).
    """
    example_argument = np.array(
        [
            [2081.0, 314942.0],
            [1059.0, 186606.0],
            [1148.0, 206186.0],
            [1506.0, 248419.0],
            [1210.0, 214114.0],
            [1697.0, 277794.0],
        ]
    )
    expected_training_array_num_rows = 4
    expected_testing_array_num_rows = 2
    actual = split_into_training_and_testing_sets(example_argument)
    # Row count of the training array.
    assert actual[0].shape[0] == expected_training_array_num_rows, "The actual number of rows in the training array is not {}".format(expected_training_array_num_rows)
    # Row count of the testing array: shape[0] is rows, shape[1] would be
    # the column count and would pass regardless of how the rows were split.
    assert actual[1].shape[0] == expected_testing_array_num_rows, "The actual number of rows in the testing array is not {}".format(expected_testing_array_num_rows)
### Practice the context manager
import pytest
# pytest.raises(ValueError) absorbs the ValueError raised inside its block.
with pytest.raises(ValueError):
    raise ValueError

try:
    # pytest.raises(OSError) only absorbs OSError; the ValueError raised here
    # is not an OSError, so it propagates out of the context manager.
    with pytest.raises(OSError):
        raise ValueError
except ValueError:
    # Catch only the exception that can actually escape, instead of a bare
    # `except:` that would also hide KeyboardInterrupt/SystemExit.
    print("pytest raised an exception because no OSError was raised in the context.")
### Unit test a ValueError
import numpy as np
import pytest
from train import split_into_training_and_testing_sets
def test_on_one_row():
    """Splitting an array that has a single row is expected to raise ValueError."""
    single_row = np.array([[1382.0, 390167.0]])
    # The test fails unless the call inside the block raises ValueError.
    with pytest.raises(ValueError):
        split_into_training_and_testing_sets(single_row)
def test_on_one_row():
    """Splitting a one-row array should raise ValueError with the expected message."""
    single_row = np.array([[1382.0, 390167.0]])
    expected_error_msg = "Argument data_array must have at least 2 rows, it actually has just 1"
    # exc_info captures details of the ValueError raised inside the block.
    with pytest.raises(ValueError) as exc_info:
        split_into_training_and_testing_sets(single_row)
    # Verify the text of the raised error.
    assert exc_info.match(expected_error_msg)
### Testing well: Boundary values
import pytest
from preprocessing_helpers import row_to_list
def test_on_no_tab_no_missing_value():  # (0, 0) boundary value
    """row_to_list("123\\n") is expected to return None."""
    actual = row_to_list("123\n")
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
def test_on_two_tabs_no_missing_value():  # (2, 0) boundary value
    """A row containing two tab separators is expected to yield None."""
    row = "123\t4,567\t89\n"
    actual = row_to_list(row)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
def test_on_one_tab_with_missing_value():  # (1, 1) boundary value
    """A row with one tab but an empty first field is expected to yield None."""
    row = "\t4,567\n"
    actual = row_to_list(row)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
### Testing well: Values triggering special logic
import pytest
from preprocessing_helpers import row_to_list
def test_on_no_tab_with_missing_value():  # (0, 1) case
    """row_to_list("\\n") is expected to return None."""
    row = "\n"
    actual = row_to_list(row)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
def test_on_two_tabs_with_missing_value():  # (2, 1) case
    """A row with two tabs and an empty middle field is expected to yield None."""
    row = "123\t\t89\n"
    actual = row_to_list(row)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
### Testing well: Normal arguments
import pytest
from preprocessing_helpers import row_to_list
def test_on_normal_argument_1():
    """A well-formed row "123\\t4,567\\n" should be returned as its two fields."""
    expected = ["123", "4,567"]
    actual = row_to_list("123\t4,567\n")
    message = "Expected: {0}, Actual: {1}".format(expected, actual)
    assert actual == expected, message
def test_on_normal_argument_2():
    """A well-formed row "1,059\\t186,606\\n" should be returned as its two fields."""
    expected = ["1,059", "186,606"]
    actual = row_to_list("1,059\t186,606\n")
    message = "Expected: {0}, Actual: {1}".format(expected, actual)
    assert actual == expected, message
### TDD: Tests for normal arguments
def test_with_no_comma():
    """convert_to_int("756") is expected to return 756."""
    actual = convert_to_int("756")
    message = "Expected: 756, Actual: {0}".format(actual)
    assert actual == 756, message
def test_with_one_comma():
    """convert_to_int("2,081") is expected to return 2081."""
    actual = convert_to_int("2,081")
    message = "Expected: 2081, Actual: {0}".format(actual)
    assert actual == 2081, message
def test_with_two_commas():
    """convert_to_int("1,034,891") is expected to return 1034891."""
    actual = convert_to_int("1,034,891")
    message = "Expected: 1034891, Actual: {0}".format(actual)
    assert actual == 1034891, message
### TDD: Requirement collection
# Give a name to the test for an argument with missing comma
def test_on_string_with_missing_comma():
    """A string lacking a thousands separator ("178100,301") should give None."""
    malformed = "178100,301"
    actual = convert_to_int(malformed)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
def test_on_string_with_incorrectly_placed_comma():
    """A string with a misplaced comma ("12,72,891") should give None."""
    malformed = "12,72,891"
    actual = convert_to_int(malformed)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
def test_on_float_valued_string():
    """A float-valued string ("23,816.92") should give None."""
    float_text = "23,816.92"
    actual = convert_to_int(float_text)
    message = "Expected: None, Actual: {0}".format(actual)
    assert actual is None, message
### TDD: Implement the function
def convert_to_int(integer_string_with_commas):
    """Convert a string with comma thousands separators to an int.

    Args:
        integer_string_with_commas: text such as "2,081" or "1,034,891".

    Returns:
        The parsed int, or None when the commas are missing or misplaced,
        when the leading group contains no digit (e.g. ",123"), or when the
        text with commas removed is not a valid integer literal.
    """
    comma_separated_parts = integer_string_with_commas.split(",")
    for position, part in enumerate(comma_separated_parts):
        # More than three characters in a group means a comma is missing.
        if len(part) > 3:
            return None
        # Every group after the first must have exactly three characters.
        if position != 0 and len(part) != 3:
            return None
    # With at least one comma present, the leading group must hold a digit;
    # otherwise inputs like ",123" or "-,123" would be silently accepted.
    if len(comma_separated_parts) > 1 and not any(
        character.isdigit() for character in comma_separated_parts[0]
    ):
        return None
    try:
        return int("".join(comma_separated_parts))
    except ValueError:
        # Non-integer text such as "" or "1.5" ends up here.
        return None
###################################
# Test Organization and Execution #
###################################
### Create a test class
import pytest
import numpy as np
from models.train import split_into_training_and_testing_sets
# Declare the test class
class TestSplitIntoTrainingAndTestingSets(object):
    """Tests grouped for split_into_training_and_testing_sets()."""

    def test_on_one_row(self):
        """A one-row array should raise ValueError with the expected message."""
        single_row = np.array([[1382.0, 390167.0]])
        expected_error_msg = "Argument data_array must have at least 2 rows, it actually has just 1"
        with pytest.raises(ValueError) as exc_info:
            split_into_training_and_testing_sets(single_row)
        # Verify the text of the raised error.
        assert exc_info.match(expected_error_msg)
### Running test classes
import numpy as np
def split_into_training_and_testing_sets(data_array):
    """Randomly split the rows of a 2-D array into training and testing sets.

    Args:
        data_array: NumPy array; must be two dimensional with at least 2 rows.

    Returns:
        A (training, testing) tuple. The training part holds
        int(0.75 * number_of_rows) randomly chosen rows; the testing part
        holds the remaining rows.

    Raises:
        ValueError: if data_array is not two dimensional or has fewer than
            2 rows.
    """
    dim = data_array.ndim
    if dim != 2:
        raise ValueError("Argument data_array must be two dimensional. Got {0} dimensional array instead!".format(dim))
    num_rows = data_array.shape[0]
    if num_rows < 2:
        raise ValueError("Argument data_array must have at least 2 rows, it actually has just {0}".format(num_rows))
    training_count = int(0.75 * num_rows)
    # A random ordering of the row indices decides which rows go where.
    shuffled_rows = np.random.permutation(num_rows)
    training_rows = shuffled_rows[:training_count]
    testing_rows = shuffled_rows[training_count:]
    return data_array[training_rows, :], data_array[testing_rows, :]
### Mark a test class as expected to fail
# Mark the whole test class as "expected to fail"
@pytest.mark.xfail
class TestModelTest(object):
    """Tests for model_test(); the whole class is marked as expected to fail."""

    def test_on_linear_data(self):
        """model_test() on the three linear points should return 1.0."""
        test_input = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
        expected = 1.0
        actual = model_test(test_input, 2.0, 1.0)
        message = "model_test({0}) should return {1}, but it actually returned {2}".format(test_input, expected, actual)
        assert actual == pytest.approx(expected), message

    def test_on_one_dimensional_array(self):
        """model_test() on a one-dimensional array should raise ValueError."""
        one_dimensional = np.array([1.0, 2.0, 3.0, 4.0])
        with pytest.raises(ValueError) as exc_info:
            model_test(one_dimensional, 1.0, 1.0)
# Add a reason for the expected failure
@pytest.mark.xfail(reason="Using TDD, model_test() has not yet been implemented")
class TestModelTest(object):
    """Tests for model_test(), expected to fail for the reason given in the marker."""

    def test_on_linear_data(self):
        """model_test() on the three linear points should return 1.0."""
        test_input = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
        expected = 1.0
        actual = model_test(test_input, 2.0, 1.0)
        message = "model_test({0}) should return {1}, but it actually returned {2}".format(test_input, expected, actual)
        assert actual == pytest.approx(expected), message

    def test_on_one_dimensional_array(self):
        """model_test() on a one-dimensional array should raise ValueError."""
        one_dimensional = np.array([1.0, 2.0, 3.0, 4.0])
        with pytest.raises(ValueError) as exc_info:
            model_test(one_dimensional, 1.0, 1.0)
### Mark a test as conditionally skipped
# Import the sys module
import sys
class TestGetDataAsNumpyArray(object):
    """Tests for get_data_as_numpy_array()."""

    # Skip on any Python newer than the 2.7 series. Only (major, minor) is
    # compared: the full sys.version_info tuple of a 2.7.x interpreter is
    # longer than (2, 7) and would therefore compare as greater.
    # pytest requires a reason when the skipif condition is a boolean.
    @pytest.mark.skipif(sys.version_info[:2] > (2, 7), reason="Works only on Python 2.7 or lower")
    def test_on_clean_file(self):
        """Compare the array loaded from example_clean_data.txt with the expected values."""
        expected = np.array(
            [
                [2081.0, 314942.0],
                [1059.0, 186606.0],
                [1148.0, 206186.0],
            ]
        )
        actual = get_data_as_numpy_array("example_clean_data.txt", num_columns=2)
        message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
        assert actual == pytest.approx(expected), message
class TestGetDataAsNumpyArray(object):
    """Tests for get_data_as_numpy_array()."""

    # Skip on any Python newer than the 2.7 series. Only (major, minor) is
    # compared: the full sys.version_info tuple of a 2.7.x interpreter is
    # longer than (2, 7) and would therefore compare as greater, skipping
    # the test even on the version it is meant to run on.
    @pytest.mark.skipif(sys.version_info[:2] > (2, 7), reason="Works only on Python 2.7 or lower")
    def test_on_clean_file(self):
        """Compare the array loaded from example_clean_data.txt with the expected values."""
        expected = np.array(
            [
                [2081.0, 314942.0],
                [1059.0, 186606.0],
                [1148.0, 206186.0],
            ]
        )
        actual = get_data_as_numpy_array("example_clean_data.txt", num_columns=2)
        message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
        assert actual == pytest.approx(expected), message
#######################################
# Testing Models, Plots and Much More #
#######################################
### Use a fixture for a clean data file
# Add a decorator to make this function a fixture
@pytest.fixture
def clean_data_file():
    """Fixture: create a small clean data file, yield its path, then delete it.

    Yields:
        The path "clean_data_file.txt", containing three tab-separated rows.
    """
    # Imported locally because the teardown needs os.remove and the module
    # does not import os at top level.
    import os

    file_path = "clean_data_file.txt"
    # Setup: write the three rows used by the test.
    with open(file_path, "w") as f:
        f.write("201\t305671\n7892\t298140\n501\t738293\n")
    yield file_path
    # Teardown: remove the file created during setup.
    os.remove(file_path)
# Pass the correct argument so that the test can use the fixture
def test_on_clean_file(clean_data_file):
    """Load the file provided by the clean_data_file fixture and compare with the expected array."""
    expected = np.array(
        [
            [201.0, 305671.0],
            [7892.0, 298140.0],
            [501.0, 738293.0],
        ]
    )
    # The fixture supplies the path of the data file to read.
    actual = get_data_as_numpy_array(clean_data_file, 2)
    message = "Expected: {0}, Actual: {1}".format(expected, actual)
    assert actual == pytest.approx(expected), message
### Write a fixture for an empty data file
@pytest.fixture
def empty_file():
    """Fixture: create an empty file, yield its path, then delete it.

    Yields:
        The path "empty.txt" of a newly created (or truncated) empty file.
    """
    # Imported locally because the teardown needs os.remove and the module
    # does not import os at top level.
    import os

    file_path = "empty.txt"
    # Setup: opening in write mode and closing immediately leaves an empty file.
    open(file_path, "w").close()
    yield file_path
    # Teardown: remove the file created during setup.
    os.remove(file_path)
def test_on_empty_file(self, empty_file):
    """Loading the empty file should give an array of shape (0, 2).

    NOTE(review): takes `self`, so presumably this belongs inside a test
    class that is not shown here - confirm.
    """
    expected = np.empty((0, 2))
    actual = get_data_as_numpy_array(empty_file, 2)
    message = "Expected: {0}, Actual: {1}".format(expected, actual)
    assert actual == pytest.approx(expected), message
### Fixture chaining using tmpdir
import pytest
@pytest.fixture
def empty_file(tmpdir):
    """Fixture chained with tmpdir: create empty.txt inside it and yield the path."""
    # tmpdir.join() builds the path of the file inside the temporary directory.
    file_path = tmpdir.join("empty.txt")
    # Opening for writing and leaving the block at once creates an empty file.
    with open(file_path, "w"):
        pass
    yield file_path
### Program a bug-free dependency
# Define a function convert_to_int_bug_free
def convert_to_int_bug_free(comma_separated_integer_string):
    """Return the hard-coded correct result for one of the known test strings.

    Intended as a stand-in for convert_to_int() in tests: it looks the
    argument up in a fixed table.

    Raises:
        KeyError: if the argument is not one of the known strings.
    """
    known_results = {
        "1,801": 1801,
        "201,411": 201411,
        "2,002": 2002,
        "333,209": 333209,
        "1990": None,
        "782,911": 782911,
        "1,285": 1285,
        "389129": None,
    }
    result = known_results[comma_separated_integer_string]
    return result
### Mock a dependency
# Add the correct argument to use the mocking fixture in this test
def test_on_raw_data(self, raw_and_clean_data_file, mocker):
    """Run preprocess() with convert_to_int mocked and check calls and output.

    NOTE(review): takes `self`, so presumably this belongs inside a test
    class that is not shown here - confirm.
    """
    # Imported locally: `call` is needed to build the expected call list and
    # the module does not import it at top level.
    from unittest.mock import call

    raw_path, clean_path = raw_and_clean_data_file
    # Replace the dependency with the bug-free stand-in.
    convert_to_int_mock = mocker.patch("data.preprocessing_helpers.convert_to_int",
                                       side_effect=convert_to_int_bug_free)
    preprocess(raw_path, clean_path)
    # Check that preprocess() called the dependency with these arguments, in order.
    assert convert_to_int_mock.call_args_list == [call("1,801"), call("201,411"), call("2,002"), call("333,209"), call("1990"), call("782,911"), call("1,285"), call("389129")]
    with open(clean_path, "r") as f:
        lines = f.readlines()
    # Lines from readlines() contain a real tab and end with a real newline,
    # so the expected strings use "\t" and "\n" escapes rather than literal
    # backslash characters.
    first_line = lines[0]
    assert first_line == "1801\t201411\n"
    second_line = lines[1]
    assert second_line == "2002\t333209\n"
### Testing on linear data
import numpy as np
import pytest
from models.train import model_test
def test_on_perfect_fit():
    """model_test() should return 1 when the testing points lie exactly on the line."""
    # Every point satisfies y = 2x + 1, matching the slope and intercept below.
    linear_points = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
    expected = 1
    actual = model_test(linear_points, slope=2.0, intercept=1.0)
    message = "Expected: {0}, Actual: {1}".format(expected, actual)
    assert actual == pytest.approx(expected), message
### Testing on circular data
def test_on_circular_data(self):
    """model_test() with a flat line through the origin should return 0.0 on circular data.

    NOTE(review): takes `self` and uses pi/cos/sin without importing them
    here, so presumably this belongs inside a test class in a module that
    provides those names - confirm.
    """
    theta = pi/4.0
    # Eight points on the unit circle, one every theta radians.
    circle_points = [
        [1.0, 0.0],
        [cos(theta), sin(theta)],
        [0.0, 1.0],
        [cos(3 * theta), sin(3 * theta)],
        [-1.0, 0.0],
        [cos(5 * theta), sin(5 * theta)],
        [0.0, -1.0],
        [cos(7 * theta), sin(7 * theta)],
    ]
    test_argument = np.array(circle_points)
    actual = model_test(test_argument, slope=0.0, intercept=0.0)
    assert actual == pytest.approx(0.0)
### Fix the plotting function
import matplotlib.pyplot as plt
import numpy as np
def get_plot_for_best_fit_line(slope, intercept, x_array, y_array, title):
    """Plot the data points together with a straight line and return the figure.

    Args:
        slope: slope of the line to draw.
        intercept: y value of the line at x = 0.
        x_array: x coordinates of the data points.
        y_array: y coordinates of the data points.
        title: title given to the axes.

    Returns:
        The matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots()
    # Data points are drawn as dots.
    ax.plot(x_array, y_array, ".")
    # The line runs from x = 0 to the largest x value in the data.
    largest_x = np.max(x_array)
    line_x = [0, largest_x]
    line_y = [intercept, slope * largest_x + intercept]
    ax.plot(line_x, line_y, "-")
    ax.set(xlabel="area (square feet)", ylabel="price (dollars)", title=title)
    return fig