forked from tempesta-tech/tempesta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
linux-4.1-tfw.patch
2250 lines (2102 loc) · 65.9 KB
/
linux-4.1-tfw.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index f4b78ea..b10de96 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -1,5 +1,5 @@
-
- Linux kernel coding style
+ Tempesta coding style
+ (based on Linux kernel coding style)
This is a short document describing the preferred coding style for the
linux kernel. Coding style is very personal, and I won't _force_ my
@@ -86,6 +86,17 @@ are placed substantially to the right. The same applies to function headers
with a long argument list. However, never break user-visible strings such as
printk messages, because that breaks the ability to grep for them.
+Function calls spanning multiple lines must have their arguments aligned
+properly on the second and subsequent lines (see
+https://lkml.org/lkml/2013/6/6/703). This means:
+
+ function(arg1, arg2,
+ arg3, arg4);
+
+You must use the appropriate number of TAB and space characters to achieve this
+proper column alignment, rather than using TAB characters only.
+
Chapter 3: Placing Braces and Spaces
@@ -116,7 +127,8 @@ while, do). E.g.:
However, there is one special case, namely functions: they have the
opening brace at the beginning of the next line, thus:
- int function(int x)
+ int
+ function(int x)
{
body of function
}
@@ -139,9 +151,11 @@ and
if (x == y) {
..
- } else if (x > y) {
+ }
+ else if (x > y) {
...
- } else {
+ }
+ else {
....
}
@@ -153,6 +167,9 @@ supply of new-lines on your screen is not a renewable resource (think
25-line terminal screens here), you have more empty lines to put
comments on.
+However, please note that "else" and "else if" statements begin on a new line
+to make conditions more explicit and improve code readability.
+
Do not unnecessarily use braces where a single statement will do.
if (condition)
@@ -272,76 +289,27 @@ See chapter 6 (Functions).
Chapter 5: Typedefs
-Please don't use things like "vps_t".
-It's a _mistake_ to use typedef for structures and pointers. When you see a
+The Linux kernel frowns on CamelCase, and in general so do we, except for type
+names. Tempesta uses CamelCase, usually prefixed with "Tfw", for type names and
+uses typedefs for them. So instead of
vps_t a;
-
-in the source, what does it mean?
-In contrast, if it says
-
struct virtual_container *a;
-you can actually tell what "a" is.
-
-Lots of people think that typedefs "help readability". Not so. They are
-useful only for:
-
- (a) totally opaque objects (where the typedef is actively used to _hide_
- what the object is).
-
- Example: "pte_t" etc. opaque objects that you can only access using
- the proper accessor functions.
-
- NOTE! Opaqueness and "accessor functions" are not good in themselves.
- The reason we have them for things like pte_t etc. is that there
- really is absolutely _zero_ portably accessible information there.
-
- (b) Clear integer types, where the abstraction _helps_ avoid confusion
- whether it is "int" or "long".
-
- u8/u16/u32 are perfectly fine typedefs, although they fit into
- category (d) better than here.
-
- NOTE! Again - there needs to be a _reason_ for this. If something is
- "unsigned long", then there's no reason to do
-
- typedef unsigned long myflags_t;
-
- but if there is a clear reason for why it under certain circumstances
- might be an "unsigned int" and under other configurations might be
- "unsigned long", then by all means go ahead and use a typedef.
-
- (c) when you use sparse to literally create a _new_ type for
- type-checking.
-
- (d) New types which are identical to standard C99 types, in certain
- exceptional circumstances.
+we write
- Although it would only take a short amount of time for the eyes and
- brain to become accustomed to the standard types like 'uint32_t',
- some people object to their use anyway.
+ TfwVps a;
+ TfwVirtualContainer *a;
- Therefore, the Linux-specific 'u8/u16/u32/u64' types and their
- signed equivalents which are identical to standard types are
- permitted -- although they are not mandatory in new code of your
- own.
+CamelCase helps to distinguish type names from other names, while the "_t"
+suffix just makes the name longer. We also use C++ for user-space code, and it
+is quite unusual in C++ to write something like
- When editing existing code which already uses one or the other set
- of types, you should conform to the existing choices in that code.
+ struct A a(0);
+ class B b(a);
- (e) Types safe for use in userspace.
-
- In certain structures which are visible to userspace, we cannot
- require C99 types and cannot use the 'u32' form above. Thus, we
- use __u32 and similar types in all structures which are shared
- with userspace.
-
-Maybe there are other cases too, but the rule should basically be to NEVER
-EVER use a typedef unless you can clearly match one of those rules.
-
-In general, a pointer, or a struct that has elements that can reasonably
-be directly accessed should _never_ be a typedef.
+And we're trying to be good for C as well as for C++. Typedefs make C code
+look closer to C++ and that's also good for us.
Chapter 6: Functions
@@ -375,7 +343,8 @@ In source files, separate functions with one blank line. If the function is
exported, the EXPORT* macro for it should follow immediately after the closing
function brace line. E.g.:
- int system_is_up(void)
+ int
+ system_is_up(void)
{
return system_state == SYSTEM_RUNNING;
}
@@ -385,6 +354,31 @@ In function prototypes, include parameter names with their data types.
Although this is not required by the C language, it is preferred in Linux
because it is a simple way to add valuable information for the reader.
+Let's consider the following *bad* example of a C++ function definition:
+
+ template<typename T> inline SomeOtherTemplate<arg1, arg2> the_function(
+ int a, char b, char c)
+ {
+ .....
+ }
+
+In this example the function type occupies almost the full line, so there is no
+space for the name of the function and its arguments. To make function
+declarations more convenient, we place the template arguments on the first
+line, the function return type on the second one, and the function name with
+its arguments on the last line:
+
+ template<typename T>
+ inline SomeOtherTemplate<arg1, arg2>
+ the_function(int a, char b,
+ char c)
+ {
+ .....
+ }
+
+Please note that if some function arguments ("c" in the example above) don't
+fit on one line, they are placed on the next line, indented with tabs and
+spaces to align with the first argument.
+
Chapter 7: Centralized exiting of functions
@@ -408,7 +402,8 @@ The rationale for using gotos is:
modifications are prevented
- saves the compiler work to optimize redundant code away ;)
- int fun(int a)
+ int
+ fun(int a)
{
int result = 0;
char *buffer;
@@ -462,7 +457,8 @@ See the files Documentation/kernel-doc-nano-HOWTO.txt and scripts/kernel-doc
for details.
Linux style for comments is the C89 "/* ... */" style.
-Don't use C99-style "// ..." comments.
+However, in application-level code you can use C99 or C++-style "// ..."
+comments.
The preferred style for long (multi-line) comments is:
@@ -475,16 +471,6 @@ The preferred style for long (multi-line) comments is:
* with beginning and ending almost-blank lines.
*/
-For files in net/ and drivers/net/ the preferred style for long (multi-line)
-comments is a little different.
-
- /* The preferred comment style for files in net/ and drivers/net
- * looks like this.
- *
- * It is nearly the same as the generally preferred comment style,
- * but there is no initial almost-blank line.
- */
-
It's also important to comment data, whether they are basic types or derived
types. To this end, use just one data declaration per line (no commas for
multiple data declarations). This leaves you room for a small comment on each
@@ -568,7 +554,15 @@ config AUDIT
logging of avc messages output). Does not do system-call
auditing without CONFIG_AUDITSYSCALL.
-Seriously dangerous features (such as write support for certain
+Features that might still be considered unstable should be defined as
+dependent on "EXPERIMENTAL":
+
+config SLUB
+ depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT
+ bool "SLUB (Unqueued Allocator)"
+ ...
+
+while seriously dangerous features (such as write support for certain
filesystems) should advertise this prominently in their prompt string:
config ADFS_FS_RW
@@ -716,9 +710,8 @@ used.
Chapter 14: Allocating memory
The kernel provides the following general purpose memory allocators:
-kmalloc(), kzalloc(), kmalloc_array(), kcalloc(), vmalloc(), and
-vzalloc(). Please refer to the API documentation for further information
-about them.
+kmalloc(), kzalloc(), kcalloc(), vmalloc(), and vzalloc(). Please refer to
+the API documentation for further information about them.
The preferred form for passing a size of a struct is the following:
@@ -732,17 +725,6 @@ Casting the return value which is a void pointer is redundant. The conversion
from void pointer to any other pointer type is guaranteed by the C programming
language.
-The preferred form for allocating an array is the following:
-
- p = kmalloc_array(n, sizeof(...), ...);
-
-The preferred form for allocating a zeroed array is the following:
-
- p = kcalloc(n, sizeof(...), ...);
-
-Both forms check for overflow on the allocation size n * sizeof(...),
-and return NULL if that occurred.
-
Chapter 15: The inline disease
@@ -823,7 +805,109 @@ need them. Feel free to peruse that header file to see what else is already
defined that you shouldn't reproduce in your code.
- Chapter 18: Editor modelines and other cruft
+ Chapter 18: C++ and other stuff
+
+We use the same coding style for kernel C and application C++ programming, so
+we adjust the original guide with some C++ specific things.
+
+Although the Linux kernel doesn't like capital letters and typedefs for
+structures, it is uncommon in C++ to write
+
+ struct virtual_container *a;
+
+or similarly
+
+ class virtual_container *a;
+
+so to keep consistency between structures and classes we should write
+
+ virtual_container_t *a;
+
+where virtual_container_t is a class typedef'ed structure. To be able to tell
+what "a" is we name classes and structures the same way, using CamelCase.
+So "a" becomes
+
+ VirtualContainer *a.
+
+C++ projects often include C, C++/STL, Boost and/or other header files.
+It is desirable to sort them as follows: first C headers, then standard C++
+headers, headers from other standard libraries, and lastly the project's
+internal headers:
+
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #include <iostream>
+ #include <vector>
+
+ #include <boost/bind.hpp>
+
+ #include "daemon.h"
+ #include "mem.h"
+
+It is also a good idea to sort includes alphabetically.
+
+By the way, C++ is a very powerful tool which in many cases follows the
+TIMTOWTDI (There Is More Than One Way To Do It) concept. For example, in some
+cases you can use a simple C array, an STL vector or a Boost array. In such
+cases keep your code as simple as possible. It means that if there is no
+difference, then you should prefer a C array to an STL vector and an STL vector
+to a Boost array.
+
+If code operates with many function pointers, then it is sometimes difficult
+to find which function a pointer refers to. So to simplify code navigation,
+function pointer types must be defined as <type_name>_t, functions which are
+used with the types should be named as <some_prefix>_<type_name> and variables
+of the type as <some_prefix>_<type_name> or <type_name>_<some_suffix>. E.g.:
+
+ typedef void (*some_func_t)(void);
+
+ void xxx_some_func(void) {};
+ void yyy_some_func(void) {};
+
+ some_func_t var1_some_func = xxx_some_func;
+ some_func_t some_func_var2 = yyy_some_func;
+
+This way it is pretty simple to grep the whole code for all functions which can
+be referred to by "var1_some_func". An exception to the rule could be variable
+names defined and used within a single function scope.
+
+In the Linux kernel the pointer specifier "*" is placed next to the argument
+name. There is no reason to treat the C++ reference "&" differently. So prefer
+this style
+
+ int &func();
+ int *a, &b = a;
+
+to this *bad* style:
+
+ int& func();
+ int* a, & b = a;
+
+In C++, class methods (a special type of function) can operate on global
+variables, local variables, or class members. So to differentiate class members
+from global and local variables it is a good idea to use the "_" suffix:
+
+ class A {
+ int a_;
+ };
+
+Exception safety. All functions which don't throw exceptions MUST specify this
+with the noexcept specifier:
+
+ void func() noexcept;
+
+Yes, this creates a risk of getting an unexpected exception. But such exceptions
+must be caught and fixed during the test phase. And, yes, each time you need to
+modify the source code of a function you have to check that your changes don't
+affect the function's exception specification. However, it makes it much simpler
+to analyze which work flow could lead to which exceptions.
+
+Do not use the following stupid "safe programming" style for conditions:
+
+ if (5 == a)
+
+It makes the code hard to read, while preventing only a very rare type of
+misprint.
+
+
+ Chapter 19: Editor modelines and other cruft
Some editors can interpret configuration information embedded in source files,
indicated with special markers. For example, emacs interprets lines marked
@@ -850,7 +934,7 @@ own custom mode, or may have some other magic method for making indentation
work correctly.
- Chapter 19: Inline assembly
+ Chapter 20: Inline assembly
In architecture-specific code, you may need to use inline assembly to interface
with CPU or platform functionality. Don't hesitate to do so when necessary.
@@ -879,7 +963,7 @@ next instruction in the assembly output:
: /* outputs */ : /* inputs */ : /* clobbers */);
- Chapter 20: Conditional Compilation
+ Chapter 21: Conditional Compilation
Wherever possible, don't use preprocessor conditionals (#if, #ifdef) in .c
files; doing so makes code harder to read and logic harder to follow. Instead,
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cd03a0f..6465166 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3554,6 +3554,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
tdfx= [HW,DRM]
+ tempesta_dbmem= [KNL]
+ Order of 2MB memory blocks reserved on each NUMA node
+ for Tempesta database. Huge pages are used if
+ possible.
+
test_suspend= [SUSPEND][,N]
Specify "mem" (for Suspend-to-RAM) or "standby" (for
standby suspend) or "freeze" (for suspend type freeze)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6c9cb60..505dfd3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1128,7 +1128,11 @@ ENTRY(do_softirq_own_stack)
incl PER_CPU_VAR(irq_count)
cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
+#ifdef CONFIG_SECURITY_TEMPESTA
+ call __tempesta_do_softirq_fpusafe
+#else
call __do_softirq
+#endif
leaveq
CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 394e643..7c7f954 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -74,6 +74,26 @@ static inline void stack_overflow_check(struct pt_regs *regs)
#endif
}
+#ifdef CONFIG_SECURITY_TEMPESTA
+/* Tempesta supports x86-64 only. */
+#include <asm/i387.h>
+
+/*
+ * Softirq entry point for CONFIG_SECURITY_TEMPESTA builds:
+ * do_softirq_own_stack() calls this wrapper instead of __do_softirq(),
+ * so the whole softirq run executes between kernel_fpu_begin() and
+ * kernel_fpu_end().
+ */
+void
+__tempesta_do_softirq_fpusafe(void)
+{
+ /*
+ * Switch FPU context once per budget packets to let Tempesta
+ * run many vector operations w/o costly FPU switches.
+ * Eager FPU must be enabled.
+ */
+ kernel_fpu_begin();
+
+ __do_softirq();
+
+ kernel_fpu_end();
+}
+#endif
+
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
struct irq_desc *desc;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 87a815b..84b7399 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -565,7 +565,7 @@ static void __init setup_init_fpu_buf(void)
xsave_state_booting(init_xstate_buf, -1);
}
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = ENABLE;
static int __init eager_fpu_setup(char *s)
{
if (!strcmp(s, "on"))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 950ae45..0f1f587 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -402,13 +402,13 @@ extern bool force_irqthreads;
tasklets are more than enough. F.e. all serial device BHs et
al. should be converted to tasklets, not to softirqs.
*/
-
+/* Tempesta: process RX before TX to proxy traffic in one softirq shot. */
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
- NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
+ NET_TX_SOFTIRQ,
BLOCK_SOFTIRQ,
BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
@@ -452,7 +452,7 @@ extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
-extern void raise_softirq(unsigned int nr);
+void raise_softirq(unsigned int nr);
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
diff --git a/include/linux/net.h b/include/linux/net.h
index 738ea48..7944e97 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -192,6 +192,8 @@ struct net_proto_family {
struct module *owner;
};
+extern const struct net_proto_family *get_proto_family(int family);
+
struct iovec;
struct kvec;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4307e20..f915c87 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -538,8 +538,12 @@ struct sk_buff {
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
+ *
+ * Tempesta. Extend the control block from original 48 bytes to
+ * 64, so we can place our own control block at the end of @cb
+ * and safely pass the skb to TCP and IP layers.
*/
- char cb[48] __aligned(8);
+ char cb[64] __aligned(8);
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
@@ -567,8 +571,10 @@ struct sk_buff {
fclone:2,
peeked:1,
head_frag:1,
+#ifdef CONFIG_SECURITY_TEMPESTA
+ skb_page:1,
+#endif
xmit_more:1;
- /* one bit hole */
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
@@ -3452,5 +3458,28 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
skb_network_header(skb);
return hdr_len + skb_gso_transport_seglen(skb);
}
+
+/*
+ * ------------------------------------------------------------------------
+ * Tempesta FW
+ * ------------------------------------------------------------------------
+ */
+/*
+ * We use this additional skb list to be able to reference skbs which are
+ * processed by the standard Linux TCP/IP stack w/o skb cloning.
+ *
+ * @next, @prev	- links of the additional skb list.
+ */
+typedef struct {
+	struct sk_buff	*next;
+	struct sk_buff	*prev;
+} SsSkbCb;
+
+/*
+ * Tempesta's control block occupies the last sizeof(SsSkbCb) bytes of the
+ * skb control buffer @cb.
+ */
+#define TFW_SKB_CB(s)	((SsSkbCb *)((s)->cb + sizeof((s)->cb)		\
+				     - sizeof(SsSkbCb)))
+/*
+ * Reset both list links. The argument is evaluated exactly once, so it is
+ * safe to pass an expression with side effects.
+ */
+#define TFW_SKB_CB_INIT(skb)						\
+do {									\
+	SsSkbCb *__tfw_cb = TFW_SKB_CB(skb);				\
+	__tfw_cb->prev = NULL;						\
+	__tfw_cb->next = NULL;						\
+} while (0)
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
diff --git a/include/linux/tempesta.h b/include/linux/tempesta.h
new file mode 100644
index 0000000..55049bd
--- /dev/null
+++ b/include/linux/tempesta.h
@@ -0,0 +1,54 @@
+/**
+ * Linux interface for Tempesta FW.
+ *
+ * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com).
+ * Copyright (C) 2015-2016 Tempesta Technologies, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef __TEMPESTA_H__
+#define __TEMPESTA_H__
+
+#include <net/sock.h>
+
+/* Callback type installed with tempesta_set_tx_action(). */
+typedef void (*TempestaTxAction)(void);
+
+/*
+ * Table of hooks handed to tempesta_register_ops() and
+ * tempesta_unregister_ops().
+ * NOTE(review): the names suggest socket allocation, socket release and TCP
+ * receive hooks; the call sites are not part of this header - confirm there.
+ */
+typedef struct {
+ int (*sk_alloc)(struct sock *sk);
+ void (*sk_free)(struct sock *sk);
+ int (*sock_tcp_rcv)(struct sock *sk, struct sk_buff *skb);
+} TempestaOps;
+
+/*
+ * Memory region reserved for Tempesta on one NUMA node.
+ * tempesta_reserve_pages() fills @addr with page_address() of the first
+ * reserved page and @pages with the region size in 4KB pages.
+ */
+typedef struct {
+ unsigned long addr;
+ unsigned long pages; /* number of 4KB pages */
+} TempestaMapping;
+
+/* Security hooks. */
+int tempesta_new_clntsk(struct sock *newsk);
+void tempesta_register_ops(TempestaOps *tops);
+void tempesta_unregister_ops(TempestaOps *tops);
+
+/* Network hooks. */
+void tempesta_set_tx_action(TempestaTxAction action);
+void tempesta_del_tx_action(void);
+
+/*
+ * Memory management.
+ * mm_init() calls tempesta_reserve_pages() right after mem_init() and
+ * tempesta_reserve_vmpages() after ioremap_huge_init().
+ * NOTE(review): tempesta_get_mapping() presumably returns the mapping of
+ * @node through @tm; its definition is not visible here - confirm.
+ */
+void tempesta_reserve_pages(void);
+void tempesta_reserve_vmpages(void);
+int tempesta_get_mapping(int node, TempestaMapping **tm);
+
+#endif /* __TEMPESTA_H__ */
+
diff --git a/include/net/sock.h b/include/net/sock.h
index ed01a01..60a4392 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -849,9 +849,14 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
return sk->sk_backlog_rcv(sk, skb);
}
+#define TFW_SK_CPU_INIT USHRT_MAX
+
static inline void sk_incoming_cpu_update(struct sock *sk)
{
- sk->sk_incoming_cpu = raw_smp_processor_id();
+#ifdef CONFIG_SECURITY_TEMPESTA
+ if (sk->sk_incoming_cpu == TFW_SK_CPU_INIT)
+#endif
+ sk->sk_incoming_cpu = raw_smp_processor_id();
}
static inline void sock_rps_record_flow_hash(__u32 hash)
@@ -1697,8 +1702,7 @@ unsigned long sock_i_ino(struct sock *sk);
static inline struct dst_entry *
__sk_dst_get(struct sock *sk)
{
- return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ return rcu_dereference_raw(sk->sk_dst_cache);
}
static inline struct dst_entry *
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d204f3..ab4c696 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -340,6 +340,7 @@ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
}
extern struct proto tcp_prot;
+extern struct proto tcpv6_prot;
#define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_INC_STATS_BH(net, field) SNMP_INC_STATS_BH((net)->mib.tcp_statistics, field)
@@ -578,6 +579,16 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);
+/* Routines required by Tempesta FW. */
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
+extern void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
+ int size_goal);
+extern int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
+extern void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
+extern void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
+extern void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
+extern int tcp_close_state(struct sock *sk);
+
/* Read 'sendfile()'-style from a TCP socket */
typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
unsigned int, size_t);
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 1a85940..909e081 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -27,6 +27,7 @@
#define NETLINK_ECRYPTFS 19
#define NETLINK_RDMA 20
#define NETLINK_CRYPTO 21 /* Crypto layer */
+#define NETLINK_TEMPESTA 22
#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG
diff --git a/init/main.c b/init/main.c
index 2a89545..0a24b7d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -88,6 +88,8 @@
#include <asm/sections.h>
#include <asm/cacheflush.h>
+#include <linux/tempesta.h>
+
static int kernel_init(void *);
extern void init_IRQ(void);
@@ -482,11 +484,25 @@ static void __init mm_init(void)
*/
page_ext_init_flatmem();
mem_init();
+
+ /*
+ * Tempesta: reserve pages as soon as zones are initialized
+ * to get a contiguous address space of huge pages.
+ */
+#ifdef CONFIG_SECURITY_TEMPESTA
+ tempesta_reserve_pages();
+#endif
+
kmem_cache_init();
percpu_init_late();
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+
+ /* Try vmalloc() if the previous one failed. */
+#ifdef CONFIG_SECURITY_TEMPESTA
+ tempesta_reserve_vmpages();
+#endif
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e443..f951010 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -423,6 +423,7 @@ void raise_softirq(unsigned int nr)
raise_softirq_irqoff(nr);
local_irq_restore(flags);
}
+EXPORT_SYMBOL(raise_softirq);
void __raise_softirq_irqoff(unsigned int nr)
{
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eae..33cf154 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_SECURITY_TEMPESTA) += tempesta_mm.o
diff --git a/mm/tempesta_mm.c b/mm/tempesta_mm.c
new file mode 100644
index 0000000..3909f7f
--- /dev/null
+++ b/mm/tempesta_mm.c
@@ -0,0 +1,284 @@
+/**
+ * Tempesta Memory Reservation
+ *
+ * Copyright (C) 2015 Tempesta Technologies, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/gfp.h>
+#include <linux/hugetlb.h>
+#include <linux/tempesta.h>
+#include <linux/topology.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+
+/*
+ * Bounds and default for the "tempesta_dbmem=" order: (1 << order) huge pages
+ * are reserved per node.
+ */
+#define MAX_PGORDER 16 /* 128GB per one table */
+#define MIN_PGORDER 0 /* 2MB - one extent */
+#define DEFAULT_PGORDER 8 /* 512MB */
+/*
+ * Modern processors support up to 1.5TB of RAM, be ready for 2TB.
+ * This is the number of slots in the greedy[] table below.
+ */
+#define GREEDY_ARNUM (1024 * 1024 + 1)
+/* Number of huge pages to reserve per node. */
+#define PGNUM (1 << pgorder)
+/* The same amount expressed in base pages. */
+#define PGNUM4K (PGNUM * (1 << HUGETLB_PAGE_ORDER))
+
+/* Reservation order; may be overridden by the "tempesta_dbmem=" parameter. */
+static int pgorder = DEFAULT_PGORDER;
+/* Allocation flags used for every huge page in tempesta_alloc_hpage(). */
+static gfp_t gfp_f = GFP_HIGHUSER | __GFP_COMP | __GFP_THISNODE | __GFP_ZERO
+ | __GFP_REPEAT |__GFP_NOWARN;
+/* Reserved region descriptors, indexed by NUMA node id. */
+static TempestaMapping map[MAX_NUMNODES];
+/*
+ * Table of huge pages grabbed by tempesta_alloc_contmem(), indexed by the
+ * page's huge-page number relative to PAGE_OFFSET.
+ *
+ * Modern x86-64 has not more than 512GB RAM per physical node.
+ * This is a very large amount of memory, but it will be freed when
+ * the initialization phase ends.
+ */
+static struct page *greedy[GREEDY_ARNUM] __initdata = { 0 };
+
+/*
+ * Handler of the "tempesta_dbmem=" boot parameter.
+ *
+ * @str	- option value; get_option() stores the parsed integer, if any,
+ *	  in pgorder.
+ *
+ * A value outside [MIN_PGORDER:MAX_PGORDER] is reported and replaced by the
+ * nearest bound. Always returns 1.
+ */
+static int __init
+tempesta_setup_pages(char *str)
+{
+	int asked;
+
+	get_option(&str, &pgorder);
+	asked = pgorder;
+
+	/* Clamp to the supported range. */
+	if (asked < MIN_PGORDER)
+		pgorder = MIN_PGORDER;
+	else if (asked > MAX_PGORDER)
+		pgorder = MAX_PGORDER;
+
+	/* Report the rejected value, not the one it was replaced with. */
+	if (pgorder != asked)
+		pr_err("Tempesta: bad dbmem value %d, must be [%d:%d]\n",
+		       asked, MIN_PGORDER, MAX_PGORDER);
+
+	return 1;
+}
+__setup("tempesta_dbmem=", tempesta_setup_pages);
+
+/**
+ * Allocate one huge page (order HUGETLB_PAGE_ORDER) on NUMA node @nid using
+ * the gfp_f flags and prepare it as a compound, refcounted page.
+ *
+ * The code is somewhat stolen from mm/hugetlb.c.
+ *
+ * Returns the page, or NULL if the allocation or the architecture specific
+ * preparation failed. Nothing stays allocated on failure.
+ */
+static struct page *
+tempesta_alloc_hpage(int nid)
+{
+	struct page *p;
+
+	p = alloc_pages_exact_node(nid, gfp_f, HUGETLB_PAGE_ORDER);
+	if (!p)
+		return NULL;
+
+	if (arch_prepare_hugepage(p)) {
+		pr_err("Tempesta: cannot prepare hugepage %p at node %d\n",
+		       p, nid);
+		/* Give the page back, otherwise it is leaked. */
+		__free_pages(p, HUGETLB_PAGE_ORDER);
+		return NULL;
+	}
+
+	count_vm_event(HTLB_BUDDY_PGALLOC);
+
+	__ClearPageReserved(p);
+	prep_compound_page(p, HUGETLB_PAGE_ORDER);
+
+	/* Acquire the page immediately. */
+	set_page_refcounted(p);
+
+	return p;
+}
+
+/* Release huge page @p (order HUGETLB_PAGE_ORDER) with __free_pages(). */
+static void
+tempesta_free_hpage(struct page *p)
+{
+ __free_pages(p, HUGETLB_PAGE_ORDER);
+}
+
+/**
+ * Greedily allocate huge pages on node @nid and try to find a contiguous
+ * region organized by the sorted set of allocated pages. When the region is
+ * found, all pages outside of it are returned to the system.
+ *
+ * Every allocated page is stored in greedy[] at the index equal to its
+ * huge-page number relative to PAGE_OFFSET, so neighbouring indexes mean
+ * neighbouring pages.
+ *
+ * [min, max]	- lowest and highest index allocated by this call;
+ * [start, end]	- the contiguous run of pages currently tracked.
+ *
+ * Returns the first page of a run of exactly PGNUM pages, or NULL if the
+ * allocator ran out of pages first; in the latter case every page taken by
+ * this call is freed. On success the entries of the returned run stay set
+ * in greedy[].
+ */
+static struct page *
+tempesta_alloc_contmem(int nid)
+{
+	long min = -1, start = -1, curr = 0, end = -1, max = -1;
+	struct page *p;
+
+	while (1) {
+		p = tempesta_alloc_hpage(nid);
+		if (!p)
+			goto err;
+		curr = ((long)page_address(p) - PAGE_OFFSET) >> HPAGE_SHIFT;
+		/*
+		 * The first kernel mapped page is always reserved.
+		 * Keep untouched (zero) bounds for faster lookups.
+		 */
+		BUG_ON(curr < 1 || curr >= GREEDY_ARNUM);
+		greedy[curr] = p;
+
+		/* First time initialization. */
+		if (min < 0) {
+			min = start = end = max = curr;
+		} else {
+			/* Update bounds for faster pages return. */
+			if (min > curr)
+				min = curr;
+			if (max < curr)
+				max = curr;
+			/*
+			 * Update contiguous memory segment bounds.
+			 * Never look outside [min, max]: slots beyond these
+			 * bounds were not filled by this call and the slot
+			 * after the last one does not exist at all.
+			 */
+			if (curr == end + 1) {
+				while (end < max && greedy[end + 1])
+					++end;
+			}
+			else if (curr + 1 == start) {
+				while (start > min && greedy[start - 1])
+					--start;
+			}
+			else {
+				/* Try to find new contiguous segment. */
+				long i, d_max = 0, good_start = min;
+
+				start = min;
+				for (i = min; i <= max; ++i) {
+					if (greedy[i]) {
+						if (start == -1)
+							start = i;
+						end = i;
+						if (i - start + 1 == PGNUM)
+							break;
+						continue;
+					}
+
+					/* A hole: remember the best run so far. */
+					if (start > 0 && end - start > d_max) {
+						good_start = start;
+						d_max = end - start;
+					}
+					start = -1;
+				}
+				if (end - start < d_max) {
+					start = good_start;
+					end = start + d_max;
+				}
+			}
+		}
+
+		/*
+		 * Joining two runs can make the segment longer than PGNUM
+		 * in one step, so test with ">=" and cut the tail: pages
+		 * above @end are returned to the system below.
+		 */
+		if (end - start + 1 >= PGNUM) {
+			end = start + PGNUM - 1;
+			break; /* contiguous space is built! */
+		}
+	}
+
+	/* Return unnecessary pages. */
+	BUG_ON(min < 0 || start < 0 || end < 0 || max < 0);
+	for ( ; min < start; ++min)
+		if (greedy[min]) {
+			tempesta_free_hpage(greedy[min]);
+			greedy[min] = NULL;
+		}
+	for ( ; max > end; --max)
+		if (greedy[max]) {
+			tempesta_free_hpage(greedy[max]);
+			greedy[max] = NULL;
+		}
+	return greedy[start];
+
+err:
+	pr_err("Tempesta: cannot allocate %u continous huge pages at node"
+	       " %d\n", PGNUM, nid);
+	/* Free everything this call has allocated so far. */
+	for ( ; min >= 0 && min <= max; ++min)
+		if (greedy[min]) {
+			tempesta_free_hpage(greedy[min]);
+			greedy[min] = NULL;
+		}
+	return NULL;
+}
+
+/**
+ * Allocate contiguous virtual space of huge pages for Tempesta.
+ * We do not use gigantic 1GB pages since not all modern x86-64 CPUs
+ * allow them in virtualized mode.
+ *
+ * TODO: first try to allocate gigantic pages, then huge pages, and finally
+ * fall back to common 4KB pages allocation if the previous tries failed.
+ */
+void __init
+tempesta_reserve_pages(void)
+{
+ int nid;
+ struct page *p;
+
+ for_each_online_node(nid) {
+ p = tempesta_alloc_contmem(nid);
+ if (!p)
+ goto err;
+
+ map[nid].addr = (unsigned long)page_address(p);
+ map[nid].pages = PGNUM4K;