Add the basic framework for the new layer: object storage device. And…

… fix some bugs: 1. __aur_itb_split -> au_submit conflict with xfree; 2. mdsl_init -> mdsl_pre_init which zeroes valid fields;
macan · Sep 3, 2012 · c73550a · c73550a
1 parent c4b4475
commit c73550a
Show file tree

Hide file tree

Showing 83 changed files with 4,857 additions and 271 deletions.
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 # Copyright (c) 2009 Ma Can <ml.macana@gmail.com>
 #                           <macan@ncic.ac.cn>
 #
-# Time-stamp: <2011-09-17 00:31:18 macan>
+# Time-stamp: <2012-08-10 13:57:59 macan>
 #
 # This is the makefile for HVFS project.
 #
@@ -18,62 +18,68 @@ RING_SOURCES = $(LIB_PATH)/ring.c $(LIB_PATH)/lib.c $(LIB_PATH)/hash.c \
 all : unit_test lib triggers
 
 $(HVFS_LIB) : $(lib_depend_files)
-	@echo -e " " CD"\t" $(LIB_PATH)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(LIB_PATH)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(LIB_PATH) -e "HOME_PATH=$(HOME_PATH)"
 
 $(MDS_LIB) : $(mds_depend_files)
-	@echo -e " " CD"\t" $(MDS)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(MDS)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(MDS) -e "HOME_PATH=$(HOME_PATH)"
 
 $(MDSL_LIB) : $(mdsl_depend_files)
-	@echo -e " " CD"\t" $(MDSL)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(MDSL)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(MDSL) -e "HOME_PATH=$(HOME_PATH)"
 
+$(OSD_LIB) : $(osd_depend_files)
+	@$(ECHO) -e " " CD"\t" $(OSD)
+	@$(ECHO) -e " " MK"\t" $@
+	@$(MAKE) --no-print-directory -C $(OSD) -e "HOME_PATH=$(HOME_PATH)"
+
 $(R2_LIB) : $(r2_depend_files)
-	@echo -e " " CD"\t" $(R2)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(R2)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(R2) -e "HOME_PATH=$(HOME_PATH)"
 
 $(XNET_LIB) : $(xnet_depend_files)
-	@echo -e " " CD"\t" $(XNET)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(XNET)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(XNET) -e "HOME_PATH=$(HOME_PATH)"
 
 $(API_LIB) : $(api_depend_files)
-	@echo -e " " CD"\t" $(API)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(API)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(API) -e "HOME_PATH=$(HOME_PATH)"
 
 $(BRANCH_LIB) : $(branch_depend_files)
-	@echo -e " " CD"\t" $(BRANCH)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(BRANCH)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(BRANCH) -e "HOME_PATH=$(HOME_PATH)"
 
 ifdef USE_FUSE
 $(FUSE_LIB) : $(fuse_depend_files)
-	@echo -e " " CD"\t" $(FUSE)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(FUSE)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(FUSE) -e "HOME_PATH=$(HOME_PATH)"
 else
 $(FUSE_LIB) : $(fuse_depend_files)
-	@echo -e " " MK"\t" $@ " (Ignored! Use 'USE_FUSE=1' to enable fuse support.)"
+	@$(ECHO) -e " " MK"\t" $@ " (Ignored! Use 'USE_FUSE=1' to enable fuse support.)"
 endif
 
 triggers : $(triggers_depend_files) build_triggers
-	@echo "Triggers' dynamic library are ready."
+	@$(ECHO) "Triggers' dynamic library are ready."
 
 build_triggers : 
-	@echo -e " " CD"\t" $(TRIGGERS)
-	@echo -e " " MK"\t" $@
+	@$(ECHO) -e " " CD"\t" $(TRIGGERS)
+	@$(ECHO) -e " " MK"\t" $@
 	@$(MAKE) --no-print-directory -C $(TRIGGERS) -e "HOME_PATH=$(HOME_PATH)"
 
 clean :
 	@$(MAKE) --no-print-directory -C $(LIB_PATH) -e "HOME_PATH=$(HOME_PATH)" clean
 	@$(MAKE) --no-print-directory -C $(MDS) -e "HOME_PATH=$(HOME_PATH)" clean
 	@$(MAKE) --no-print-directory -C $(MDSL) -e "HOME_PATH=$(HOME_PATH)" clean
+	@$(MAKE) --no-print-directory -C $(OSD) -e "HOME_PATH=$(HOME_PATH)" clean
 	@$(MAKE) --no-print-directory -C $(R2) -e "HOME_PATH=$(HOME_PATH)" clean
 	@$(MAKE) --no-print-directory -C $(API) -e "HOME_PATH=$(HOME_PATH)" clean
 	@$(MAKE) --no-print-directory -C $(BRANCH) -e "HOME_PATH=$(HOME_PATH)" clean
@@ -90,44 +96,44 @@ depclean:
 	@$(MAKE) --no-print-directory -C $(TEST)/result -e "HOME_PATH=$(HOME_PATH)" clean
 
 help :
-	@echo "Environment Variables:"
-	@echo ""
-	@echo "1. USE_BDB           if defined, compile w/ BerkeleyDB support;"
-	@echo "                     otherwise, use plain file."
-	@echo ""
-	@echo "2. DISABLE_PYTHON    if defined, do not compile w/ Python C API."
-	@echo "                     otherwise, compile and link with libpython."
-	@echo ""
-	@echo "3. JEMALLOC          Must defined w/ jemalloc install path prefix;"
-	@echo "                     otherwise, we can find the jemalloc lib path."
-	@echo ""
-	@echo "4. USE_FUSE          if defined, link with libfuse;"
-	@echo "                     otherwise, ignore fuse client."
-	@echo ""
-	@echo "5. PYTHON_INC        python include path"
-	@echo ""
-	@echo "6. BDB_HOME          BerkeleyDB install path prefix."
+	@$(ECHO) "Environment Variables:"
+	@$(ECHO) ""
+	@$(ECHO) "1. USE_BDB           if defined, compile w/ BerkeleyDB support;"
+	@$(ECHO) "                     otherwise, use plain file."
+	@$(ECHO) ""
+	@$(ECHO) "2. DISABLE_PYTHON    if defined, do not compile w/ Python C API."
+	@$(ECHO) "                     otherwise, compile and link with libpython."
+	@$(ECHO) ""
+	@$(ECHO) "3. JEMALLOC          Must defined w/ jemalloc install path prefix;"
+	@$(ECHO) "                     otherwise, we can find the jemalloc lib path."
+	@$(ECHO) ""
+	@$(ECHO) "4. USE_FUSE          if defined, link with libfuse;"
+	@$(ECHO) "                     otherwise, ignore fuse client."
+	@$(ECHO) ""
+	@$(ECHO) "5. PYTHON_INC        python include path"
+	@$(ECHO) ""
+	@$(ECHO) "6. BDB_HOME          BerkeleyDB install path prefix."
 
 # Note: the following region is only for UNIT TESTing
 # region for unit test
 $(LIB_PATH)/ring : $(RING_SOURCES)
-	@echo -e " " CC"\t" $@
+	@$(ECHO) -e " " CC"\t" $@
 	@$(CC) $(CFLAGS) $^ -o $@ -DUNIT_TEST
 
-lib : $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB)
-	@echo -e " " Lib is ready.
+lib : $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) $(OSD_LIB)
+	@$(ECHO) -e " " Lib is ready.
 
 unit_test : $(ut_depend_files) $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) \
-			$(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB)
-	@echo -e " " CD"\t" $(TEST)/mds
+			$(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) $(OSD_LIB)
+	@$(ECHO) -e " " CD"\t" $(TEST)/mds
 	@$(MAKE) --no-print-directory -C $(TEST)/mds -e "HOME_PATH=$(HOME_PATH)"
-	@echo -e " " CD"\t" $(TEST)/xnet
+	@$(ECHO) -e " " CD"\t" $(TEST)/xnet
 	@$(MAKE) --no-print-directory -C $(TEST)/xnet -e "HOME_PATH=$(HOME_PATH)"
-	@echo -e " " CD"\t" $(TEST)/mdsl
+	@$(ECHO) -e " " CD"\t" $(TEST)/mdsl
 	@$(MAKE) --no-print-directory -C $(TEST)/mdsl -e "HOME_PATH=$(HOME_PATH)"
-	@echo -e " " CD"\t" $(TEST)/fuse
+	@$(ECHO) -e " " CD"\t" $(TEST)/fuse
 	@$(MAKE) --no-print-directory -C $(TEST)/fuse -e "HOME_PATH=$(HOME_PATH)"
-	@echo "Targets for unit test are ready."
+	@$(ECHO) "Targets for unit test are ready."
 
 install: unit_test triggers
 	@rsync -r $(TEST)/*.sh root@glnode09:~/hvfs/test/
@@ -141,7 +147,7 @@ install: unit_test triggers
 	@rsync -r $(TEST)/fuse/*.ut root@glnode09:~/hvfs/test/fuse/
 	@rsync -r $(TEST)/bdb/* root@glnode09:~/hvfs/test/bdb/
 	@rsync -r $(TEST)/python/*.py root@glnode09:~/hvfs/test/python/
-	@echo "Install done."
+	@$(ECHO) "Install done."
 
 xinstall: unit_test
 	@rsync -r $(TEST)/*.sh root@10.10.104.1:/home/macan/test/
@@ -150,12 +156,12 @@ xinstall: unit_test
 	@rsync -r $(TEST)/mds/*.ut root@10.10.104.1:/home/macan/test/mds/
 	@rsync -r $(TEST)/xnet/*.ut root@10.10.104.1:/home/macan/test/xnet/
 	@rsync -r $(TEST)/mdsl/*.ut root@10.10.104.1:/home/macan/test/mdsl/
-	@echo "Install done."
+	@$(ECHO) "Install done."
 
 plot: 
-	@echo -e "Ploting ..."
+	@$(ECHO) -e "Ploting ..."
 	@$(MAKE) --no-print-directory -C $(TEST)/result -e "HOME_PATH=$(HOME_PATH)" plot
-	@echo -e "Done.\n"
+	@$(ECHO) -e "Done.\n"
 
 rut:
 	@lagent -d glnode09 -u root -sc "time ~/cbht $(CBHT_ARGS)"

diff --git a/Makefile.inc b/Makefile.inc
@@ -2,7 +2,7 @@
 # Copyright (c) 2009 Ma Can <ml.macana@gmail.com>
 #                           <macan@ncic.ac.cn>
 #
-# Time-stamp: <2012-05-22 09:40:38 macan>
+# Time-stamp: <2012-08-10 13:58:08 macan>
 #
 # This is the makefile for HVFS project.
 #
@@ -13,7 +13,7 @@ LD = gcc
 AR = ar
 PYTHON = env python
 GIT = env git
-ECHO = echo
+ECHO = /bin/echo
 
 INC_PATH = $(HOME_PATH)/include
 LIB_PATH = $(HOME_PATH)/lib
@@ -28,6 +28,7 @@ API = $(HOME_PATH)/api
 TRIGGERS = $(HOME_PATH)/triggers
 BRANCH = $(HOME_PATH)/branch
 FUSE = $(HOME_PATH)/fuse
+OSD = $(HOME_PATH)/osd
 
 ifdef USE_GIT_TAG
 GIT_SHA = `$(GIT) rev-parse HEAD`
@@ -82,7 +83,7 @@ CFLAGS += -Wall -DCDATE="\"$(COMPILE_DATE)\"" -DGIT_SHA="\"$(GIT_SHA)\""\
 			-DCHOST="\"$(COMPILE_HOST)\"" -I$(INC_PATH) \
 			-I$(PYTHON_INC) -I$(API) -I$(BRANCH) \
 			-I$(LIB_PATH) -I$(MDS) -I$(MDSL) -I$(R2) \
-			-I$(FUSE) -D_GNU_SOURCE \
+			-I$(FUSE) -I$(OSD) -D_GNU_SOURCE \
 			-DHVFS_TRACING -DHVFS_DEBUG_MEMORY -DHVFS_DEBUG_LOCK_ \
 			-D_USE_SPINLOCK_ -DHVFS_DEBUG_LATENCY_ -DXNET_BLOCKING \
 			-DXNET_EAGER_WRITEV -DCPU_CORE=$(__CORES__) \
@@ -125,11 +126,11 @@ LFLAGS += -lrt -ldl -lpthread
 # Region for depend files
 TEST_MDS_SOURCE = cbht.c tx.c dh.c cmd_sender.c misc.c itbsplit.c \
                   itb_analyzer.c bitmapc.c embedpy.c ctrigger.c \
-                  split_tracing.c
+                  split_tracing.c perr.c
 TEST_MDSL_SOURCE = mdsl.c storage.c gc.c txg_viewer.c gc_data.c \
                    afixer.c bulktest.c
 TEST_XNET_SOURCE = xnet.c mds.c fpmds.c m2m.c xs.c ausplit.c mdsl.c client.c \
-					root.c r2cli.c amc.c cr.c client_lat.c bp.c
+					root.c r2cli.c amc.c cr.c client_lat.c bp.c osd.c
 TEST_FUSE_SOURCE = xattr.c microbench.c dbsearch.c statis.c
 
 ifdef USE_FUSE
@@ -142,18 +143,19 @@ MDS_AR_SOURCE = itb.c mds.c txg.c cbht.c tx.c prof.c conf.c dh.c xtable.c \
                 ddc.c scrub.c gossip.c capi.c ft.c trigger.c redo.c
 MDSL_AR_SOURCE = mdsl.c spool.c tcc.c dispatch.c m2ml.c prof.c storage.c \
 				 aio.c c2ml.c local.c gc.c ml2ml.c
+OSD_AR_SOURCE = osd.c spool.c dispatch.c storage.c prof.c 
 LIB_AR_SOURCE = lib.c ring.c time.c bitmap.c xlock.c segv.c conf.c md5.c \
-                embedpy.c minilzo.c brtree.c
+                embedpy.c minilzo.c brtree.c crc32.c
 XNET_AR_SOURCE = xnet.c xnet_simple.c
 R2_AR_SOURCE = mgr.c root.c spool.c x2r.c dispatch.c bparser.c cli.c \
-               profile.c
+               profile.c om.c
 API_AR_SOURCE = api.c
 BRANCH_AR_SOURCE = branch.c bp.c bdb.c
 
 INC_H_SOURCE = atomic.h err.h hvfs.h hvfs_common.h hvfs_const.h hvfs_k.h \
 				hvfs_u.h ite.h mds_api.h mdsl_api.h memory.h site.h tx.h \
 				tracing.h txg.h xhash.h xlist.h xlock.h xnet.h xtable.h \
-				xprof.h hvfs_addr.h profile.h
+				xprof.h hvfs_addr.h profile.h obj.h
 MDS_H_SOURCE = mds.h cbht.h dh.h itb.h prof.h async.h bitmapc.h mds_config.h \
 				ft.h redo.h
 MDSL_H_SOURCE = mdsl.h lprof.h mdsl_config.h
@@ -162,12 +164,14 @@ LIB_H_SOURCE = lib.h ring.h minilzo.h
 API_H_SOURCE = 
 BRANCH_H_SOURCE = branch.h bp.h bdb_dummy.h
 FUSE_H_SOURCE = pfs.h store.h
+OSD_H_SOURCE = osd.h osd_config.h lprof.h
 
 inc_h_depend_files = $(patsubst %.h, $(INC_PATH)/%.h, $(INC_H_SOURCE)) \
 						$(LIB_PATH)/hash.c
 mds_h_depend_files = $(patsubst %.h, $(MDS)/%.h, $(MDS_H_SOURCE)) \
                         $(MDS)/latency.c
 mdsl_h_depend_fils = $(patsubst %.h, $(MDSL)/%.h, $(MDSL_H_SOURCE))
+osd_h_depend_fils = $(patsubst %.h, $(OSD)/%.h, $(OSD_H_SOURCE))
 lib_h_depend_files = $(patsubst %.h, $(LIB_PATH)/%.h, $(LIB_H_SOURCE))
 r2_h_depend_files = $(patsubst %.h, $(R2)/%.h, $(R2_H_SOURCE))
 api_h_depend_files = $(patsubst %.h, $(API)/%.h, $(API_H_SOURCE))
@@ -186,6 +190,8 @@ mds_depend_files = $(patsubst %.c, $(MDS)/%.c, $(MDS_AR_SOURCE)) \
 					$(header_depend_files)
 mdsl_depend_files = $(patsubst %.c, $(MDSL)/%.c, $(MDSL_AR_SOURCE)) \
 					$(header_depend_files)
+osd_depend_files = $(patsubst %.c, $(OSD)/%.c, $(OSD_AR_SOURCE)) \
+					$(header_depend_files)
 lib_depend_files = $(patsubst %.c, $(LIB_PATH)/%.c, $(LIB_AR_SOURCE)) \
 					$(header_depend_files)
 xnet_depend_files = $(patsubst %.c, $(XNET)/%.c, $(XNET_AR_SOURCE)) \
@@ -208,6 +214,7 @@ LIB_SO = $(LIB_PATH)/libhvfs.so.1.0
 XNET_SO = $(XNET)/libxnet.so.1.0
 MDS_SO = $(MDS)/libmds.so.1.0
 MDSL_SO = $(MDSL)/libmdsl.so.1.0
+OSD_SO = $(OSD)/libosd.so.1.0
 R2_SO = $(R2)/libr2.so.1.0
 API_SO = $(API)/libapi.so.1.0
 BRANCH_SO = $(BRANCH)/libbranch.so.1.0
@@ -220,3 +227,4 @@ XNET_LIB = $(XNET)/libxnet.a
 API_LIB = $(API)/libapi.a
 BRANCH_LIB = $(BRANCH)/libbranch.a
 FUSE_LIB = $(FUSE)/libpfuse.a
+OSD_LIB = $(OSD)/libosd.a
diff --git a/README.markdown b/README.markdown
@@ -0,0 +1,91 @@
+# Pomegranate File System Documentation
+
+<a href="http://github.com/macan/Pomegranate"><img src="https://github.com/macan/macan.github.com/raw/master/png/Pomegranate_logo.png" /></a>
+
+It **is** a distributed file system, but **not only** a file system!
+
+[Wiki Page](http://github.com/macan/Pomegranate/wiki)
+
+## Introduction
+
+Pomegranate File System (abbr. PFS) was originally proposed for large-scale
+small file access. It contains many optimizations for small objects.
+
+* Automatic small file aggregation based on file system directory
+* Tabular directory model, supports metadata deduplication
+* Automatic migration of file creations in a cluster
+* Metadata store and small file data store are designed for flash devices
+* Supports POSIX and REST interfaces
+* Has C/Python bindings
+
+### Architecture
+
+To exploit fast storage devices (e.g. SSDs) to accelerate small file
+performance, PFS adopts a 3-tier storage architecture.
+
+The first tier is the **memory caching** layer, which is used for metadata
+caching to reduce metadata latency. Metadata latency has a significant impact
+on small file I/O latency. Decreasing metadata latency can efficiently improve
+small file performance.
+
+The second tier is the **flash caching** layer, which is used for durability
+of metadata and small data. Flash devices have lower I/O latency. Thus, they
+are suitable for small data access.
+
+The third tier is the **disk store** layer, which is designed for longer
+durability of all data. It uses data replication for data reliability and
+deduplication for efficient space consumption.
+
+### Tabular Directory Model
+
+In many Web 2.0 applications, objects (e.g. photos, videos, docs, ...) are
+saved in several different forms. For example, in a photo gallery web site,
+photos that are uploaded by users are transformed to several resolutions.
+These different object forms, derived from the same (original) object, contain
+almost the same metadata. Thus, if we save these different forms into
+different files, we would have much metadata duplication in the distributed
+file system. We define this issue as the **N-Form** issue.
+
+To overcome the above N-Form issue, we propose introducing a powerful
+directory model to the traditional file system. In PFS, we use a tabular
+directory model to keep file system metadata. With one file name, users can
+save many different object forms in different columns' cells. File metadata is
+a special table column of the directory table.
+
+By adopting the tabular directory model, the metadata duplication of the
+N-Form issue can be overcome. Besides this benefit, the new directory model
+groups file data that has the same property or usage purpose in the same
+column. Thus, we can do more efficient file placements and aggregations.
+
+### File Aggregation
+
+In Web 2.0 applications, objects are mainly small in size. For example, social
+network web pages contain many small-sized photos and short video
+segments. The typical size of these objects is less than 10MB. Many
+traditional distributed file systems are designed for HPC applications, which
+target large file I/O optimization. Thus, for small files, many of these
+I/O optimizations are **not** as efficient as they are for large files.
+
+To optimize small file I/O, we propose to do file aggregation based on the
+tabular directory model. For files that are in the same directory, we do file
+aggregation automatically. For each directory column, we generate an
+aggregated large file. File content is cached and then written sequentially
+to the low-level SSD. File aggregation can maximally utilize low-level I/O bandwidth.
+
+### Extendible Metadata Service
+
+There are so many objects to store in Web 2.0 applications. User-generated
+objects, such as uploaded photos, videos, and documents, are tremendous in
+number. To manage these massive numbers of objects in a file system, we need
+an expandable metadata service.
+
+In PFS, we exploit the extendible hash technology to distribute file metadata
+across many cache servers. Metadata can migrate from one server to another
+server when there are too many cached file entries. Cache servers can be
+added or removed at any time with little latency. File metadata is
+redistributed automatically on server changes.
+
+## Development Cycle
+
+A new OBJECT STORE LAYER for large files is under development.
+