diff --git a/Makefile b/Makefile index b529e77..351f5c6 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-09-17 00:31:18 macan> +# Time-stamp: <2012-08-10 13:57:59 macan> # # This is the makefile for HVFS project. # @@ -18,62 +18,68 @@ RING_SOURCES = $(LIB_PATH)/ring.c $(LIB_PATH)/lib.c $(LIB_PATH)/hash.c \ all : unit_test lib triggers $(HVFS_LIB) : $(lib_depend_files) - @echo -e " " CD"\t" $(LIB_PATH) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(LIB_PATH) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(LIB_PATH) -e "HOME_PATH=$(HOME_PATH)" $(MDS_LIB) : $(mds_depend_files) - @echo -e " " CD"\t" $(MDS) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(MDS) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(MDS) -e "HOME_PATH=$(HOME_PATH)" $(MDSL_LIB) : $(mdsl_depend_files) - @echo -e " " CD"\t" $(MDSL) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(MDSL) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(MDSL) -e "HOME_PATH=$(HOME_PATH)" +$(OSD_LIB) : $(osd_depend_files) + @$(ECHO) -e " " CD"\t" $(OSD) + @$(ECHO) -e " " MK"\t" $@ + @$(MAKE) --no-print-directory -C $(OSD) -e "HOME_PATH=$(HOME_PATH)" + $(R2_LIB) : $(r2_depend_files) - @echo -e " " CD"\t" $(R2) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(R2) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(R2) -e "HOME_PATH=$(HOME_PATH)" $(XNET_LIB) : $(xnet_depend_files) - @echo -e " " CD"\t" $(XNET) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(XNET) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(XNET) -e "HOME_PATH=$(HOME_PATH)" $(API_LIB) : $(api_depend_files) - @echo -e " " CD"\t" $(API) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(API) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(API) -e "HOME_PATH=$(HOME_PATH)" $(BRANCH_LIB) : $(branch_depend_files) - @echo -e " " CD"\t" $(BRANCH) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" 
$(BRANCH) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(BRANCH) -e "HOME_PATH=$(HOME_PATH)" ifdef USE_FUSE $(FUSE_LIB) : $(fuse_depend_files) - @echo -e " " CD"\t" $(FUSE) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(FUSE) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(FUSE) -e "HOME_PATH=$(HOME_PATH)" else $(FUSE_LIB) : $(fuse_depend_files) - @echo -e " " MK"\t" $@ " (Ignored! Use 'USE_FUSE=1' to enable fuse support.)" + @$(ECHO) -e " " MK"\t" $@ " (Ignored! Use 'USE_FUSE=1' to enable fuse support.)" endif triggers : $(triggers_depend_files) build_triggers - @echo "Triggers' dynamic library are ready." + @$(ECHO) "Triggers' dynamic library are ready." build_triggers : - @echo -e " " CD"\t" $(TRIGGERS) - @echo -e " " MK"\t" $@ + @$(ECHO) -e " " CD"\t" $(TRIGGERS) + @$(ECHO) -e " " MK"\t" $@ @$(MAKE) --no-print-directory -C $(TRIGGERS) -e "HOME_PATH=$(HOME_PATH)" clean : @$(MAKE) --no-print-directory -C $(LIB_PATH) -e "HOME_PATH=$(HOME_PATH)" clean @$(MAKE) --no-print-directory -C $(MDS) -e "HOME_PATH=$(HOME_PATH)" clean @$(MAKE) --no-print-directory -C $(MDSL) -e "HOME_PATH=$(HOME_PATH)" clean + @$(MAKE) --no-print-directory -C $(OSD) -e "HOME_PATH=$(HOME_PATH)" clean @$(MAKE) --no-print-directory -C $(R2) -e "HOME_PATH=$(HOME_PATH)" clean @$(MAKE) --no-print-directory -C $(API) -e "HOME_PATH=$(HOME_PATH)" clean @$(MAKE) --no-print-directory -C $(BRANCH) -e "HOME_PATH=$(HOME_PATH)" clean @@ -90,44 +96,44 @@ depclean: @$(MAKE) --no-print-directory -C $(TEST)/result -e "HOME_PATH=$(HOME_PATH)" clean help : - @echo "Environment Variables:" - @echo "" - @echo "1. USE_BDB if defined, compile w/ BerkeleyDB support;" - @echo " otherwise, use plain file." - @echo "" - @echo "2. DISABLE_PYTHON if defined, do not compile w/ Python C API." - @echo " otherwise, compile and link with libpython." - @echo "" - @echo "3. JEMALLOC Must defined w/ jemalloc install path prefix;" - @echo " otherwise, we can find the jemalloc lib path." 
- @echo "" - @echo "4. USE_FUSE if defined, link with libfuse;" - @echo " otherwise, ignore fuse client." - @echo "" - @echo "5. PYTHON_INC python include path" - @echo "" - @echo "6. BDB_HOME BerkeleyDB install path prefix." + @$(ECHO) "Environment Variables:" + @$(ECHO) "" + @$(ECHO) "1. USE_BDB if defined, compile w/ BerkeleyDB support;" + @$(ECHO) " otherwise, use plain file." + @$(ECHO) "" + @$(ECHO) "2. DISABLE_PYTHON if defined, do not compile w/ Python C API." + @$(ECHO) " otherwise, compile and link with libpython." + @$(ECHO) "" + @$(ECHO) "3. JEMALLOC Must defined w/ jemalloc install path prefix;" + @$(ECHO) " otherwise, we can find the jemalloc lib path." + @$(ECHO) "" + @$(ECHO) "4. USE_FUSE if defined, link with libfuse;" + @$(ECHO) " otherwise, ignore fuse client." + @$(ECHO) "" + @$(ECHO) "5. PYTHON_INC python include path" + @$(ECHO) "" + @$(ECHO) "6. BDB_HOME BerkeleyDB install path prefix." # Note: the following region is only for UNIT TESTing # region for unit test $(LIB_PATH)/ring : $(RING_SOURCES) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $^ -o $@ -DUNIT_TEST -lib : $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) - @echo -e " " Lib is ready. +lib : $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) $(OSD_LIB) + @$(ECHO) -e " " Lib is ready. 
unit_test : $(ut_depend_files) $(HVFS_LIB) $(MDS_LIB) $(XNET_LIB) \ - $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) - @echo -e " " CD"\t" $(TEST)/mds + $(MDSL_LIB) $(R2_LIB) $(API_LIB) $(BRANCH_LIB) $(FUSE_LIB) $(OSD_LIB) + @$(ECHO) -e " " CD"\t" $(TEST)/mds @$(MAKE) --no-print-directory -C $(TEST)/mds -e "HOME_PATH=$(HOME_PATH)" - @echo -e " " CD"\t" $(TEST)/xnet + @$(ECHO) -e " " CD"\t" $(TEST)/xnet @$(MAKE) --no-print-directory -C $(TEST)/xnet -e "HOME_PATH=$(HOME_PATH)" - @echo -e " " CD"\t" $(TEST)/mdsl + @$(ECHO) -e " " CD"\t" $(TEST)/mdsl @$(MAKE) --no-print-directory -C $(TEST)/mdsl -e "HOME_PATH=$(HOME_PATH)" - @echo -e " " CD"\t" $(TEST)/fuse + @$(ECHO) -e " " CD"\t" $(TEST)/fuse @$(MAKE) --no-print-directory -C $(TEST)/fuse -e "HOME_PATH=$(HOME_PATH)" - @echo "Targets for unit test are ready." + @$(ECHO) "Targets for unit test are ready." install: unit_test triggers @rsync -r $(TEST)/*.sh root@glnode09:~/hvfs/test/ @@ -141,7 +147,7 @@ install: unit_test triggers @rsync -r $(TEST)/fuse/*.ut root@glnode09:~/hvfs/test/fuse/ @rsync -r $(TEST)/bdb/* root@glnode09:~/hvfs/test/bdb/ @rsync -r $(TEST)/python/*.py root@glnode09:~/hvfs/test/python/ - @echo "Install done." + @$(ECHO) "Install done." xinstall: unit_test @rsync -r $(TEST)/*.sh root@10.10.104.1:/home/macan/test/ @@ -150,12 +156,12 @@ xinstall: unit_test @rsync -r $(TEST)/mds/*.ut root@10.10.104.1:/home/macan/test/mds/ @rsync -r $(TEST)/xnet/*.ut root@10.10.104.1:/home/macan/test/xnet/ @rsync -r $(TEST)/mdsl/*.ut root@10.10.104.1:/home/macan/test/mdsl/ - @echo "Install done." + @$(ECHO) "Install done." plot: - @echo -e "Ploting ..." + @$(ECHO) -e "Ploting ..." 
@$(MAKE) --no-print-directory -C $(TEST)/result -e "HOME_PATH=$(HOME_PATH)" plot - @echo -e "Done.\n" + @$(ECHO) -e "Done.\n" rut: @lagent -d glnode09 -u root -sc "time ~/cbht $(CBHT_ARGS)" diff --git a/Makefile.inc b/Makefile.inc index a97d197..a8895c6 100644 --- a/Makefile.inc +++ b/Makefile.inc @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2012-05-22 09:40:38 macan> +# Time-stamp: <2012-08-10 13:58:08 macan> # # This is the makefile for HVFS project. # @@ -13,7 +13,7 @@ LD = gcc AR = ar PYTHON = env python GIT = env git -ECHO = echo +ECHO = /bin/echo INC_PATH = $(HOME_PATH)/include LIB_PATH = $(HOME_PATH)/lib @@ -28,6 +28,7 @@ API = $(HOME_PATH)/api TRIGGERS = $(HOME_PATH)/triggers BRANCH = $(HOME_PATH)/branch FUSE = $(HOME_PATH)/fuse +OSD = $(HOME_PATH)/osd ifdef USE_GIT_TAG GIT_SHA = `$(GIT) rev-parse HEAD` @@ -82,7 +83,7 @@ CFLAGS += -Wall -DCDATE="\"$(COMPILE_DATE)\"" -DGIT_SHA="\"$(GIT_SHA)\""\ -DCHOST="\"$(COMPILE_HOST)\"" -I$(INC_PATH) \ -I$(PYTHON_INC) -I$(API) -I$(BRANCH) \ -I$(LIB_PATH) -I$(MDS) -I$(MDSL) -I$(R2) \ - -I$(FUSE) -D_GNU_SOURCE \ + -I$(FUSE) -I$(OSD) -D_GNU_SOURCE \ -DHVFS_TRACING -DHVFS_DEBUG_MEMORY -DHVFS_DEBUG_LOCK_ \ -D_USE_SPINLOCK_ -DHVFS_DEBUG_LATENCY_ -DXNET_BLOCKING \ -DXNET_EAGER_WRITEV -DCPU_CORE=$(__CORES__) \ @@ -125,11 +126,11 @@ LFLAGS += -lrt -ldl -lpthread # Region for depend files TEST_MDS_SOURCE = cbht.c tx.c dh.c cmd_sender.c misc.c itbsplit.c \ itb_analyzer.c bitmapc.c embedpy.c ctrigger.c \ - split_tracing.c + split_tracing.c perr.c TEST_MDSL_SOURCE = mdsl.c storage.c gc.c txg_viewer.c gc_data.c \ afixer.c bulktest.c TEST_XNET_SOURCE = xnet.c mds.c fpmds.c m2m.c xs.c ausplit.c mdsl.c client.c \ - root.c r2cli.c amc.c cr.c client_lat.c bp.c + root.c r2cli.c amc.c cr.c client_lat.c bp.c osd.c TEST_FUSE_SOURCE = xattr.c microbench.c dbsearch.c statis.c ifdef USE_FUSE @@ -142,18 +143,19 @@ MDS_AR_SOURCE = itb.c mds.c txg.c cbht.c tx.c prof.c conf.c dh.c xtable.c \ ddc.c scrub.c gossip.c capi.c ft.c 
trigger.c redo.c MDSL_AR_SOURCE = mdsl.c spool.c tcc.c dispatch.c m2ml.c prof.c storage.c \ aio.c c2ml.c local.c gc.c ml2ml.c +OSD_AR_SOURCE = osd.c spool.c dispatch.c storage.c prof.c LIB_AR_SOURCE = lib.c ring.c time.c bitmap.c xlock.c segv.c conf.c md5.c \ - embedpy.c minilzo.c brtree.c + embedpy.c minilzo.c brtree.c crc32.c XNET_AR_SOURCE = xnet.c xnet_simple.c R2_AR_SOURCE = mgr.c root.c spool.c x2r.c dispatch.c bparser.c cli.c \ - profile.c + profile.c om.c API_AR_SOURCE = api.c BRANCH_AR_SOURCE = branch.c bp.c bdb.c INC_H_SOURCE = atomic.h err.h hvfs.h hvfs_common.h hvfs_const.h hvfs_k.h \ hvfs_u.h ite.h mds_api.h mdsl_api.h memory.h site.h tx.h \ tracing.h txg.h xhash.h xlist.h xlock.h xnet.h xtable.h \ - xprof.h hvfs_addr.h profile.h + xprof.h hvfs_addr.h profile.h obj.h MDS_H_SOURCE = mds.h cbht.h dh.h itb.h prof.h async.h bitmapc.h mds_config.h \ ft.h redo.h MDSL_H_SOURCE = mdsl.h lprof.h mdsl_config.h @@ -162,12 +164,14 @@ LIB_H_SOURCE = lib.h ring.h minilzo.h API_H_SOURCE = BRANCH_H_SOURCE = branch.h bp.h bdb_dummy.h FUSE_H_SOURCE = pfs.h store.h +OSD_H_SOURCE = osd.h osd_config.h lprof.h inc_h_depend_files = $(patsubst %.h, $(INC_PATH)/%.h, $(INC_H_SOURCE)) \ $(LIB_PATH)/hash.c mds_h_depend_files = $(patsubst %.h, $(MDS)/%.h, $(MDS_H_SOURCE)) \ $(MDS)/latency.c mdsl_h_depend_fils = $(patsubst %.h, $(MDSL)/%.h, $(MDSL_H_SOURCE)) +osd_h_depend_fils = $(patsubst %.h, $(OSD)/%.h, $(OSD_H_SOURCE)) lib_h_depend_files = $(patsubst %.h, $(LIB_PATH)/%.h, $(LIB_H_SOURCE)) r2_h_depend_files = $(patsubst %.h, $(R2)/%.h, $(R2_H_SOURCE)) api_h_depend_files = $(patsubst %.h, $(API)/%.h, $(API_H_SOURCE)) @@ -186,6 +190,8 @@ mds_depend_files = $(patsubst %.c, $(MDS)/%.c, $(MDS_AR_SOURCE)) \ $(header_depend_files) mdsl_depend_files = $(patsubst %.c, $(MDSL)/%.c, $(MDSL_AR_SOURCE)) \ $(header_depend_files) +osd_depend_files = $(patsubst %.c, $(OSD)/%.c, $(OSD_AR_SOURCE)) \ + $(header_depend_files) lib_depend_files = $(patsubst %.c, $(LIB_PATH)/%.c, $(LIB_AR_SOURCE)) \ 
$(header_depend_files) xnet_depend_files = $(patsubst %.c, $(XNET)/%.c, $(XNET_AR_SOURCE)) \ @@ -208,6 +214,7 @@ LIB_SO = $(LIB_PATH)/libhvfs.so.1.0 XNET_SO = $(XNET)/libxnet.so.1.0 MDS_SO = $(MDS)/libmds.so.1.0 MDSL_SO = $(MDSL)/libmdsl.so.1.0 +OSD_SO = $(OSD)/libosd.so.1.0 R2_SO = $(R2)/libr2.so.1.0 API_SO = $(API)/libapi.so.1.0 BRANCH_SO = $(BRANCH)/libbranch.so.1.0 @@ -220,3 +227,4 @@ XNET_LIB = $(XNET)/libxnet.a API_LIB = $(API)/libapi.a BRANCH_LIB = $(BRANCH)/libbranch.a FUSE_LIB = $(FUSE)/libpfuse.a +OSD_LIB = $(OSD)/libosd.a \ No newline at end of file diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..fbd9be3 --- /dev/null +++ b/README.markdown @@ -0,0 +1,91 @@ +# Pomegranate File System Documentation + + + +It **is** a distributed file system, but **not only** a file system! + +[Wiki Page](http://github.com/macan/Pomegranate/wiki) + +## Introduction + +Pomegranate File System (abbr. PFS) was originally proposed for large scale +small file access. It contains many optimizations for small objects. + +* Automatic small file aggregation based on file system directory +* Tabular directory model, support metadata deduplication +* Automatic migrating file creations in a cluster +* Metadata store and small file data store are designed for flash devices +* Support POSIX, REST interface +* Has C/Python bindings + +### Architecture + +To exploit fast storage devices to accelerate small file performance, e.g. SSD, +PFS adopts a 3-tier storage architecture. + +The first tier is **memory caching** layer, which is used for metadata caching +to reduce metadata latency. Metadata latency has significant impacts on small +file I/O latency. Decreasing metadata latency can efficiently improve the small +file performance. + +The second tier is **flash caching** layer, which is used for durability of +metadata and small data. Flash device has lower I/O latency. Thus, it is +suitable for small data access. 
+ +The third tier is **disk store** layer, which is designed for longer +durability of all data. It uses data replication for data reliability and +deduplication for efficient space consumption. + +### Tabular Directory Model + +In many Web 2.0 applications, objects (e.g. photos, videos, docs, ...) are +saved in several different forms. For example, in a photo gallery web site, +photos uploaded by users are transformed to several resolutions. These +different object forms that are derived from the same (original) object contain +almost the same metadata. Thus, if we save these different forms into +different files, then we would have much metadata duplication in a distributed +file system. We define this issue as the **N-Form** issue. + +To overcome the above N-Form issue, we propose to introduce a powerful +directory model to the traditional file system. In PFS, we use tabular directory +model to keep file system metadata. With one file name, users can save many +different object forms in different columns' cells. File metadata is a special +table column of the directory table. + +By adopting tabular directory model, the metadata duplication of N-Form issue +can be overcome. Besides this benefit, the new directory model groups the +file data which has the same property or usage purpose in the same +column. Thus, we can do more efficient file placements and aggregations. + +### File Aggregation + +In Web 2.0 applications, objects are mainly in small size. For example, social +network web pages contain many small sized photos and short video +segments. The typical size of these objects is less than 10MB. Many +traditional distributed file systems are designed for HPC applications, which +target large file I/O optimization. Thus, for small files, many of these +I/O optimizations are **not** as efficient as they are for large files. + +To optimize small file I/O, we propose to do file aggregation based on tabular +directory model. 
For files that are in the same directory, we do file aggregations +automatically. For each directory column, we generate an aggregated large +file. File content is cached and then written sequentially to low level +SSD. File aggregation can maximally utilize low level I/O bandwidth. + +### Extendible Metadata Service + +There are so many objects to store in Web 2.0 applications. User generated +objects, such as uploaded photos, videos, documents, are tremendous in number. To +manage these massive objects in a file system means that we need an expandable +metadata service. + +In PFS, we exploit the extendible hash technology to distribute file metadata +across many cache servers. Metadata can migrate from one server to another +server when there are too many cached file entries. A cache server can be +added or removed at any time with little latency. File metadata is +redistributed automatically on server changes. + +## Development Cycle + +A new OBJECT STORE LAYER for large files is under development. + diff --git a/api/Makefile b/api/Makefile index 0247da3..3145b6c 100644 --- a/api/Makefile +++ b/api/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-07-21 19:07:27 macan> +# Time-stamp: <2012-08-10 13:54:58 macan> # # This is the makefile for HVFS project. # @@ -13,14 +13,14 @@ include ../Makefile.inc all : api_so %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -DUSE_XNET_SIMPLE -c \ $(patsubst %.c, $(API)/%.c, $<) api_so : $(API_AR_SOURCE:.c=.o) $(XNET_LIB) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libapi.a $(^:.c=.o) - @echo -e " " SL"\t" $(API_SO) + @$(ECHO) -e " " SL"\t" $(API_SO) @$(CC) -L$(XNET) -shared -Wl,-soname,libapi.so.1 -o $(LIB_PATH)/libapi.so.1.0 $(^:.c=.o) -lc -lrt -lpthread -lxnet clean : diff --git a/api/api.c b/api/api.c index 3a2a049..1d0996d 100644 --- a/api/api.c +++ b/api/api.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-16 17:21:48 macan> + * Time-stamp: <2012-08-10 15:20:04 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1009,9 +1009,9 @@ int __core_main(int argc, char *argv[]) }; int err = 0; int self = -1, sport = -1; - int thread = 1; + int __UNUSED__ thread = 1; int fsid = 1; /* default to fsid 1 for kv store */ - int use_branch = 0; + int __UNUSED__ use_branch = 0; int loop_reg = 0; char *r2_ip = NULL; char *type = NULL; @@ -3616,7 +3616,7 @@ char *hvfs_active_site(char *type) int hvfs_active_site_size(char *type) { struct xnet_group *xg = NULL; - u64 base; + u64 __UNUSED__ base; int err = 0; if (strncmp(type, "mdsl", 4) == 0) { diff --git a/bin/conf_gen.py b/bin/conf_gen.py index 1e0bd15..fd52adf 100755 --- a/bin/conf_gen.py +++ b/bin/conf_gen.py @@ -43,6 +43,12 @@ BP_IP_SUFFIX = os.getenv("BP_IP_SUFFIX", None) BP_PORT = os.getenv("BP_PORT", "7900") +# D. osd ip prefix +OSD_IP_PREFIX = os.getenv("OSD_IP_PREFIX", DEFAULT_IP_PREFIX) +# E. 
osd ip suffix set +OSD_IP_SUFFIX = os.getenv("OSD_IP_SUFFIX", None) +OSD_PORT = os.getenv("OSD_PORT", "7900") + def main(argv): # argv[1] is target file if len(argv) < 2: @@ -113,5 +119,15 @@ def main(argv): f.write(line + "\n") print line + # write osd region + if OSD_IP_SUFFIX != None: + sset = shlex.split(OSD_IP_SUFFIX) + id = 0 + for x in sset: + line = "osd:" + OSD_IP_PREFIX + x + ":" + OSD_PORT + ":" + str(id) + id += 1 + f.write(line + "\n") + print line + if __name__ == '__main__': main(sys.argv[0:]) diff --git a/bin/hvfs.sh b/bin/hvfs.sh index 1443a68..bf4834a 100755 --- a/bin/hvfs.sh +++ b/bin/hvfs.sh @@ -3,7 +3,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2012-05-18 15:48:24 macan> +# Time-stamp: <2012-09-03 13:10:39 macan> # # This is the mangement script for Pomegranate # @@ -126,6 +126,27 @@ else fi fi +# Construct the osd command line +if [ -e $HVFS_HOME/conf/osd.conf ]; then + # Using the config file + if [ "x$MODE" == "xfs" ]; then + ARGS=`cat $HVFS_HOME/conf/osd.conf | grep -v "^ *#" | grep -v "^$" | grep -v "fsid="` + OSD_CMD="LOG_DIR=$LOG_DIR fsid=0 "`echo $ARGS` + elif [ "x$MODE" == "xkv" ]; then + ARGS=`cat $HVFS_HOME/conf/osd.conf | grep -v "^ *#" | grep -v "^$" | grep -v "fsid="` + OSD_CMD="LOG_DIR=$LOG_DIR fsid=1 "`echo $ARGS` + else + ARGS=`cat $HVFS_HOME/conf/osd.conf | grep -v "^ *#" | grep -v "^$"` + OSD_CMD="LOG_DIR=$LOG_DIR "`echo $ARGS` + fi +else + if [ "x$MODE" == "xfs" ]; then + OSD_CMD="fsid=0 mode=1 hvfs_osd_prof_plot=1 hvfs_osd_opt_write_drop=0" + else + OSD_CMD="fsid=1 mode=1 hvfs_osd_prof_plot=1 hvfs_osd_opt_write_drop=0" + fi +fi + CLIENT_CMD="" ipnr=`cat $CONFIG_FILE | grep "r2:" | awk -F: '{print $2":"$4}'` @@ -144,6 +165,12 @@ function adjust_syn() { $SSH $UN$ip "$SYSCTL_ADJ_SYN" > /dev/null & done echo "Adjust SYN on MDSL server done." 
+ ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4":"$3}'` + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + $SSH $UN$ip "$SYSCTL_ADJ_SYN" > /dev/null & + done + echo "Adjust SYN on OSD server done." ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4":"$3}'` for x in $ipnr; do ip=`echo $x | awk -F: '{print $1}'` @@ -174,6 +201,28 @@ function start_mdsl() { fi } +function start_osd() { + if [ "x$1" == "x" ]; then + ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4":"$3}'` + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + port=`echo $x | awk -F: '{print $3}'` + $SSH $UN$ip "$OSD_CMD $HVFS_HOME/test/xnet/osd.ut $id $R2IP $port > $LOG_DIR/osd.$id.log" > /dev/null & + done + echo "Start OSD server done." + else + ipnr=`cat $CONFIG_FILE | grep "osd:.*:$1\$" | awk -F: '{print $2":"$4":"$3}'` + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + port=`echo $x | awk -F: '{print $3}'` + $SSH $UN$ip "$OSD_CMD $HVFS_HOME/test/xnet/osd.ut $id $R2IP $port > $LOG_DIR/osd.$id.log" > /dev/null & + echo "Start OSD server $id done." + done + fi +} + function start_mds() { if [ "x$1" == "x" ]; then ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4":"$3}'` @@ -276,6 +325,18 @@ function check_mdsl() { done } +function check_osd() { + ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4}'` + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + R=`$SSH $UN$ip "cat $LOG_DIR/osd.$id.log | grep UP"` + if [ "x$R" == "x" ]; then + echo "OSD $id is not alive, please check it!" 
+ fi + done +} + function check_mds() { ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4}'` for x in $ipnr; do @@ -317,6 +378,7 @@ function check_all() { check_root check_mds check_mdsl + check_osd } function stop_mdsl() { @@ -335,6 +397,22 @@ function stop_mdsl() { sleep 5 } +function stop_osd() { + if [ "x$1" == "x" ]; then + ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4}'` + else + ipnr=`cat $CONFIG_FILE | grep "osd:.*:$1\$" | awk -F: '{print $2":"$4}'` + fi + + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + PID=`$SSH $UN$ip "ps aux" | grep "osd.ut $id" | grep -v bash | grep -v ssh | grep -v expect | grep -v grep | awk '{print $2}'` # keep only the PID column of the matching ps rows + $SSH $UN$ip "kill -s SIGHUP $PID 2>&1 > /dev/null" > /dev/null + done + sleep 5 +} + function stop_mds() { if [ "x$1" == "x" ]; then ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4}'` @@ -398,6 +476,22 @@ function kill_mdsl() { sleep 5 } +function kill_osd() { + if [ "x$1" == "x" ]; then + ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4}'` + else + ipnr=`cat $CONFIG_FILE | grep "osd:.*:$1\$" | awk -F: '{print $2":"$4}'` + fi + + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + PID=`$SSH $UN$ip "ps aux" | grep "osd.ut $id" | grep -v bash | grep -v ssh | grep -v expect | grep -v grep | awk '{print $2}'` # keep only the PID column of the matching ps rows + $SSH $UN$ip "kill -9 $PID 2>&1 > /dev/null" > /dev/null + done + sleep 5 +} + function kill_mds() { if [ "x$1" == "x" ]; then ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4}'` @@ -449,15 +543,18 @@ function start_all() { start_root start_mdsl start_mds + start_osd } function stop_all() { + stop_osd stop_mds stop_mdsl stop_root } function kill_all() { + kill_osd kill_mdsl kill_mds kill_root @@ -471,6 +568,7 @@ function do_clean() { id=`echo $x | awk -F: '{print $2}'` $SSH $UN$ip "rm -rf /tmp/hvfs/4*" > /dev/null $SSH $UN$ip "rm -rf /tmp/hvfs/6*" > /dev/null + $SSH $UN$ip "rm -rf /tmp/hvfs/a*" > 
/dev/null $SSH $UN$ip "rm -rf /tmp/hvfs/bp" > /dev/null $SSH $UN$ip "rm -rf /tmp/hvfs/*_store" > /dev/null $SSH $UN$ip "rm -rf /tmp/.MDS.DCONF.*" > /dev/null @@ -492,6 +590,21 @@ function stat_mdsl() { done } +function stat_osd() { + echo "----------OSD-----------" + ipnr=`cat $CONFIG_FILE | grep "osd:" | awk -F: '{print $2":"$4}'` + for x in $ipnr; do + ip=`echo $x | awk -F: '{print $1}'` + id=`echo $x | awk -F: '{print $2}'` + NR=`$SSH $UN$ip "ps aux" | grep "osd.ut $id" | grep -v bash | grep -v ssh | grep -v expect | grep -v grep | wc -l` + if [ "x$NR" == "x1" ]; then + echo "OSD $id is running." + else + echo "OSD $id is gone." + fi + done +} + function stat_mds() { echo "----------MDS----------" ipnr=`cat $CONFIG_FILE | grep "mds:" | awk -F: '{print $2":"$4}'` @@ -598,6 +711,7 @@ function repeat_ut() { RND=1 while true; do + sleep 5 stat_client > rut.log RES=`cat rut.log | grep running` if [ "x$RES" == "x" ]; then @@ -609,7 +723,6 @@ function repeat_ut() { let RND+=1 do_ut fi - sleep 5 done rm -rf rut.log } @@ -618,6 +731,7 @@ function do_status() { echo "Checking servers' status ..." 
stat_mdsl stat_mds + stat_osd stat_root } @@ -781,14 +895,16 @@ function pview() { exec $SSH $UN$R2IP tail -f $LOG_DIR/CP-BACK-root.0.mds elif [ "x$1" == "xmdsl" ]; then exec $SSH $UN$R2IP tail -f $LOG_DIR/CP-BACK-root.0.mdsl + elif [ "x$1" == "xosd" ]; then + exec $SSH $UN$R2IP tail -f $LOG_DIR/CP-BACK-root.0.osd fi } function do_help() { - echo "Version 1.0.0b" - echo "Copyright (c) 2010 Can Ma " + echo "Version 1.0.1b" + echo "Copyright (c) 2010-2012 Can Ma " echo "" - echo "Usage: hvfs.sh [start|stop|kill|check] [mds|mdsl|r2|bp|all] [id]" + echo "Usage: hvfs.sh [start|stop|kill|check] [mds|mdsl|osd|r2|bp|all] [id]" echo " [clean|stat]" echo " [ut|kut|sut]" echo " [mount|umount|ml]" @@ -807,7 +923,8 @@ function do_help() { echo " mount mount the fuse client" echo " umount umount the fuse client" echo " ml list the mounted pfs entry" - echo " pview [mds|" + echo " pview [mds |" + echo " osd |" echo " mdsl] view the aggregated R2 log" echo "" echo "Environments:" @@ -842,6 +959,8 @@ if [ "x$1" == "xstart" ]; then start_mds $3 elif [ "x$2" == "xmdsl" ]; then start_mdsl $3 + elif [ "x$2" == "xosd" ]; then + start_osd $3 elif [ "x$2" == "xr2" ]; then start_root $3 elif [ "x$2" == "xbp" ]; then @@ -854,6 +973,8 @@ elif [ "x$1" == "xstop" ]; then stop_mds $3 elif [ "x$2" == "xmdsl" ]; then stop_mdsl $3 + elif [ "x$2" == "xosd" ]; then + stop_osd $3 elif [ "x$2" == "xr2" ]; then stop_root $3 elif [ "x$2" == "xbp" ]; then @@ -866,6 +987,8 @@ elif [ "x$1" == "xkill" ]; then kill_mds elif [ "x$2" == "xmdsl" ]; then kill_mdsl + elif [ "x$2" == "xosd" ]; then + kill_osd elif [ "x$2" == "xr2" ]; then kill_root elif [ "x$2" == "xbp" ]; then @@ -878,6 +1001,8 @@ elif [ "x$1" == "xcheck" ]; then check_mds elif [ "x$2" == "xmdsl" ]; then check_mdsl + elif [ "x$2" == "xosd" ]; then + check_osd elif [ "x$2" == "xr2" ]; then check_root elif [ "x$2" == "xbp" ]; then @@ -891,13 +1016,13 @@ elif [ "x$1" == "xut" ]; then do_ut while true; do + sleep 5 stat_client > ut.log RES=`cat 
ut.log | grep running` if [ "x$RES" == "x" ]; then gather_rps break; fi - sleep 5 done rm -rf ut.log elif [ "x$1" == "xkut" ]; then diff --git a/branch/Makefile b/branch/Makefile index 337c91f..0b661a7 100644 --- a/branch/Makefile +++ b/branch/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2012-05-22 11:03:59 macan> +# Time-stamp: <2012-08-10 13:58:38 macan> # # This is the makefile for HVFS project. # @@ -13,14 +13,14 @@ include ../Makefile.inc all : branch_so %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -DUSE_XNET_SIMPLE -c \ $(patsubst %.c, $(BRANCH)/%.c, $<) branch_so : $(BRANCH_AR_SOURCE:.c=.o) $(XNET_LIB) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libbranch.a $(^:.c=.o) - @echo -e " " SL"\t" $(BRANCH_SO) + @$(ECHO) -e " " SL"\t" $(BRANCH_SO) @$(CC) -shared -Wl,-soname,libbranch.so.1 -o $(LIB_PATH)/libbranch.so.1.0 $(^:.c=.o) -lc -lrt -lpthread $(BDBFLAGS) -L$(XNET) -lxnet clean : diff --git a/branch/branch.c b/branch/branch.c index 2de3339..1989c9a 100644 --- a/branch/branch.c +++ b/branch/branch.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-16 17:22:14 macan> + * Time-stamp: <2012-08-10 15:25:47 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -703,7 +703,7 @@ static inline struct xnet_group *__get_active_site(struct chring *r) { struct xnet_group *xg = NULL; - int i, err; + int i, __UNUSED__ err; for (i = 0; i < r->used; i++) { err = xnet_group_add(&xg, r->array[i].site_id); diff --git a/conf/mds.conf b/conf/mds.conf index 231eaa3..9f7e20c 100644 --- a/conf/mds.conf +++ b/conf/mds.conf @@ -53,7 +53,7 @@ hvfs_mds_mpcheck_sensitive=3 hvfs_mds_opt_mdzip=1 # Send plot info to R2: 0/1/2/3 => NONE/PLOT(default)/HUMAN/R2 -hvfs_mds_prof_plot=1 +hvfs_mds_prof_plot=3 # Redo replicas number -hvfs_mds_redo_replicas=2 +hvfs_mds_redo_replicas=1 diff --git a/conf/mdsl.conf b/conf/mdsl.conf index 4e3eb52..c61a698 100644 --- a/conf/mdsl.conf +++ b/conf/mdsl.conf @@ -12,7 +12,7 @@ fsid=0 mode=1 # Interval to do heartbeat (default 60s). -#hvfs_mds_hb_interval=10 +#hvfs_mdsl_hb_interval=10 # Drop all the write-backs (default disabled) hvfs_mdsl_opt_write_drop=0 diff --git a/conf/osd.conf b/conf/osd.conf new file mode 100644 index 0000000..cfbaa88 --- /dev/null +++ b/conf/osd.conf @@ -0,0 +1,16 @@ +# Pomegranate file system OSD server config file + +# File system id +# +# 0: used by the client.ut for unit test, you can use this id for fs access +# 1: used by key/value store +fsid=0 + +# Interval to do heartbeat (default 60s). 
+#hvfs_osd_hb_interval=10 + +# Drop all the write-backs (default disabled) +hvfs_osd_opt_write_drop=0 + +# Send plot info to R2: 0/1/2/3 => NONE/PLOT(default)/HUMAN/R2 +hvfs_osd_prof_plot=3 diff --git a/conf/single.conf b/conf/single.conf new file mode 100644 index 0000000..2cd8a03 --- /dev/null +++ b/conf/single.conf @@ -0,0 +1,13 @@ +# HVFS config file + +mds:127.0.0.1:8210:0 + +mdsl:127.0.0.1:8810:0 + +osd:127.0.0.1:9200:0 + +r2:127.0.0.1:8710:0 + +client:127.0.0.1:8412:0 + +amc:127.0.0.1:9001:0 diff --git a/conf/ut.conf b/conf/ut.conf index 1514ec4..6785ccf 100644 --- a/conf/ut.conf +++ b/conf/ut.conf @@ -24,4 +24,4 @@ entry=200000 # how many clients do you want to start? # if nr is -1, it means to start all the clients in conf/hvfs.conf file. -nr=-1 \ No newline at end of file +nr=-1 diff --git a/README b/doc/README similarity index 100% rename from README rename to doc/README diff --git a/fuse/Makefile b/fuse/Makefile index 4f3fd6e..fa71c40 100644 --- a/fuse/Makefile +++ b/fuse/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-06-21 11:41:30 macan> +# Time-stamp: <2012-08-10 13:58:52 macan> # # This is the makefile for HVFS project. # @@ -13,14 +13,14 @@ include ../Makefile.inc all : fuse_lib %.o : %.c $(fuse_h_depend_files) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(FUSE)/%.c, $<) fuse_lib : $(FUSE_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libpfuse.a $(^:.c=.o) clean : - -@echo "Clean FUSE client, be careful to add USE_FUSE=1 to make clean!" + -@$(ECHO) "Clean FUSE client, be careful to add USE_FUSE=1 to make clean!" -@rm -rf $(FUSE_AR_SOURCE:.c=.o) -@rm -rf libpfuse.a diff --git a/fuse/fc.c b/fuse/fc.c index 05b1f2a..55a442f 100644 --- a/fuse/fc.c +++ b/fuse/fc.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-06-21 10:20:14 macan> + * Time-stamp: <2012-08-10 15:30:09 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1618,7 +1618,7 @@ static int hvfs_mkdir(const char *pathname, mode_t mode) struct mdu_update mu; char *dup = strdup(pathname), *path, *name, *spath = NULL; char *p = NULL, *n, *s = NULL; - u64 puuid = hmi.root_uuid, psalt = hmi.root_salt, duuid; + u64 puuid = hmi.root_uuid, psalt = hmi.root_salt, __UNUSED__ duuid; u32 mdu_flags = 0; int err = 0; @@ -1733,8 +1733,9 @@ static int hvfs_unlink(const char *pathname) struct hstat hs = {0,}; char *dup = strdup(pathname), *path, *name, *spath = NULL; char *p = NULL, *n, *s = NULL; - u64 saved_puuid = hmi.root_uuid, saved_psalt = hmi.root_salt; - u64 saved_hash = 0; + u64 __UNUSED__ saved_puuid = hmi.root_uuid, + __UNUSED__ saved_psalt = hmi.root_salt, + __UNUSED__ saved_hash = 0; u64 puuid = hmi.root_uuid, psalt = hmi.root_salt; u32 mdu_flags = 0; int err = 0; @@ -1816,8 +1817,9 @@ static int hvfs_rmdir(const char *pathname) struct hstat hs = {0,}; char *dup = strdup(pathname), *path, *name, *spath = NULL; char *p = NULL, *n, *s = NULL; - u64 saved_puuid = hmi.root_uuid, saved_psalt = hmi.root_salt; - u64 saved_hash = 0; + u64 __UNUSED__ saved_puuid = hmi.root_uuid, + __UNUSED__ saved_psalt = hmi.root_salt, + __UNUSED__ saved_hash = 0; u64 puuid = hmi.root_uuid, psalt = hmi.root_salt; u32 mdu_flags = 0; int err = 0; @@ -2933,8 +2935,9 @@ static int hvfs_truncate(const char *pathname, off_t size) struct mdu_update mu = {.valid = 0,}; char *dup = strdup(pathname), *path, *name; char *p = NULL, *n, *s = NULL; - u64 saved_puuid = hmi.root_uuid, saved_psalt = hmi.root_salt; - u64 saved_hash = 0; + u64 __UNUSED__ saved_puuid = hmi.root_uuid, + __UNUSED__ saved_psalt = hmi.root_salt, + __UNUSED__ saved_hash = 0; u64 puuid = hmi.root_uuid, psalt = hmi.root_salt; ssize_t rlen; int err = 0; @@ -5329,9 
+5332,9 @@ ssize_t __hvfs_xattr_tag_test(char *key, char *p, char **s, char *value, size_t size) { ssize_t err = 0; - char *B_name = NULL; - char *buf = NULL, *ukey = NULL; - u8 no_B = 0; + char __UNUSED__ *B_name = NULL; + char *buf = NULL, __UNUSED__ *ukey = NULL; + u8 __UNUSED__ no_B = 0; /* get B */ HVFS_XATTR_NT(key, p, s, err, out); @@ -5436,7 +5439,7 @@ ssize_t __hvfs_xattr_tag_search(char *key, char *p, char **s, ssize_t err = 0; char *B_name = NULL; char *buf = NULL, *dbname = NULL, *prefix = NULL, *sexpr = NULL; - u8 no_B = 0; + u8 __UNUSED__ no_B = 0; /* get B */ HVFS_XATTR_NT(key, p, s, err, out); diff --git a/include/hvfs.h b/include/hvfs.h index 58fffd7..f64fccb 100644 --- a/include/hvfs.h +++ b/include/hvfs.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-07-23 17:33:22 macan> + * Time-stamp: <2012-08-09 13:11:46 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -160,6 +160,7 @@ #define HVFS_R2_SHUTDOWN 0x0000000040000022 /* shutdown site */ #define HVFS_R2_PROFILE 0x0000000040000023 /* gather profile */ #define HVFS_R2_INFO 0x0000000040000024 /* get info */ +#define HVFS_R2_OREP 0x0000000040000025 /* object report */ /* ROOT/RING to * */ #define HVFS_FR2_RU 0x0000000041000000 /* ring updates to all @@ -167,6 +168,13 @@ #define HVFS_FR2_AU 0x0000000042000000 /* address table updates to * all sites */ +/* * to OSD */ +#define HVFS_OSD_READ 0x0000000010000001 /* object read */ +#define HVFS_OSD_WRITE 0x0000000010000002 /* object write */ +#define HVFS_OSD_SYNC 0x0000000010000003 /* object sync */ +#define HVFS_OSD_STATFS 0x0000000010000004 /* stat whole server */ +#define HVFS_OSD_QUERY 0x0000000010000005 /* query on specific object */ + /* APIs */ #define HASH_SEL_EH 0x00 #define HASH_SEL_CBHT 0x01 diff --git a/include/hvfs_addr.h b/include/hvfs_addr.h index c4f0bde..6df20b8 100644 --- a/include/hvfs_addr.h +++ b/include/hvfs_addr.h 
@@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-03-02 13:12:22 macan> + * Time-stamp: <2012-08-14 09:59:36 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -150,6 +150,19 @@ struct hvfs_mdsl_info atomic64_t mi_bid; /* next branch id */ }; +struct hvfs_osd_info +{ + u32 state; + u32 group; + u64 gdt_salt; /* just a magic to test */ + u64 root_salt; /* just a magic to test */ + atomic64_t active; /* total active objects */ + atomic64_t mi_bused; /* used bytes */ + atomic64_t mi_bfree; /* free bytes */ + atomic64_t mi_bwrite; /* bytes totally written(for data) */ + atomic64_t mi_bread; /* bytes totally read(for data) */ +}; + /* * Note: we just saving the data region to the storage, ourself do not * interpret it. @@ -162,6 +175,7 @@ union hvfs_x_info struct hvfs_mdsl_info hmli; struct hvfs_client_info hci; struct hvfs_amc_info ami; + struct hvfs_osd_info hoi; }; /* please refer to r2/mgr.h struct root, this is a mirror of that structure */ diff --git a/include/hvfs_common.h b/include/hvfs_common.h index eb9263d..6f76e6d 100644 --- a/include/hvfs_common.h +++ b/include/hvfs_common.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-06-15 02:35:22 macan> + * Time-stamp: <2012-08-09 10:56:16 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -44,6 +44,14 @@ struct rename_reloc u64 psalt; /* the original psalt */ }; +/* the object info */ +struct obj_info +{ + u32 prefer_bs; /* prefered block size */ + u32 objnr; /* # of objs */ + u64 pad; +}; + /* the HVFS metadata unit */ struct mdu { @@ -96,6 +104,7 @@ struct mdu struct llfs_ref lr; char symname[16]; struct rename_reloc rr; + struct obj_info oi; }; }; diff --git a/include/hvfs_const.h b/include/hvfs_const.h index 797bdf2..77a9143 100644 --- a/include/hvfs_const.h +++ b/include/hvfs_const.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-08-23 12:06:33 macan> + * Time-stamp: <2012-08-07 14:24:53 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -37,6 +37,7 @@ #define MDS_DCONF_MAX_NAME_LEN 64 #define MDSL_DCONF_MAX_NAME_LEN MDS_DCONF_MAX_NAME_LEN #define ROOT_DCONF_MAX_NAME_LEN MDS_DCONF_MAX_NAME_LEN +#define OSD_DCONF_MAX_NAME_LEN MDS_DCONF_MAX_NAME_LEN #define HVFS_RING_VID_MAX 256 #define HVFS_GDT_BITMAP_COLUMN 0 /* default bitmap data column in GDT dir */ diff --git a/include/mds_api.h b/include/mds_api.h index 446ac1b..f2d0585 100644 --- a/include/mds_api.h +++ b/include/mds_api.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-08-17 11:32:21 macan> + * Time-stamp: <2012-08-09 11:00:10 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -82,6 +82,7 @@ struct hvfs_index #define INDEX_CREATE_GDT 0x00020000 /* create the gdt entry */ #define INDEX_CREATE_LARGE 0x00800000 /* create large file */ +#define INDEX_CREATE_OBJ 0x08000000 /* create large object file */ #define INDEX_CREATE_SMALL 0x00040000 /* create small file */ #define INDEX_CREATE_KV 0x00080000 /* create kv file */ @@ -206,6 +207,7 @@ struct mdu_update * linkadd operation */ #define MU_DEV (1 << 15) #define MU_SRR (1 << 16) /* suppress rename relocation fix */ +#define MU_OI (1 << 17) /* set object info */ u64 atime; u64 mtime; diff --git a/include/obj.h b/include/obj.h new file mode 100644 index 0000000..203e364 --- /dev/null +++ b/include/obj.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2012 Ma Can + * + * + * Armed with EMACS. + * Time-stamp: <2012-08-14 10:47:02 macan> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __HVFS_OSD_H__ +#define __HVFS_OSD_H__ + +struct objid +{ + u64 uuid; + u32 bid; + u32 len; +}; + +#define OBJID_EQUAL(a, b) ({ \ + int __res; \ + if ((a).uuid == (b).uuid && \ + (a).bid == (b).bid) \ + __res = 1; \ + else \ + __res = 0; \ + __res; \ + }) + +struct osd_list +{ + int size; + u64 site[0]; +}; + +/* OM type in om_init() */ +#define HVFS_OM_TYPE_MASTER 0x01 +#define HVFS_OM_TYPE_BACKUP 0x02 + +/* Object report */ +struct obj_report_tx +{ + /* + * if add_size < 0, then replace the old objids w/ current array + */ + int add_size, rmv_size; + struct objid ids[0]; +}; + +#endif diff --git a/include/site.h b/include/site.h index 4f6c138..899fdce 100644 --- a/include/site.h +++ b/include/site.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2010-12-03 16:53:45 macan> + * Time-stamp: <2012-08-05 11:17:38 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,6 +38,7 @@ #define HVFS_SITE_TYPE_RING 0x04 #define HVFS_SITE_TYPE_ROOT 0x04 /* Note that, RING and ROOT are the * same server now. 
*/ +#define HVFS_SITE_TYPE_OSD 0x05 #define HVFS_SITE_TYPE_AMC 0x06 /* another metadata client */ #define HVFS_SITE_TYPE_BP 0x07 /* branch processor */ @@ -65,6 +66,9 @@ #define HVFS_IS_BP(site) (((site & HVFS_SITE_TYPE_MASK) >> 17) == \ HVFS_SITE_TYPE_BP) +#define HVFS_IS_OSD(site) (((site & HVFS_SITE_TYPE_MASK) >> 17) == \ + HVFS_SITE_TYPE_OSD) + #define HVFS_SITE_N_MASK ((1 << 17) - 1) #define HVFS_CLIENT(n) ((HVFS_SITE_TYPE_CLIENT << 17) | (n & HVFS_SITE_N_MASK)) @@ -80,4 +84,7 @@ #define HVFS_AMC(n) ((HVFS_SITE_TYPE_AMC << 17) | (n & HVFS_SITE_N_MASK)) #define HVFS_BP(n) ((HVFS_SITE_TYPE_BP << 17) | (n & HVFS_SITE_N_MASK)) + +#define HVFS_OSD(n) ((HVFS_SITE_TYPE_OSD << 17) | (n & HVFS_SITE_N_MASK)) + #endif diff --git a/include/xnet.h b/include/xnet.h index 963ea16..6f1cd2f 100644 --- a/include/xnet.h +++ b/include/xnet.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-08-18 11:02:40 macan> + * Time-stamp: <2012-08-06 15:05:44 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,7 +33,7 @@ struct xnet_msg_tx #define XNET_MSG_NOP 0 #define XNET_MSG_REQ 1 #define XNET_MSG_RPY 2 -#define XNET_MSG_CMD 3 +#define XNET_MSG_CMD 3 /* this is a local marker in recv queue */ #define XNET_MSG_HELLO 4 #define XNET_MSG_HELLO_ACK 5 u8 type; /* msg type */ @@ -303,6 +303,24 @@ int xnet_group_add(struct xnet_group **xg, u64 site_id) return err; } +static int __xg_compare(const void *a, const void *b) +{ + return ((struct xnet_group_entry *)a)->site_id < + ((struct xnet_group_entry *)b)->site_id ? -1 : + (((struct xnet_group_entry *)a)->site_id > + ((struct xnet_group_entry *)b)->site_id ? 
1 : 0); +} + +static inline +void xnet_group_sort(struct xnet_group *xg) +{ + if (!xg) + return; + + qsort(xg->sites, xg->asize, sizeof(struct xnet_group_entry), + __xg_compare); +} + /* Profiling Section */ extern struct xnet_prof g_xnet_prof; extern struct xnet_conf g_xnet_conf; diff --git a/lib/Makefile b/lib/Makefile index e0b8f32..92397fe 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2012-05-18 12:41:08 macan> +# Time-stamp: <2012-08-10 13:59:08 macan> # # This is the makefile for HVFS project. # @@ -13,13 +13,13 @@ include ../Makefile.inc all : hvfs_lib %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(LIB_PATH)/%.c, $<) hvfs_lib : $(LIB_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libhvfs.a $(^:.c=.o) - @echo -e " " SL"\t" $(LIB_SO) + @$(ECHO) -e " " SL"\t" $(LIB_SO) @$(CC) -shared -Wl,-soname,libhvfs.so.1 -o $(LIB_PATH)/libhvfs.so.1.0 $(^:.c=.o) -lc -lrt -lpthread clean : diff --git a/lib/conf.c b/lib/conf.c index 213f492..14b8de0 100644 --- a/lib/conf.c +++ b/lib/conf.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-03-09 16:17:21 macan>
+ * Time-stamp: <2012-08-07 16:15:10 macan>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -55,6 +55,8 @@ int get_site(parser_state_t *ps, char *line,
         *type = "mds";
     } else if (strcmp(str, "client") == 0) {
         *type = "client";
+    } else if (strcmp(str, "osd") == 0) {
+        *type = "osd";
     } else if (strcmp(str, "amc") == 0) {
         *type = "amc";
     } else if (strcmp(str, "bp") == 0) {
@@ -123,6 +125,8 @@ u64 conf_site_id(char *type, int id)
         site_id = HVFS_RING(id);
     } else if (strcmp(type, "client") == 0) {
         site_id = HVFS_CLIENT(id);
+    } else if (strcmp(type, "osd") == 0) {
+        site_id = HVFS_OSD(id);
     } else if (strcmp(type, "amc") == 0) {
         site_id = HVFS_AMC(id);
     } else if (strcmp(type, "bp") == 0) {
diff --git a/lib/lib.c b/lib/lib.c
index 1fc11fd..34e28d1 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -3,7 +3,7 @@
  *
  *
  * Armed with EMACS.
- * Time-stamp: <2011-06-29 02:42:35 macan>
+ * Time-stamp: <2012-08-10 14:05:02 macan>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -65,7 +65,11 @@ void lib_init(void)
                   strerror(errno), errno);
         return;
     }
-    fscanf(fp, "%ld", &cpu_frequency);
+    if (fscanf(fp, "%ld", &cpu_frequency) < 0) {
+        /* close the stream before bailing out, do not leak it */
+        pclose(fp);
+        return;
+    }
     pclose(fp);
     if (!cpu_frequency)
         cpu_frequency = 2000;
@@ -267,8 +271,10 @@ void lib_backtrace(void)
                 hvfs_info(lib, "%s\n", bts[i]);
                 continue;
             } else {
-                fscanf(fp, "%s", str);
-                hvfs_info(lib, "%s %s\n", bts[i], str);
+                if (fscanf(fp, "%s", str) > 0)
+                    hvfs_info(lib, "%s %s\n", bts[i], str);
+                else
+                    hvfs_info(lib, "%s %s\n", bts[i], "Unknown");
             }
             pclose(fp);
         } else {
diff --git a/lib/ring.c b/lib/ring.c
index 1619883..be8e455 100644
--- a/lib/ring.c
+++ b/lib/ring.c
@@ -3,7 +3,7 @@
  *
  *
  * Armed with EMACS.
- * Time-stamp: <2011-10-27 05:55:45 macan> + * Time-stamp: <2012-08-10 15:13:50 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -327,7 +327,7 @@ void ring_stat(struct chring *r, int nr) last_point = r->array[i].point; } for (i = 0; i < nr; i++) { - hvfs_info(lib, "rglen[%d] = %ld %.2f\\%\n", i, rglen[i], + hvfs_info(lib, "rglen[%d] = %ld %.2f%%\n", i, rglen[i], (double)rglen[i] / 0xffffffffffffffff * 100); } } diff --git a/mds/Makefile b/mds/Makefile index aaa7803..a759e5c 100644 --- a/mds/Makefile +++ b/mds/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-07-20 16:26:45 macan> +# Time-stamp: <2012-08-10 13:59:22 macan> # # This is the makefile for HVFS project. # @@ -13,15 +13,15 @@ include ../Makefile.inc all : mds_lib %.o : %.c $(mds_h_depend_files) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(MDS)/%.c, $<) mds_lib : $(MDS_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libmds.a $(^:.c=.o) - @echo -e " " SL"\t" $(MDS_SO) + @$(ECHO) -e " " SL"\t" $(MDS_SO) @$(CC) -shared -Wl,-soname,libmds.so.1 -o $(LIB_PATH)/libmds.so.1.0 $(^:.c=.o) -lc -lrt -lpthread clean : -@rm -rf $(MDS_AR_SOURCE:.c=.o) - -@rm -rf libmds.a libmds.so.1.0 \ No newline at end of file + -@rm -rf libmds.a $(LIB_PATH)/libmds.so.1.0 \ No newline at end of file diff --git a/mds/async.c b/mds/async.c index a141020..40ae7e7 100644 --- a/mds/async.c +++ b/mds/async.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-08-23 12:07:25 macan> + * Time-stamp: <2012-09-03 14:24:47 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -118,7 +118,7 @@ int __aur_itb_split(struct async_update_request *aur) hvfs_warning(xnet, "xnet_alloc_msg() failed, re-submit the" " AU request.\n"); au_submit(aur); - return -ENOMEM; + return -ERETRY; } /* Step 1: we should update the local bitmap */ mds_dh_bitmap_update(&hmo.dh, i->h.puuid, i->h.itbid, @@ -753,7 +753,8 @@ int __au_req_handle(void) aur->op, aur->arg); } atomic64_inc(&hmo.prof.misc.au_handle); - xfree(aur); + if (err != -ERETRY) + xfree(aur); return err; } @@ -800,7 +801,8 @@ void au_handle_split_sync(void) hvfs_err(mds, "AU (split/bitmap[%ld]) handle error %d\n", aur->op, err); } - xfree(aur); + if (err != -ERETRY) + xfree(aur); return; } diff --git a/mds/c2m.c b/mds/c2m.c index 0a24b6f..fd8fe37 100644 --- a/mds/c2m.c +++ b/mds/c2m.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-08-17 13:11:09 macan> + * Time-stamp: <2012-08-10 15:15:45 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -748,13 +748,11 @@ void __mdsdisp mds_lb(struct hvfs_tx *tx) void mds_dump_itb(struct hvfs_tx *tx) { struct hvfs_index *hi = NULL; - int err; /* sanity checking */ if (unlikely(tx->req->tx.len < sizeof(*hi))) { hvfs_err(mds, "Invalid DITB request %d recieved\n", tx->req->tx.reqno); - err = -EINVAL; goto out; } diff --git a/mds/cbht.c b/mds/cbht.c index ebd04d0..642914f 100644 --- a/mds/cbht.c +++ b/mds/cbht.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-10-08 09:45:28 macan> + * Time-stamp: <2012-08-10 15:14:25 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,7 +38,7 @@ int mds_seg_alloc(struct segment *s, struct eh *eh) err = -ENOMEM; } - return 0; + return err; } void mds_seg_free(struct segment *s) diff --git a/mds/m2m.c b/mds/m2m.c index dd447bf..6801eee 100644 --- a/mds/m2m.c +++ b/mds/m2m.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-11-12 23:18:30 macan> + * Time-stamp: <2012-08-10 15:16:41 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -430,7 +430,7 @@ void mds_ausplit_redo(void *data, int len) void mds_forward(struct xnet_msg *msg) { - struct mds_fwd *mf; + struct mds_fwd __UNUSED__ *mf; struct xnet_msg_tx *tx; /* FIXME: we know we are using xnet-simple, so all the receiving iovs are * packed into one buf, we should save the begin address here */ @@ -916,13 +916,11 @@ void mds_audirdelta(struct xnet_msg *msg) void mds_audirdelta_r(struct xnet_msg *msg) { struct hvfs_dir_delta *hdd; - int err = 0; /* sanity checking */ if (msg->tx.len < sizeof(struct hvfs_dir_delta)) { hvfs_err(mds, "Invalid AUDIRDELTA_R request %d received from %lx\n", msg->tx.reqno, msg->tx.ssite_id); - err = -EINVAL; goto out; } diff --git a/mds/redo.c b/mds/redo.c index d6efd71..279088e 100644 --- a/mds/redo.c +++ b/mds/redo.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-08-23 11:22:40 macan>
+ * Time-stamp: <2012-08-17 09:44:46 macan>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -72,6 +72,7 @@ struct redo_logger
 
     pthread_t thread;
     int thread_stop:1;
+    unsigned int is_active:1;   /* 0/1 flag, a signed 1-bit field cannot hold 1 */
     sem_t sem;
     struct redo_logger_local_queue rllq;
     struct redo_logger_queue *rlq;
@@ -96,7 +97,7 @@ static inline
 struct xnet_group *__get_active_site(struct chring *r)
 {
     struct xnet_group *xg = NULL;
-    int i, err;
+    int i, __UNUSED__ err;
 
     for (i = 0; i < r->used; i++) {
         err = xnet_group_add(&xg, r->array[i].site_id);
@@ -321,8 +322,14 @@ void __write_log_entry(struct redo_log_site *rls)
             if (cnt <= 5)
                 goto retry;
             else {
+                int err = 0;
+
                 hvfs_err(mds, "Completely corrupted file, truncate it!\n");
-                ftruncate(g_rl.rlfd, 0);
+
+                if ((err = ftruncate(g_rl.rlfd, 0)) < 0) {
+                    hvfs_err(mds, "truncate file failed w/ %d (%s)!\n",
+                             errno, strerror(errno)); /* cause is in errno */
+                }
                 goto out;
             }
         }
@@ -492,6 +499,7 @@ int redo_log_init(struct chring *r, int replica_nr)
     atomic64_set(&g_rl.client_redo_nr, 0);
     atomic64_set(&g_rl.in_rep_redo_nr, 0);
     atomic64_set(&g_rl.reap_rep_redo_nr, 0);
+    g_rl.is_active = 1;
 
     /* get the reap interval from EV */
     {
@@ -591,6 +599,8 @@ int redo_log_init(struct chring *r, int replica_nr)
                           xg->sites[e].site_id, i, replica_nr - 1);
             }
         }
+    } else {
+        g_rl.is_active = 0;
     }
 
     hvfs_info(mds, "Setting up %d replica(s) for site %lx\n",
@@ -698,7 +708,7 @@ struct redo_log_site *add_cli_log_entry(u64 txg, u32 id, u16 op,
 {
     struct redo_log_site *rls;
 
-    if (hmo.state < HMO_STATE_RUNNING)
+    if (hmo.state < HMO_STATE_RUNNING || !g_rl.is_active)
         return NULL;
 
     rls = __add_cli_log_entry(NULL, txg, id, op, dlen, hi, data);
@@ -719,7 +729,7 @@ struct redo_log_site *add_create_log_entry(u64 txg, u32 dlen,
     struct redo_log_site *rls;
     struct gdt_md *go = data, *gi;
 
-    if (hmo.state < HMO_STATE_RUNNING)
+    if (hmo.state < HMO_STATE_RUNNING || !g_rl.is_active)
         return NULL;
 
     if (hi->flag &
INDEX_CREATE_GDT) { @@ -747,7 +757,7 @@ struct redo_log_site *add_ausplit_log_entry(u64 txg, u64 ssite, u32 dlen, { struct redo_log_site *rls = NULL; - if (hmo.state < HMO_STATE_RUNNING) + if (hmo.state < HMO_STATE_RUNNING || !g_rl.is_active) return NULL; rls = xzalloc(sizeof(*rls) + dlen); diff --git a/mds/txg.c b/mds/txg.c index 87ee15b..6969522 100644 --- a/mds/txg.c +++ b/mds/txg.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-07-22 15:18:15 macan> + * Time-stamp: <2012-09-03 16:00:29 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -726,6 +726,8 @@ int txg_wb_itb(struct commit_thread_arg *cta, struct hvfs_txg *t, (*freed)++; } else if (ih->state == ITB_STATE_DIRTY) { /* write w/ lock holding */ + hvfs_debug(mds, "ITB %ld %p state %x, ref %d, flag %d.\n", + ih->itbid, i, ih->state, atomic_read(&ih->ref), ih->flag); if (tmpi) { /* ok, we just copy the itb to the temp itb */ memcpy(tmpi, i, atomic_read(&i->h.len)); @@ -1300,7 +1302,7 @@ static inline struct xnet_group *__get_active_site(struct chring *r) { struct xnet_group *xg = NULL; - int i, err; + int i, __UNUSED__ err; for (i = 0; i < r->used; i++) { err = xnet_group_add(&xg, r->array[i].site_id); diff --git a/mdsl/Makefile b/mdsl/Makefile index 850eec8..3095863 100644 --- a/mdsl/Makefile +++ b/mdsl/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-06-23 22:17:49 macan> +# Time-stamp: <2012-08-10 13:59:35 macan> # # This is the makefile for HVFS project. 
# @@ -13,15 +13,15 @@ include ../Makefile.inc all : mdsl_lib %.o : %.c $(mdsl_h_depend_files) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(MDSL)/%.c, $<) mdsl_lib : $(MDSL_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libmdsl.a $(^:.c=.o) - @echo -e " " SL"\t" $(MDSL_SO) + @$(ECHO) -e " " SL"\t" $(MDSL_SO) @$(CC) -shared -Wl,-soname,libmdsl.so.1 -o $(LIB_PATH)/libmdsl.so.1.0 $(^:.c=.o) -lc -lrt -lpthread clean : -@rm -rf $(MDSL_AR_SOURCE:.c=.o) - -@rm -rf libmdsl.a libmdsl.so.1.0 \ No newline at end of file + -@rm -rf libmdsl.a $(LIB_PATH)/libmdsl.so.1.0 \ No newline at end of file diff --git a/mdsl/c2ml.c b/mdsl/c2ml.c index 5da2447..fd94b7e 100644 --- a/mdsl/c2ml.c +++ b/mdsl/c2ml.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-06-27 22:29:00 macan> + * Time-stamp: <2012-08-10 17:11:47 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -334,7 +334,7 @@ void mdsl_write(struct xnet_msg *msg) void mdsl_statfs(struct xnet_msg *msg) { struct statfs *s = (struct statfs *)xzalloc(sizeof(struct statfs)); - struct iovec iov; + struct iovec iov = {0,}; int err = 0; /* ABI: diff --git a/mdsl/m2ml.c b/mdsl/m2ml.c index 07f59e7..abf7864 100644 --- a/mdsl/m2ml.c +++ b/mdsl/m2ml.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-10-20 12:27:39 macan> + * Time-stamp: <2012-08-10 15:02:31 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -547,12 +547,10 @@ void mdsl_bitmap_commit(struct xnet_msg *msg) struct fdhash_entry *fde; struct mdsl_storage_access msa; struct bc_commit_core *bcc; - size_t len; u64 location = -1UL; u64 size = 0; int err = 0; - len = msg->tx.len; if (msg->xm_datacheck) bcc = msg->xm_data; else @@ -700,12 +698,10 @@ void mdsl_bitmap_commit_v2(struct xnet_msg *msg) struct mdsl_storage_access msa; struct bc_commit_core *bcc; union bmmap_disk *bd; - size_t len; u64 location = -1UL, update_location = -1UL; u64 size = 0; int err = 0, nr; - len = msg->tx.len; if (msg->xm_datacheck) bcc = msg->xm_data; else @@ -930,12 +926,14 @@ void mdsl_wbtxg(struct xnet_msg *msg) "nlink %d\n", hdd->site_id, hdd->duuid, hdd->flag, atomic_read(&hdd->nlink)); + hdd = (void *)hdd + sizeof(*hdd); } + ASSERT(p + region_len == hdd, mdsl); } } if (msg->tx.arg0 & HVFS_WBTXG_R_DIR_DELTA) { /* FIXME: should we do sth on this region? 
*/
-            size_t region_len = 0;
+            size_t __UNUSED__ region_len = 0;
             loff_t offset = tb->dir_delta_nr * sizeof(struct hvfs_dir_delta);
 
             if (tb && toe && toe->other_region) {
@@ -947,12 +945,14 @@ void mdsl_wbtxg(struct xnet_msg *msg)
                 for (i = 0; i < tb->rdd_nr; i++) {
                     hvfs_err(mdsl, "HDD site %lx uuid %ld flag %x nlink %d\n",
                              hdd->site_id, hdd->duuid, hdd->flag, hdd->nlink);
+                    hdd = (void *)hdd + sizeof(*hdd);
                 }
+                ASSERT(p + region_len == hdd, mdsl);
 #endif
             }
         }
         if (msg->tx.arg0 & HVFS_WBTXG_BITMAP_DELTA) {
-            size_t region_len = 0;
+            size_t __UNUSED__ region_len = 0;
             loff_t offset = tb->dir_delta_nr * sizeof(struct hvfs_dir_delta) +
                 tb->rdd_nr * sizeof(struct hvfs_dir_delta);
 
@@ -966,7 +966,10 @@ void mdsl_wbtxg(struct xnet_msg *msg)
                     hvfs_err(mdsl, "sid %lx uuid %ld oitb %ld nitb %ld\n",
-                             (bd + i)->site_id, (bd + i)->uuid,
-                             (bd + i)->oitb, (bd + i)->nitb);
+                             bd->site_id, bd->uuid,
+                             bd->oitb, bd->nitb);
+                    /* bd is the cursor now, so never index it by i as well */
+                    bd = (void *)bd + sizeof(*bd);
                 }
+                ASSERT(p + region_len == bd, mdsl);
 #endif
             }
         }
@@ -995,6 +998,7 @@ void mdsl_wbtxg(struct xnet_msg *msg)
                              *(rd + i), err);
                 }
             }
+            ASSERT(region_len >= 0, mdsl);
         }
     }
 }
diff --git a/mdsl/mdsl.c b/mdsl/mdsl.c
index 378def6..d1c6b72 100644
--- a/mdsl/mdsl.c
+++ b/mdsl/mdsl.c
@@ -3,7 +3,7 @@
  *
  *
  * Armed with EMACS.
- * Time-stamp: <2012-08-07 11:05:57 macan> + * Time-stamp: <2012-08-09 14:16:41 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -460,7 +460,7 @@ void mdsl_help(void) " hvfs_mdsl_gc_interval wakeup interval for gc thread.\n" " hvfs_mdsl_opt_write_drop drop the writes to this MDSL.\n" ); - hvfs_plain(mdsl, "Any questions please contacts Ma Can \n"); + hvfs_plain(mdsl, "Any questions please contacts Ma Can.\n"); } /* mdsl_init() @@ -472,7 +472,6 @@ int mdsl_init(void) /* lib init */ lib_init(); - mdsl_pre_init(); /* FIXME: decode the cmdline */ /* FIXME: configurations */ diff --git a/mdsl/mdsl.h b/mdsl/mdsl.h index fbdff2a..cb54867 100644 --- a/mdsl/mdsl.h +++ b/mdsl/mdsl.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2012-06-07 00:04:15 macan> + * Time-stamp: <2012-08-10 15:31:52 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -516,6 +516,8 @@ int __mdisk_add_range_nolock(struct fdhash_entry *fde, u64 begin, u64 end, u64 range_id); void __mdisk_range_sort(void *ranges, size_t size); int append_buf_destroy_async(struct fdhash_entry *fde); +int mdsl_storage_bulk_load(struct fdhash_entry *fde, + struct mdsl_storage_access *msa); /* defines for buf flush */ #define ABUF_ASYNC 0x01 diff --git a/mdsl/mdsl_config.h b/mdsl/mdsl_config.h index a22a66f..ac943d3 100644 --- a/mdsl/mdsl_config.h +++ b/mdsl/mdsl_config.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-04-23 13:06:33 macan> + * Time-stamp: <2012-08-07 14:38:10 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -64,7 +64,7 @@ #define HVFS_MDSL_GET_kmg(name, value) do { \ double base; \ char *p; \ - (value) = getenv("hvfs_mds_" #name); \ + (value) = getenv("hvfs_mdsl_" #name); \ if (value) { \ base = strtod(value, &p); \ if (*p == 'g' || *p == 'G') { \ diff --git a/mdsl/storage.c b/mdsl/storage.c index f2d4b12..019daa4 100644 --- a/mdsl/storage.c +++ b/mdsl/storage.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2012-08-06 18:05:17 macan> + * Time-stamp: <2012-08-10 15:15:16 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -815,6 +815,9 @@ int mdsl_storage_init(void) "change state to RECOVERY\n", err); } + hvfs_info(mdsl, "MDSL %lx log recovery return %s\n", + hmo.site_id, + (err == 0 ? "CLEAN" : "?DIRTY?")); return 0; } @@ -3198,4 +3201,5 @@ int mdsl_storage_bulk_load(struct fdhash_entry *fde, /* syncer helpers */ int __sync_md(u64 duuid) { + return 0; } diff --git a/mdsl/syncer.c b/mdsl/syncer.c new file mode 100644 index 0000000..93c8555 --- /dev/null +++ b/mdsl/syncer.c @@ -0,0 +1,315 @@ +/** + * Copyright (c) 2012 Ma Can + * + * + * Armed with EMACS. + * Time-stamp: <2012-06-08 00:02:04 macan> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "hvfs.h"
+#include "xnet.h"
+#include "mdsl.h"
+
+struct sync_record
+{
+    struct list_head list;      /* link in syncer_mgr.queue */
+
+    u64 duuid;                  /* uuid of the dir to sync */
+#define SYNC_NONE       0x00
+#define SYNC_MD         0x01
+#define SYNC_RANGE      0x02
+#define SYNC_DATA       0x04
+#define SYNC_ALL        0xff
+    u8 flag;                    /* SYNC_* bits: which parts to sync */
+};
+
+struct syncer_mgr
+{
+    struct list_head queue;     /* pending sync_record list */
+    xlock_t qlock;              /* protects queue */
+    sem_t qsem;                 /* posted on every syncer_add() */
+
+    /* replicas, select N - 1 remote nodes to replicate */
+    u64 *replica_sites;         /* which site to send */
+    u64 *replicated_sites;      /* which site to notify sync */
+    int replica_nr;
+    int replicated_nr;
+
+    /* sync state */
+#define SYNC_STATE_INIT         0x00 /* init state */
+#define SYNC_STATE_CHK          0x01 /* booting check */
+#define SYNC_STATE_ADDIN        0x02 /* add a new sync node */
+#define SYNC_STATE_NORM         0x03 /* normal state */
+    u8 state;
+#define SYNC_FREE               0x00
+#define SYNC_ING                0x01 /* there is an active syncing */
+    u8 sync_state;
+
+    /* thread info */
+    pthread_t syncer_thread;
+    u8 syncer_thread_stop;      /* set to 1 to ask the thread to exit */
+};
+
+struct syncer_thread_arg
+{
+    int tid;
+};
+
+static struct syncer_mgr g_sm;
+/* Queue a sync request for dir duuid (flag: SYNC_* bits), wake the syncer */
+int syncer_add(u64 duuid, u8 flag)
+{
+    struct sync_record *sr;
+
+    sr = xzalloc(sizeof(*sr));  /* size of the record, not the record */
+    if (!sr) {
+        return -ENOMEM;
+    }
+
+    INIT_LIST_HEAD(&sr->list);
+    /* no argument checking */
+    sr->duuid = duuid;
+    sr->flag = flag;
+
+    xlock_lock(&g_sm.qlock);
+    list_add_tail(&sr->list, &g_sm.queue);
+    xlock_unlock(&g_sm.qlock);
+
+    /* notify the syncer */
+    sem_post(&g_sm.qsem);
+
+    return 0;
+}
+/* Pop and handle one queued record; returns -EHSTOP on an empty queue */
+static inline
+int __serv_request(void)
+{
+    struct sync_record *sr = NULL, *pos, *n;
+    u64 __UNUSED__ site;        /* not used yet, see Step 1 below */
+    int err = 0;
+
+    xlock_lock(&g_sm.qlock);
+    list_for_each_entry_safe(pos, n, &g_sm.queue, list) {
+        list_del_init(&pos->list);
+        sr = pos;
+        break;
+    }
+    xlock_unlock(&g_sm.qlock);
+
+    if (!sr)
+        return -EHSTOP;
+
+    /* ok, deal with it */
+    /* Step 1: determine which node to sync? */
+
+    /* Step 2: compare and sync the specific files */
+    if (sr->flag & SYNC_MD) {
+        err = __sync_md(sr->duuid);
+        if (err) {
+            hvfs_err(mdsl, "Sync dir %lx MD failed w/ %d\n",
+                     sr->duuid, err);
+            goto out_failed;
+        }
+    }
+    if (sr->flag & SYNC_RANGE) {
+        err = __sync_range(sr->duuid);
+        if (err) {
+            hvfs_err(mdsl, "Sync dir %lx RANGE failed w/ %d\n",
+                     sr->duuid, err);
+            goto out_failed;
+        }
+    }
+    if (sr->flag & SYNC_DATA) {
+        err = __sync_data(sr->duuid);
+        if (err) {
+            hvfs_err(mdsl, "Sync dir %lx DATA failed w/ %d\n",
+                     sr->duuid, err);
+            goto out_failed;
+        }
+    }
+    xfree(sr);                  /* fully handled, do not leak the record */
+    return err;
+out_failed:                     /* put the record back for a later retry */
+    xlock_lock(&g_sm.qlock);
+    list_add_tail(&sr->list, &g_sm.queue);
+    xlock_unlock(&g_sm.qlock);
+    return err;
+}
+/* Syncer thread body: wait on qsem, then drain the request queue */
+static
+void *syncer_main(void *arg)
+{
+    struct syncer_thread_arg __UNUSED__ *sta = arg;
+    sigset_t set;
+    int err = 0;
+
+    /* first, let us block the SIGALRM and SIGCHLD */
+    sigemptyset(&set);
+    sigaddset(&set, SIGALRM);
+    sigaddset(&set, SIGCHLD);
+    pthread_sigmask(SIG_BLOCK, &set, NULL); /* oh, we do not care about the
+                                             * errs */
+    while (!g_sm.syncer_thread_stop) {
+        err = sem_wait(&g_sm.qsem);
+        if (err < 0 && errno == EINTR)  /* sem_wait() reports via errno */
+            continue;
+
+        /* trying to handle more and more sync request */
+        while (1) {
+            err = __serv_request();
+            if (err) {          /* empty queue or failure: stop draining */
+                if (err != -EHSTOP)
+                    hvfs_err(mdsl, "Syncer thread handle request w/ err %d\n",
+                             err);
+                break;          /* a requeued record waits for next wakeup */
+            }
+        }
+    }
+    pthread_exit(0);
+}
+
+/* Scan the on-disk sync directory
+ *
+ * MDSL_HOME/600xx/sync/mgr       {sync_mgr}
+ * MDSL_HOME/600xx/sync/600yy/
+ * MDSL_HOME/600xx/sync/600zz/
+ * ...
+ *
+ */
+int syncer_disk_scan()
+{
+    int err = 0;
+
+    return err;
+}
+/* Set up the sync dir, pick the replica sites and start the syncer thread */
+int syncer_init(int replica_nr)
+{
+    char path[256];
+    struct xnet_group *xg;
+    pthread_attr_t attr;
+    int err = 0, stacksize, i, j, e = 0;
+
+    /* make sure the dir exists */
+    snprintf(path, sizeof(path), "%s/%lx/sync", hmo.conf.mdsl_home, hmo.site_id);
+    err = mdsl_dir_make_exist(path);
+    if (err) {
+        hvfs_err(mdsl, "dir %s does not exist %d.\n", path, err);
+        return -ENOTEXIST;
+    }
+
+    /* prepare the new replica view */
+    if (replica_nr < 2) {
+        hvfs_warning(mdsl, "Setting no HARD replica for this node.\n");
+    } else if (replica_nr >= 2) {
+        g_sm.replica_sites = xzalloc((replica_nr - 1) * sizeof(u64));
+        if (!g_sm.replica_sites) {
+            hvfs_err(mdsl, "xzalloc() relica_sites' array failed\n");
+            err = -ENOMEM;
+            goto out;
+        }
+
+        /* select replicas */
+        xg = __get_active_site(r); /* FIXME: 'r' (the ring) is not declared */
+        if (!xg) {
+            hvfs_warning(mdsl, "Only use local logger, not HA now!\n");
+            replica_nr = 1;
+        } else {
+            /* sort the group, thus we will get consistent group view */
+            xnet_group_sort(xg);
+
+            if (replica_nr > xg->asize) {
+                /* this means that user defined replica is larger than active
+                 * site group, we decrease the replica_nr */
+                replica_nr = xg->asize;
+            }
+
+            /* select next replica_nr - 1 sites from xg group for load
+             * balance */
+            for (i = 0; i < xg->asize; i++) {
+                if (xg->sites[i].site_id == hmo.site_id) {
+                    e = i;      /* start from our own slot in the group */
+                    break;
+                }
+            }
+
+            for (i = 0; i < replica_nr - 1; i++) {
+            reselect:
+                e = NEXT_SITE(e, xg);
+                if (xg->sites[e].site_id == hmo.site_id)
+                    goto reselect;
+                for (j = 0; j < i; j++) {
+                    if (xg->sites[e].site_id == g_sm.replica_sites[j]) {
+                        /* conflict, reselect */
+                        goto reselect;
+                    }
+                }
+                g_sm.replica_sites[i] = xg->sites[e].site_id;
+                hvfs_info(mdsl, "Select site %lx as my SYNC replica <%d/%d>.\n",
+                          xg->sites[e].site_id, i, replica_nr - 1);
+            }
+        }
+    }
+
+    /* check the on-disk replica view */
+    err = syncer_disk_scan();
+    if (err) {
+        hvfs_err(mdsl, "Scan syncer disk state failed w/ %d\n", err);
+        goto out;
+    }
+
+    /* init the thread stack size */
+    err = pthread_attr_init(&attr);
+    if (err) {
+        hvfs_err(mdsl, "Init pthread attr failed\n");
+        goto out;
+    }
+    stacksize = (hmo.conf.stacksize > (1 << 20) ?
+                 hmo.conf.stacksize : (1 << 20));
+    err = pthread_attr_setstacksize(&attr, stacksize);
+    if (err) {
+        hvfs_err(mdsl, "set thread stack size to %d failed w/ %d\n",
+                 stacksize, err);
+        goto out;
+    }
+
+    /* init the mgr struct */
+    INIT_LIST_HEAD(&g_sm.queue);
+    xlock_init(&g_sm.qlock);
+    sem_init(&g_sm.qsem, 0, 0);
+
+    /* init syncer thread */
+    err = pthread_create(&g_sm.syncer_thread, &attr, &syncer_main, NULL);
+    if (err) {
+        hvfs_err(mdsl, "create syncer thread failed w/ %d\n", err);
+        goto out;
+    }
+
+out:
+    return err;
+}
+/* Ask the syncer thread to stop, join it, then destroy the semaphore */
+void syncer_destory()
+{
+    g_sm.syncer_thread_stop = 1;
+    sem_post(&g_sm.qsem);
+    /* FIXME: shall we wait for all the pending sync records are handled? */
+    pthread_join(g_sm.syncer_thread, NULL);
+    sem_destroy(&g_sm.qsem);
+}
+
diff --git a/osd/Makefile b/osd/Makefile
new file mode 100644
index 0000000..5fd7a78
--- /dev/null
+++ b/osd/Makefile
@@ -0,0 +1,29 @@
+##
+# Copyright (c) 2009 Ma Can
+#
+#
+# Time-stamp: <2012-08-10 13:59:50 macan>
+#
+# This is the makefile for HVFS project.
+#
+# Armed with EMACS.
+
+include ../Makefile.inc
+
+# None of these names is a real file: keep a stray 'all'/'clean' from hiding them.
+.PHONY : all osd_lib clean
+all : osd_lib
+
+%.o : %.c $(osd_h_depend_files)
+	@$(ECHO) -e " " CC"\t" $@
+	@$(CC) $(CFLAGS) -c $(patsubst %.c, $(OSD)/%.c, $<)
+
+osd_lib : $(OSD_AR_SOURCE:.c=.o)
+	@$(ECHO) -e " " AR"\t" $@
+	@$(AR) rcs libosd.a $(^:.c=.o)
+	@$(ECHO) -e " " SL"\t" $(OSD_SO)
+	@$(CC) -shared -Wl,-soname,libosd.so.1 -o $(LIB_PATH)/libosd.so.1.0 $(^:.c=.o) -lc -lrt -lpthread
+
+clean :
+	-@rm -rf $(OSD_AR_SOURCE:.c=.o)
+	-@rm -rf libosd.a $(LIB_PATH)/libosd.so.1.0
diff --git a/osd/dispatch.c b/osd/dispatch.c
new file mode 100644
index 0000000..296a198
--- /dev/null
+++ b/osd/dispatch.c
@@ -0,0 +1,115 @@
+/**
+ * Copyright (c) 2009 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-07 14:48:02 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "hvfs.h"
+#include "xnet.h"
+#include "osd.h"
+
+/* control the object read to N - 1 spool threads */
+atomic_t obj_reads = {.counter = 0,};
+
+void osd_handle_err(struct xnet_msg *msg, int err)
+{
+    xnet_free_msg(msg);         /* NOTE(review): err is not reported anywhere */
+}
+
+static int osd_ring_dispatch(struct xnet_msg *msg)
+{
+    switch (msg->tx.cmd) {
+    case HVFS_FR2_AU:
+        if (!osd_addr_table_update(msg))
+            return 0;           /* callee freed msg on its success path */
+        break;                  /* callee failed w/o freeing: drop it below */
+    case HVFS_FR2_RU:
+        hvfs_warning(osd, "Ignore R2 ring update request.\n");
+        break;
+    default:
+        hvfs_err(osd, "Invalid R2 request: 0x%lx\n", msg->tx.cmd);
+    }
+    xnet_free_msg(msg);         /* nobody consumed the message: release it */
+
+    return 0;
+}
+
+/* osd_dispatch()
+ *
+ * The first dispatcher of OSD
+ */
+int osd_dispatch(struct xnet_msg *msg)
+{
+    int err = 0;
+
+    /* check the state here */
+l0_recheck:
+    switch (hoo.state) {
+    case HOO_STATE_INIT:
+        /* wait */
+        while (hoo.state == HOO_STATE_INIT) {
+            sched_yield();
+        }
+        /* recheck it */
+        goto l0_recheck;
+    case HOO_STATE_LAUNCH:
+        /* reinsert back to reqin list unless it is a RECOVERY request from
+         * RING server */
+        if (HVFS_IS_RING(msg->tx.ssite_id)) {
+            return osd_ring_dispatch(msg);
+        } else
+            osd_spool_redispatch(msg, 0);
+        return -EAGAIN;
+    case HOO_STATE_RUNNING:
+        break;
+    case HOO_STATE_PAUSE:
+        /* enable reqin quest dropping after handling existing requests */
+        break;
+    case HOO_STATE_RDONLY:
+        /* drop modify requests */
+        break;
+    case HOO_STATE_OFFLINE:
+        /* drop object r/w requests */
+        break;
+    default:
+        HVFS_BUGON("Unknown OSD state");
+    }
+
+    switch (msg->tx.cmd) {
+    case HVFS_OSD_READ:         /* NOTE(review): stub, msg is not handled yet */
+        break;
+    case HVFS_OSD_WRITE:
+        break;
+    case HVFS_OSD_SYNC:
+        break;
+    case HVFS_OSD_STATFS:
+        break;
+    default:
+        if (HVFS_IS_RING(msg->tx.ssite_id)) {
+            return osd_ring_dispatch(msg);
+        }
+
+        hvfs_err(osd, "OSD core dispatcher handle INVALID request <0x%lx %d>\n",
+                 msg->tx.ssite_id, msg->tx.reqno);
+        osd_handle_err(msg, -EINVAL);
+    }
+
+    return err;
+}
diff --git a/osd/lprof.h b/osd/lprof.h
new file mode 100644
index 0000000..9cd6df1
--- /dev/null
+++ b/osd/lprof.h
@@ -0,0 +1,96 @@
+/**
+ * Copyright (c) 2009 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-07 15:06:47 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __OSD_PROF_H__
+#define __OSD_PROF_H__
+
+struct osd_client_prof
+{
+    atomic64_t objrnr;          /* # of read objects */
+    atomic64_t objwnr;          /* # of write objects */
+    atomic64_t objrbytes;       /* # of read bytes */
+    atomic64_t objwbytes;       /* # of write bytes */
+};
+
+struct osd_ring_prof
+{
+    atomic64_t update;          /* # of ring update msg */
+    atomic64_t size;            /* total size of ring update msg */
+};
+
+struct osd_mds_prof
+{
+};
+
+struct osd_mdsl_prof
+{
+    atomic64_t objrnr;          /* # of read objects */
+    atomic64_t objwnr;          /* # of write objects */
+    atomic64_t objrbytes;       /* # of read bytes */
+    atomic64_t objwbytes;       /* # of write bytes */
+};
+
+struct osd_misc_prof
+{
+    atomic64_t reqin_total;     /* # of total requests coming in */
+    atomic64_t reqin_handle;    /* # of handled requests */
+};
+
+struct osd_storage_prof
+{
+    atomic64_t wbytes;          /* # of bytes written */
+    atomic64_t rbytes;          /* # of bytes read */
+    atomic64_t wreq;            /* # of requests written */
+    atomic64_t rreq;            /* # of requests read */
+    atomic64_t cpbytes;         /* # of bytes copied to mmap region */
+};
+
+struct osd_prof
+{
+    time_t ts;
+    struct osd_client_prof client;
+    struct osd_ring_prof ring;
+    struct osd_mds_prof mds;
+    struct osd_mdsl_prof mdsl;
+    struct osd_misc_prof misc;
+    struct osd_storage_prof storage;
+    struct xnet_prof *xnet;
+};
+
+#ifdef hvfs_pf
+#undef hvfs_pf
+#endif
+
+#ifndef hvfs_pf
+#define hvfs_pf(f, a...) do {                       \
+        if (hoo.conf.pf_file) {                     \
+            FPRINTK(hoo.conf.pf_file, f, ## a);     \
+            FFLUSH(hoo.conf.pf_file);               \
+        } else {                                    \
+            PRINTK(f, ## a);                        \
+            FFLUSH(stdout);                         \
+        }                                           \
+    } while (0)
+#endif
+
+#endif
diff --git a/osd/osd.c b/osd/osd.c
new file mode 100644
index 0000000..7b135e4
--- /dev/null
+++ b/osd/osd.c
@@ -0,0 +1,522 @@
+/**
+ * Copyright (c) 2012 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-14 16:29:34 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "osd.h"
+#include "lib.h"
+
+#ifdef HVFS_TRACING
+u32 hvfs_osd_tracing_flags = HVFS_DEFAULT_LEVEL;
+#endif
+
+/* global variable */
+struct hvfs_osd_info hoi;
+struct hvfs_osd_object hoo;
+
+static void osd_sigaction_default(int signo, siginfo_t *info, void *arg)
+{
+    if (signo == SIGSEGV) {
+        hvfs_info(osd, "Recv %sSIGSEGV%s %s\n",
+                  HVFS_COLOR_RED,
+                  HVFS_COLOR_END,
+                  SIGCODES(info->si_code));
+        lib_segv(signo, info, arg);
+    } else if (signo == SIGBUS) {
+        hvfs_info(osd, "Recv %sSIGBUS%s %s\n",
+                  HVFS_COLOR_RED,
+                  HVFS_COLOR_END,
+                  SIGCODES(info->si_code));
+        lib_segv(signo, info, arg);
+    } else if (signo == SIGHUP) {
+        hvfs_info(osd, "Exit OSD Server ...\n");
+        osd_destroy();
+        exit(0);
+    } else if (signo == SIGUSR1) {
+        hvfs_info(osd, "Exit some threads ...\n");
+        pthread_exit(0);
+    }
+
+}
+
+/* osd_init_signal()
+ */
+static int osd_init_signal(void)
+{
+    struct sigaction ac;
+    int err;
+
+    ac.sa_sigaction = osd_sigaction_default;
+    err = sigemptyset(&ac.sa_mask);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    ac.sa_flags = SA_SIGINFO;
+
+#ifndef UNIT_TEST
+    err = sigaction(SIGTERM, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    err = sigaction(SIGHUP, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    /* FIXME: mask the SIGINT for testing */
+#if 0
+    err = sigaction(SIGINT, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+#endif
+    err = sigaction(SIGSEGV, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    err = sigaction(SIGBUS, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    err = sigaction(SIGQUIT, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    err = sigaction(SIGUSR1, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+#endif
+
+out:
+    return err;
+}
+
+static void osd_itimer_default(int signo, siginfo_t *info, void *arg)
+{
+    u64 cur = time(NULL);
+
+    sem_post(&hoo.timer_sem);
+    /* Note that, we must check the profiling interval at here, otherwise
+     * checking the profiling interval at timer_thread will lost some
+     * statistics */
+    osd_dump_profiling(cur, &hoo.hp);
+    hoo.tick = cur;
+    hvfs_verbose(osd, "Did this signal handler called?\n");
+
+    return;
+}
+
+static int __gcd(int m, int n)
+{
+    int r, temp;
+
+    if (!m && !n)
+        return 0;
+    else if (!m)
+        return n;
+    else if (!n)
+        return m;
+
+    if (m < n) {
+        temp = m;
+        m = n;
+        n = temp;
+    }
+    r = m;
+    while (r) {
+        r = m % n;
+        m = n;
+        n = r;
+    }
+
+    return m;
+}
+
+static void osd_hb_wrapper(time_t t)
+{
+    static time_t prev = 0;
+
+    if (!hoo.cb_hb)
+        return;
+
+    if (t < prev + hoo.conf.hb_interval)
+        return;
+    prev = t;
+    hoo.cb_hb(&hoo);
+}
+
+static void *osd_timer_thread_main(void *arg)
+{
+    sigset_t set;
+    time_t cur;
+    int v, err;
+
+    hvfs_debug(osd, "I am running...\n");
+
+    /* first, let us block the SIGALRM */
+    sigemptyset(&set);
+    sigaddset(&set, SIGALRM);
+    pthread_sigmask(SIG_BLOCK, &set, NULL); /* oh, we do not care about the
+                                             * errs */
+    /* then, we loop for the timer events */
+    while (!hoo.timer_thread_stop) {
+        err = sem_wait(&hoo.timer_sem);
+        if (err < 0 && errno == EINTR) /* EINTR comes via errno, ret is -1 */
+            continue;
+        sem_getvalue(&hoo.timer_sem, &v);
+        hvfs_debug(osd, "OK, we receive a SIGALRM event(remain %d).\n", v);
+        cur = time(NULL);
+        /* should we work now */
+        osd_dump_profiling(cur, &hoo.hp);
+        /* check the pending IOs */
+        //osd_storage_pending_io();
+        /* do heart beat */
+        osd_hb_wrapper(cur);
+    }
+
+    hvfs_debug(osd, "Hooo, I am exiting...\n");
+    pthread_exit(0);
+}
+
+static int osd_setup_timers(void)
+{
+    struct sigaction ac;
+    struct itimerval value, ovalue, pvalue;
+    int which = ITIMER_REAL, interval;
+    int err;
+
+    /* init the timer semaphore */
+    sem_init(&hoo.timer_sem, 0, 0);
+
+    /* ok, we create the timer thread now */
+    err = pthread_create(&hoo.timer_thread, NULL, &osd_timer_thread_main,
+                         NULL);
+    if (err)
+        goto out;
+    /* then, we setup the itimers */
+    memset(&ac, 0, sizeof(ac));
+    sigemptyset(&ac.sa_mask);
+    ac.sa_flags = SA_SIGINFO;   /* required for a 3-arg sa_sigaction handler */
+    ac.sa_sigaction = osd_itimer_default;
+    err = sigaction(SIGALRM, &ac, NULL);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    err = getitimer(which, &pvalue);
+    if (err) {
+        err = errno;
+        goto out;
+    }
+    interval = __gcd(hoo.conf.profiling_thread_interval,
+                     hoo.conf.hb_interval);
+    if (interval) {
+        value.it_interval.tv_sec = interval;
+        value.it_interval.tv_usec = 0;
+        value.it_value.tv_sec = interval;
+        value.it_value.tv_usec = 0;
+        err = setitimer(which, &value, &ovalue);
+        if (err) {
+            err = errno;
+            goto out;
+        }
+        hvfs_debug(osd, "OK, we have created a timer thread to "
+                   " profile events every %d second(s).\n", interval);
+    } else {
+        hvfs_debug(osd, "Hoo, there is no need to setup itimers based on the"
+                   " configration.\n");
+        hoo.timer_thread_stop = 1;
+    }
+
+out:
+    return err;
+}
+
+void osd_reset_itimer(void)
+{
+    struct itimerval value, ovalue, pvalue;
+    int err, interval;
+
+    err = getitimer(ITIMER_REAL, &pvalue);
+    if (err) {
+        goto out;
+    }
+    interval = __gcd(hoo.conf.profiling_thread_interval,
+                     hoo.conf.hb_interval);
+    if (interval) {
+        value.it_interval.tv_sec = interval;
+        value.it_interval.tv_usec = 0;
+        value.it_value.tv_sec = interval;
+        value.it_value.tv_usec = 0;
+        err = setitimer(ITIMER_REAL, &value, &ovalue);
+        if (err) {
+            goto out;
+        }
+        hvfs_info(osd, "OK, we reset the itimer to %d second(s).\n",
+                  interval);
+    }
+out:
+    return;
+}
+
+/* osd_pre_init()
+ */
+void osd_pre_init(void)
+{
+    /* prepare the hoi & hoo */
+    memset(&hoi, 0, sizeof(hoi));
+    memset(&hoo, 0, sizeof(hoo));
+#ifdef HVFS_DEBUG_LOCK
+    lock_table_init();
+#endif
+    /* setup the state */
+    hoo.state = HOO_STATE_INIT;
+}
+
+/* osd_verify() hoo.site_id is ready now
+ */
+int osd_verify(void)
+{
+    char path[256] = {0, };
+    int err = 0;
+
+    /* check the OSD_HOME */
+    err = osd_storage_dir_make_exist(hoo.conf.osd_home);
+    if (err) {
+        hvfs_err(osd, "dir %s do not exist.\n", hoo.conf.osd_home);
+        goto out;
+    }
+
+    /* check the OSD site directory; osd_home is user supplied, bound it */
+    snprintf(path, sizeof(path), "%s/%lx", hoo.conf.osd_home, hoo.site_id);
+    err = osd_storage_dir_make_exist(path);
+    if (err) {
+        hvfs_err(osd, "dir %s do not exist.\n", path);
+        goto out;
+    }
+
+    /* check if we need a recovery */
+    if (hoo.aux_state) {
+        //err = osd_do_recovery();
+        if (err) {
+            hvfs_err(osd, "OSD do recovery failed w/ %d\n",
+                     err);
+        }
+        hoo.aux_state = 0;
+    }
+
+    /* setup running state */
+    hoo.state = HOO_STATE_RUNNING;
+
+    /* write down a LOG pair to indicate a new instance */
+    osd_startup_normal();
+
+out:
+    return err;
+}
+
+/* osd_config()
+ *
+ * Get configuration from the execution environment
+ */
+int osd_config(void)
+{
+    char *value;
+
+    if (hoo.state != HOO_STATE_INIT) {
+        hvfs_err(osd, "OSD state is not in launching, please call "
+                 "osd_pre_init() firstly\n");
+        return -EINVAL;
+    }
+
+    HVFS_OSD_GET_ENV_strncpy(dcaddr, value, OSD_DCONF_MAX_NAME_LEN);
+    HVFS_OSD_GET_ENV_cpy(osd_home, value);
+    HVFS_OSD_GET_ENV_cpy(profiling_file, value);
+    HVFS_OSD_GET_ENV_cpy(conf_file, value);
+    HVFS_OSD_GET_ENV_cpy(log_file, value);
+
+    HVFS_OSD_GET_ENV_atoi(spool_threads, value);
+    HVFS_OSD_GET_ENV_atoi(aio_threads, value);
+    HVFS_OSD_GET_ENV_atoi(prof_plot, value);
+    HVFS_OSD_GET_ENV_atoi(profiling_thread_interval, value);
+    HVFS_OSD_GET_ENV_atoi(stacksize, value);
+
+    HVFS_OSD_GET_ENV_option(write_drop, WDROP, value);
+
+    /* set default osd home */
+    if (!hoo.conf.osd_home) {
+        hoo.conf.osd_home = HVFS_OSD_HOME;
+    }
+
+    if (!hoo.conf.spool_threads)
+        hoo.conf.spool_threads = 8; /* double # of CPUs */
+
+    if (!hoo.conf.profiling_thread_interval)
+        hoo.conf.profiling_thread_interval = 5;
+    if (!hoo.conf.hb_interval)
+        hoo.conf.hb_interval = 60;
+
+    return 0;
+}
+
+/* osd_help()
+ */
+void osd_help(void)
+{
+    hvfs_plain(osd, "OSD build @ %s on %s\n", CDATE, CHOST);
+    hvfs_plain(osd, "Usage: [EV list] osd\n\n");
+    hvfs_plain(osd, "General Environment Variables:\n"
+               " hvfs_osd_dcaddr dynamic config addr for "
+               "UNIX sockets.\n"
+               " hvfs_osd_profiling_file profiling file name.\n"
+               " hvfs_osd_conf_file config file name.\n"
+               " hvfs_osd_log_file log file name.\n"
+               " hvfs_osd_spool_threads spool threads nr.\n"
+               " hvfs_osd_prof_plot output for gnuplot.\n"
+               " hvfs_osd_profiling_thread_interval\n"
+               " wakeup interval for prof thread.\n"
+               " hvfs_osd_opt_write_drop drop the writes to this OSD.\n"
+        );
+    hvfs_plain(osd, "Any questions please contacts Ma Can \n");
+}
+
+/* osd_init()
+ */
+int osd_init(void)
+{
+    int err;
+
+    /* lib init */
+    lib_init();
+
+    /* FIXME: decode the cmdline */
+
+    /* Init the signal handlers */
+    err = osd_init_signal();
+    if (err)
+        goto out_signal;
+
+    /* FIXME: setup the timers */
+    err = osd_setup_timers();
+    if (err)
+        goto out_timers;
+
+    /* init storage */
+    err = osd_storage_init();
+    if (err)
+        goto out_storage;
+
+    /* FIXME: init the service threads' pool */
+    err = osd_spool_create();
+    if (err)
+        goto out_spool;
+
+    /* init the aio threads */
+    //err = osd_aio_create();
+    if (err)
+        goto out_aio;
+
+    /* mask the SIGUSR1 signal for main thread */
+    {
+        sigset_t set;
+
+        sigemptyset(&set);
+        sigaddset(&set, SIGUSR1);
+        pthread_sigmask(SIG_BLOCK, &set, NULL);
+    }
+
+    /* ok to run */
+    hoo.state = HOO_STATE_LAUNCH;
+
+out_aio:
+out_spool:
+out_storage:
+out_timers:
+out_signal:
+    return err;
+}
+
+void osd_destroy(void)
+{
+    hvfs_verbose(osd, "OK, stop it now...\n");
+
+    /* unreg w/ the r2 server */
+    if (hoo.cb_exit) {
+        hoo.cb_exit(&hoo);
+    }
+
+    /* stop the timer thread */
+    hoo.timer_thread_stop = 1;
+    if (hoo.timer_thread) {
+        sem_post(&hoo.timer_sem); /* it sleeps in sem_wait(): wake it first */
+        pthread_join(hoo.timer_thread, NULL);
+    }
+
+    sem_destroy(&hoo.timer_sem);
+
+    /* destroy the service threads' pool */
+    osd_spool_destroy();
+
+    /* destroy the storage */
+    osd_storage_destroy();
+
+    /* you should wait for the storage destroied and exit the AIO threads */
+    //osd_aio_destroy();
+
+    /* finally, we write our finish flag to objlog file */
+    osd_exit_normal();
+}
+
+u64 osd_select_ring(struct hvfs_osd_object *hoo)
+{
+    if (hoo->ring_site)
+        return hoo->ring_site;
+    else
+        return HVFS_RING(0);
+}
+
+void osd_set_ring(u64 site_id)
+{
+    hoo.ring_site = site_id;
+}
+
+int osd_addr_table_update(struct xnet_msg *msg)
+{
+    if (msg->xm_datacheck) {
+        if (hoo.cb_addr_table_update)
+            hoo.cb_addr_table_update(msg->xm_data);
+    } else {
+        hvfs_err(osd, "Invalid addr table update message, incomplete hst!\n");
+        return -EINVAL;
+    }
+
+    xnet_free_msg(msg);
+
+    return 0;
+}
diff --git a/osd/osd.h b/osd/osd.h
new file mode 100644
index 0000000..7a1a296
--- /dev/null
+++ b/osd/osd.h
@@ -0,0 +1,221 @@
+/**
+ * Copyright (c) 2012 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-14 17:24:29 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __OSD_H__
+#define __OSD_H__
+
+#include "hvfs.h"
+#include "obj.h"
+#include "xnet.h"
+#include "lprof.h"
+#include "root.h"
+#include "osd_config.h"
+
+#ifdef HVFS_TRACING
+extern u32 hvfs_osd_tracing_flags;
+#endif
+
+#define HVFS_OSD_HOME "/tmp/hvfs"
+
+#define OSD_DEFAULT_PREFIX_LEN 4
+
+struct log_manager
+{
+    atomic_t addnr;
+    atomic_t delnr;
+    struct list_head add;
+    struct list_head del;
+    xlock_t add_lock;
+    xlock_t del_lock;
+};
+
+struct scan_manager
+{
+    struct list_head head;
+    xlock_t lock;
+};
+
+struct osd_storage
+{
+    xlock_t objlog_fd_lock;     /* obj log file's lock */
+    int objlog_fd;              /* obj log file FD */
+    struct log_manager lm;      /* obj add/del in current session */
+    struct scan_manager sm;
+};
+
+struct osd_conf
+{
+    /* section for dynamic configuration */
+    char dcaddr[OSD_DCONF_MAX_NAME_LEN];
+    int dcfd, dcepfd;
+    pthread_t dcpt;
+
+    /* section for file name */
+    char *osd_home;
+    char *profiling_file;
+    char *conf_file;
+    char *log_file;             /* log file(HOME/log) for osd add/del */
+
+    /* section for file id */
+    FILE *pf_file, *cf_file, *lf_file;
+
+    /* # of threads */
+    /* NOTE: # of profiling thread is always ONE */
+    int spool_threads;          /* # of service threads */
+    int aio_threads;            /* # of io threads */
+
+    /* misc configs */
+    int stacksize;
+
+#define OSD_PROF_NONE 0x00
+#define OSD_PROF_PLOT 0x01
+#define OSD_PROF_HUMAN 0x02
+#define OSD_PROF_R2 0x03
+    u8 prof_plot;               /* do we dump profilings for gnuplot */
+
+    /* intervals */
+    int profiling_thread_interval;
+    int hb_interval;
+
+    /* conf */
+#define HVFS_OSD_WDROP 0x01 /* drop all the writes to this OSD */
+    u64 option;
+};
+
+struct hvfs_osd_object
+{
+    u64 site_id;                /* this site */
+    struct xnet_context *xc;
+
+    struct osd_storage storage;
+    struct osd_prof prof;
+    struct osd_conf conf;
+#define HOO_STATE_INIT 0x00
+#define HOO_STATE_LAUNCH 0x01
+#define HOO_STATE_RUNNING 0x02
+#define HOO_STATE_PAUSE 0x03
+#define HOO_STATE_RDONLY 0x04
+#define HOO_STATE_OFFLINE 0x05
+    u32 state;
+
+#define HMO_AUX_STATE_RECOVERY 0x01 /* NOTE(review): HMO_ prefix in an OSD header, confirm intended */
+    u32 aux_state;
+
+    u64 ring_site;
+    time_t tick;                /* tick of this OSD */
+
+    /* the following region is used for threads */
+    sem_t timer_sem;            /* for timer thread wakeup */
+    atomic64_t pending_ios;     /* pending IOs */
+
+    pthread_t timer_thread;
+    pthread_t *spool_thread;    /* array of service threads */
+    pthread_t *aio_thread;      /* array of aio threads */
+
+    /* osd profiling array */
+    struct hvfs_profile hp;
+
+    u32 timer_thread_stop:1;    /* running flag for timer thread */
+    u32 spool_thread_stop:1;    /* running flag for service thread */
+    u32 aio_thread_stop:1;      /* running flag for aio thread */
+
+    /* callback functions */
+    void (*cb_exit)(void *);
+    void (*cb_hb)(void *);
+    void (*cb_addr_table_update)(void *);
+};
+
+extern struct hvfs_osd_info hoi;
+extern struct hvfs_osd_object hoo;
+extern atomic_t obj_reads;
+
+#define LOG_BEGIN_MAGIC 0x32fce973
+#define LOG_END_MAGIC 0x23cf9c7f
+#define LOG_ENTRY_MAGIC 0x7913c94f
+struct log_entry
+{
+    u32 magic;
+    u32 session;
+    union
+    {
+        struct
+        {
+            struct objid id;
+            u64 site_id:63;
+            u64 add_or_del:1;   /* there is no read/write log entry! */
+        } _entry;
+        struct
+        {
+            u32 addnr;
+            u32 delnr;
+        } _end;
+    };
+    u64 ts;                     /* time stamp */
+};
+
+/* APIs */
+int osd_config(void);
+void osd_help(void);
+
+void osd_pre_init(void);
+int osd_init(void);
+int osd_verify(void);
+
+void osd_reset_itimer(void);
+
+void osd_destroy(void);
+u64 osd_select_ring(struct hvfs_osd_object *);
+void osd_set_ring(u64);
+int osd_addr_table_update(struct xnet_msg *);
+
+/* prof.c */
+void osd_dump_profiling(time_t t, struct hvfs_profile *hp);
+
+/* storage.c */
+int osd_storage_dir_make_exist(char *path);
+int osd_storage_init(void);
+void osd_storage_destroy(void);
+void osd_startup_normal(void);
+void osd_exit_normal(void);
+
+/* spool.c */
+int osd_spool_dispatch(struct xnet_msg *);
+void osd_spool_redispatch(struct xnet_msg *, int);
+
+/* the following are marker types */
+#define OSD_MRK_PAUSE 0x01
+#define OSD_MRK_RDONLY 0x02
+#define OSD_MRK_OFFLINE 0x03
+#define OSD_CLR_PAUSE 0xf1
+#define OSD_CLR_RDONLY 0xf2
+#define OSD_CLR_OFFLINE 0xf3
+int osd_set_marker(u32 type);
+int osd_clr_marker(u32 type);
+
+int osd_spool_create(void);
+void osd_spool_destroy(void);
+
+/* dispatch.c */
+int osd_dispatch(struct xnet_msg *);
+
+#endif
diff --git a/osd/osd_config.h b/osd/osd_config.h
new file mode 100644
index 0000000..a50a573
--- /dev/null
+++ b/osd/osd_config.h
@@ -0,0 +1,81 @@
+/**
+ * Copyright (c) 2012 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-07 14:39:51 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __OSD_CONFIG_H__
+#define __OSD_CONFIG_H__
+/* Copy $hvfs_osd_<name> into the char array hoo.conf.<name> (len bytes),
+ * truncating if needed; snprintf() always NUL-terminates, strncpy() did not */
+#define HVFS_OSD_GET_ENV_strncpy(name, value, len) do {    \
+        (value) = getenv("hvfs_osd_" #name);                \
+        if (value)                                          \
+            snprintf(hoo.conf.name, len, "%s", value);      \
+    } while (0)
+
+#define HVFS_OSD_GET_ENV_cpy(name, value) do {              \
+        (value) = getenv("hvfs_osd_" #name);                \
+        if (value) {                                        \
+            hoo.conf.name = value;                          \
+        }                                                   \
+    } while (0)
+
+#define HVFS_OSD_GET_ENV_atoi(name, value) do {             \
+        (value) = getenv("hvfs_osd_" #name);                \
+        if (value) {                                        \
+            hoo.conf.name = atoi(value);                    \
+        }                                                   \
+    } while (0)
+
+#define HVFS_OSD_GET_ENV_atol(name, value) do {             \
+        (value) = getenv("hvfs_osd_" #name);                \
+        if (value) {                                        \
+            hoo.conf.name = atol(value);                    \
+        }                                                   \
+    } while (0)
+
+#define HVFS_OSD_GET_ENV_option(name, uname, value) do {    \
+        (value) = getenv("hvfs_osd_opt_" #name);            \
+        if (value) {                                        \
+            if (atoi(value) != 0) {                         \
+                hoo.conf.option |= HVFS_OSD_##uname;        \
+            }                                               \
+        }                                                   \
+    } while (0)
+
+#define HVFS_OSD_GET_kmg(name, value) do {                  \
+        double base;                                        \
+        char *p;                                            \
+        (value) = getenv("hvfs_osd_" #name);                \
+        if (value) {                                        \
+            base = strtod(value, &p);                       \
+            if (*p == 'g' || *p == 'G') {                   \
+                base *= 1024 * 1024 * 1024;                 \
+            } else if (*p == 'm' || *p == 'M') {            \
+                base *= 1024 * 1024;                        \
+            } else if (*p == 'k' || *p == 'K') {            \
+                base *= 1024;                               \
+            }                                               \
+            hoo.conf.name = (u64)base;                      \
+        }                                                   \
+    } while (0)
+
+#endif
diff --git a/osd/prof.c b/osd/prof.c
new file mode 100644
index 0000000..92813de
--- /dev/null
+++ b/osd/prof.c
@@ -0,0 +1,227 @@
+/**
+ * Copyright (c) 2012 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-08 11:04:27 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "hvfs.h"
+#include "xnet.h"
+#include "osd.h"
+#include "lprof.h"
+
+static inline
+void dump_profiling_r2(time_t t, struct hvfs_profile *hp)
+{
+    int i = 0;
+
+    if (!hoo.conf.profiling_thread_interval)
+        return;
+    if (t < hoo.prof.ts + hoo.conf.profiling_thread_interval) {
+        return;
+    }
+    hoo.prof.ts = t;
+    hp->flag |= HP_UP2DATE;
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, t);
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.client.objrnr));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.client.objwnr));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.client.objrbytes));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.client.objwbytes));
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.ring.update));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.ring.size));
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.mdsl.objrnr));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.mdsl.objwnr));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.mdsl.objrbytes));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.mdsl.objwbytes));
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.misc.reqin_total));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.misc.reqin_handle));
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, hoo.prof.xnet ?
+                             atomic64_read(&hoo.prof.xnet->msg_alloc) : 0);
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, hoo.prof.xnet ?
+                             atomic64_read(&hoo.prof.xnet->msg_free) : 0);
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, hoo.prof.xnet ?
+                             atomic64_read(&hoo.prof.xnet->inbytes) : 0);
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, hoo.prof.xnet ?
+                             atomic64_read(&hoo.prof.xnet->outbytes) : 0);
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, hoo.prof.xnet ?
+                             atomic64_read(&hoo.prof.xnet->active_links) : 0);
+
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.storage.wbytes));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.storage.rbytes));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.storage.wreq));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.storage.rreq));
+    HVFS_PROFILE_VALUE_ADDIN(hp, i, atomic64_read(&hoo.prof.storage.cpbytes));
+
+    hp->nr = i;
+
+    /* send the request to R2 server now */
+    {
+        static struct hvfs_profile ghp = {.nr = 0,};
+        struct hvfs_profile diff;
+        struct xnet_msg *msg;
+        u64 dsite;
+        int err = 0, i;
+
+        if (!ghp.nr) {
+            diff = ghp = *hp;
+            /* reset time stamp to ZERO */
+            diff.hpv[0].value = 0;
+        } else {
+            diff = *hp;
+            for (i = 0; i < hp->nr; i++) {
+                diff.hpv[i].value -= ghp.hpv[i].value;
+            }
+            ghp = *hp;
+        }
+
+        /* reset the flag now */
+        hp->flag &= (~HP_UP2DATE);
+
+        /* prepare the xnet_msg */
+        msg = xnet_alloc_msg(XNET_MSG_NORMAL);
+        if (!msg) {
+            hvfs_err(osd, "xnet_alloc_msg() failed.\n");
+            err = -ENOMEM;
+            goto out;
+        }
+
+        /* send this profile to r2 server */
+        dsite = osd_select_ring(&hoo);
+        xnet_msg_fill_tx(msg, XNET_MSG_REQ, 0, hoo.site_id, dsite);
+        xnet_msg_fill_cmd(msg, HVFS_R2_PROFILE, 0, 0);
+#ifdef XNET_EAGER_WRITEV
+        xnet_msg_add_sdata(msg, &msg->tx, sizeof(msg->tx));
+#endif
+        xnet_msg_add_sdata(msg, &diff, sizeof(diff));
+
+        err = xnet_send(hoo.xc, msg);
+        if (err) {
+            hvfs_err(osd, "Profile request to R2(%lx) failed w/ %d\n",
+                     dsite, err);
+            goto out_free_msg;
+        }
+    out_free_msg:
+        xnet_free_msg(msg);
+    }
+
+out:
+    return;
+}
+
+static inline
+void dump_profiling_plot(time_t t)
+{
+    if (!hoo.conf.profiling_thread_interval)
+        return;
+    if (t < hoo.prof.ts + hoo.conf.profiling_thread_interval) {
+        return;
+    }
+    hoo.prof.ts = t;
+    /* the output format is :
+     *
+     * "timestamp client.objrnr, client.objwnr, client.objrbytes,
+     * client.objwbytes, ring.update, ring.size, mdsl.objrnr, mdsl.objwnr,
+     * mdsl.objrbytes, mdsl.objwbytes, misc.reqin_total, misc.reqin_handle,
+     * xnet.msg_alloc, xnet.msg_free, xnet.inbytes, xnet.outbytes,
+     * xnet.active_links, storage.wbytes, storage.rbytes, storage.wreq,
+     * storage.rreq, storage.cpbytes
+     *
+     * Note that, we send this profile header to r2 server. If you are
+     * modifying this header, please make sure to modify the definition in
+     * root/profile.c -> hvfs_osd_profile_setup()!
+     */
+    hvfs_pf("PLOT %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld "
+            "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+            t,
+            atomic64_read(&hoo.prof.client.objrnr),
+            atomic64_read(&hoo.prof.client.objwnr),
+            atomic64_read(&hoo.prof.client.objrbytes),
+            atomic64_read(&hoo.prof.client.objwbytes),
+            atomic64_read(&hoo.prof.ring.update),
+            atomic64_read(&hoo.prof.ring.size),
+            atomic64_read(&hoo.prof.mdsl.objrnr),
+            atomic64_read(&hoo.prof.mdsl.objwnr),
+            atomic64_read(&hoo.prof.mdsl.objrbytes),
+            atomic64_read(&hoo.prof.mdsl.objwbytes),
+            atomic64_read(&hoo.prof.misc.reqin_total),
+            atomic64_read(&hoo.prof.misc.reqin_handle),
+            (hoo.prof.xnet ?
+             atomic64_read(&hoo.prof.xnet->msg_alloc) : 0),
+            (hoo.prof.xnet ?
+             atomic64_read(&hoo.prof.xnet->msg_free) : 0),
+            (hoo.prof.xnet ?
+             atomic64_read(&hoo.prof.xnet->inbytes) : 0),
+            (hoo.prof.xnet ?
+             atomic64_read(&hoo.prof.xnet->outbytes) : 0),
+            (hoo.prof.xnet ?
+             atomic64_read(&hoo.prof.xnet->active_links) : 0),
+            atomic64_read(&hoo.prof.storage.wbytes),
+            atomic64_read(&hoo.prof.storage.rbytes),
+            atomic64_read(&hoo.prof.storage.wreq),
+            atomic64_read(&hoo.prof.storage.rreq),
+            atomic64_read(&hoo.prof.storage.cpbytes)
+        );
+}
+
+static inline
+void dump_profiling_human(time_t t)
+{
+    if (!hoo.conf.profiling_thread_interval)
+        return;
+    if (t < hoo.prof.ts + hoo.conf.profiling_thread_interval) {
+        return;
+    }
+    hoo.prof.ts = t;
+    if (hoo.prof.xnet) {
+        hvfs_info(osd, "%16ld | XNET Prof: alloc %ld, free %ld, inb %ld, "
+                  "outb %ld, links %ld\n", t,
+                  atomic64_read(&hoo.prof.xnet->msg_alloc),
+                  atomic64_read(&hoo.prof.xnet->msg_free),
+                  atomic64_read(&hoo.prof.xnet->inbytes),
+                  atomic64_read(&hoo.prof.xnet->outbytes),
+                  atomic64_read(&hoo.prof.xnet->active_links));
+    }
+    hvfs_info(osd, "%16ld -- MISC Prof: reqin_total %ld, reqin_handle %ld\n",
+              t,
+              atomic64_read(&hoo.prof.misc.reqin_total),
+              atomic64_read(&hoo.prof.misc.reqin_handle));
+}
+
+void osd_dump_profiling(time_t t, struct hvfs_profile *hp)
+{
+    switch (hoo.conf.prof_plot) {
+    case OSD_PROF_PLOT:
+        dump_profiling_plot(t);
+        break;
+    case OSD_PROF_HUMAN:
+        dump_profiling_human(t);
+        break;
+    case OSD_PROF_R2:
+        /* always send the current profiling copy to HVFS_RING(0)? */
+        dump_profiling_r2(t, hp);
+        break;
+    case OSD_PROF_NONE:
+    default:;
+    }
+}
diff --git a/osd/spool.c b/osd/spool.c
new file mode 100644
index 0000000..0a51a6c
--- /dev/null
+++ b/osd/spool.c
@@ -0,0 +1,337 @@
+/**
+ * Copyright (c) 2012 Ma Can
+ *
+ *
+ * Armed with EMACS.
+ * Time-stamp: <2012-08-07 14:46:04 macan>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "hvfs.h" +#include "xnet.h" +#include "osd.h" +#include "ring.h" +#include "lib.h" + +struct spool_mgr +{ + struct list_head reqin; + xlock_t rin_lock; + sem_t rin_sem; +#define OSD_CMD_PAUSE 0x01 /* is pause now (drop all messages) */ +#define OSD_CMD_RDONLY 0x02 /* drop all modify messages */ +#define OSD_CMD_OFFLINE 0x04 /* drop all but R2 messages */ +#define OSD_CMD_MASK 0x0f + u32 flags; +}; + +#define OSD_IS_PAUSED(mgr) ((mgr).flags & OSD_CMD_PAUSE) +#define OSD_IS_RDONLY(mgr) ((mgr).flags & OSD_CMD_RDONLY) +#define OSD_IS_OFFLINE(mgr) ((mgr).flags & OSD_CMD_OFFLINE) + +struct spool_thread_arg +{ + int tid; +}; + +static struct spool_mgr spool_mgr; + +int osd_spool_dispatch(struct xnet_msg *msg) +{ + xlock_lock(&spool_mgr.rin_lock); + list_add_tail(&msg->list, &spool_mgr.reqin); + xlock_unlock(&spool_mgr.rin_lock); + atomic64_inc(&hoo.prof.misc.reqin_total); + sem_post(&spool_mgr.rin_sem); + + return 0; +} + +void osd_spool_redispatch(struct xnet_msg *msg, int sempost) +{ + xlock_lock(&spool_mgr.rin_lock); + list_add_tail(&msg->list, &spool_mgr.reqin); + xlock_unlock(&spool_mgr.rin_lock); + if (sempost) + sem_post(&spool_mgr.rin_sem); +} + +static inline +int osd_dispatch_check(struct xnet_msg *msg) +{ + if (msg->tx.cmd == HVFS_OSD_READ) { + if (atomic_inc_return(&obj_reads) >= hoo.conf.spool_threads) { + atomic_dec(&obj_reads); + return 1; + } + } + + return 0; +} + +static inline +int osd_is_marker(struct xnet_msg *msg) +{ + if (msg->tx.type == XNET_MSG_CMD) + return 1; + 
return 0; +} + +static inline +void osd_update_marker(struct xnet_msg *msg, struct spool_mgr *mgr) +{ + switch (msg->tx.cmd) { + case OSD_MRK_PAUSE: + mgr->flags |= OSD_CMD_PAUSE; + break; + case OSD_MRK_RDONLY: + mgr->flags |= OSD_CMD_RDONLY; + break; + case OSD_MRK_OFFLINE: + mgr->flags |= OSD_CMD_OFFLINE; + break; + case OSD_CLR_PAUSE: + mgr->flags &= (~OSD_CMD_PAUSE); + break; + case OSD_CLR_RDONLY: + mgr->flags &= (~OSD_CMD_RDONLY); + break; + case OSD_CLR_OFFLINE: + mgr->flags &= (~OSD_CMD_OFFLINE); + break; + default: + hvfs_warning(osd, "Invalid OSD Spool marker (%ld)\n", msg->tx.cmd); + } + /* free the msg now */ + xnet_raw_free_msg(msg); +} + +static inline +int __osd_set_marker(u32 type) +{ + struct xnet_msg *msg; + + msg = xnet_alloc_msg(XNET_MSG_NORMAL); + if (!msg) { + hvfs_err(osd, "xnet_alloc_msg() failed\n"); + /* do not retry myself */ + return -ENOMEM; + } + xnet_msg_fill_tx(msg, XNET_MSG_CMD, 0, hoo.site_id, hoo.site_id); + + /* insert the message to reqin queue */ + osd_spool_dispatch(msg); + + return 0; +} + +int osd_set_marker(u32 type) +{ + return __osd_set_marker(type); +} + +int osd_clr_marker(u32 type) +{ + return __osd_set_marker(type); +} + +static inline +int osd_marked(struct spool_mgr *mgr) +{ + return mgr->flags & OSD_CMD_MASK; +} + +/* + * Return value: 1=>filtered; 0=>not_filtered + */ +static int osd_filter_msg(struct xnet_msg *msg) +{ + if (OSD_IS_PAUSED(spool_mgr)) { + /* drop all messages */ + xnet_free_msg(msg); + return 1; + } + if (OSD_IS_RDONLY(spool_mgr)) { + /* drop modify messages */ + if (msg->tx.cmd == HVFS_OSD_WRITE) { + xnet_free_msg(msg); + return 1; + } + } + if (OSD_IS_OFFLINE(spool_mgr)) { + /* drop all but RING messages */ + if (!HVFS_IS_RING(msg->tx.ssite_id)) { + xnet_free_msg(msg); + return 1; + } + } + + return 0; +} + +static inline +int __serv_request(void) +{ + struct xnet_msg *msg = NULL, *pos, *n; + + xlock_lock(&spool_mgr.rin_lock); + list_for_each_entry_safe(pos, n, &spool_mgr.reqin, list) { + 
list_del_init(&pos->list); + msg = pos; + break; + } + xlock_unlock(&spool_mgr.rin_lock); + + if (!msg) + return -EHSTOP; + + /* check if this request can be dealed right now */ + if (osd_dispatch_check(msg)) { + /* reinsert the request to the queue */ + xlock_lock(&spool_mgr.rin_lock); + list_add_tail(&msg->list, &spool_mgr.reqin); + xlock_unlock(&spool_mgr.rin_lock); + sem_post(&spool_mgr.rin_sem); + + return 0; + } + + /* check if we should handle the following requests */ + if (osd_is_marker(msg)) { + osd_update_marker(msg, &spool_mgr); + return 0; + } + if (osd_marked(&spool_mgr)) { + /* filter the msg now */ + if (osd_filter_msg(msg)) { + return 0; + } + } + + /* ok, deal with it, we just calling the secondary dispatcher */ + ASSERT(msg->xc, osd); + ASSERT(msg->xc->ops.dispatcher, osd); + atomic64_inc(&hoo.prof.misc.reqin_handle); + return msg->xc->ops.dispatcher(msg); +} + +static +void *spool_main(void *arg) +{ + struct spool_thread_arg *sta = (struct spool_thread_arg *)arg; + sigset_t set; + int err = 0; + + /* first, let us block the SIGALRM and SIGCHLD */ + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigaddset(&set, SIGCHLD); + pthread_sigmask(SIG_BLOCK, &set, NULL); /* oh, we do not care about the + * errs */ + while (!hoo.spool_thread_stop) { + err = sem_wait(&spool_mgr.rin_sem); + if (err == EINTR) + continue; + hvfs_debug(osd, "Service thread %d wakeup to handle the requests.\n", + sta->tid); + /* trying to handle more and more requsts. */ + while (1) { + err = __serv_request(); + if (err == -EHSTOP) + break; + else if (err) { + hvfs_err(osd, "Service thread handle request w/ error %d\n", + err); + break; + } + } + } + pthread_exit(0); +} + +int osd_spool_create(void) +{ + pthread_attr_t attr; + struct spool_thread_arg *sta; + int i, err = 0, stacksize; + + /* init the thread stack size */ + err = pthread_attr_init(&attr); + if (err) { + hvfs_err(osd, "Init pthread attr failed\n"); + goto out; + } + stacksize = (hoo.conf.stacksize > (1 << 20) ? 
+ hoo.conf.stacksize : (2 << 20)); + err = pthread_attr_setstacksize(&attr, stacksize); + if (err) { + hvfs_err(osd, "set thread stack size to %d failed w/ %d\n", + stacksize, err); + goto out; + } + + /* init the mgr struct */ + memset(&spool_mgr, 0, sizeof(spool_mgr)); + INIT_LIST_HEAD(&spool_mgr.reqin); + xlock_init(&spool_mgr.rin_lock); + sem_init(&spool_mgr.rin_sem, 0, 0); + + /* init service threads' pool */ + if (!hoo.conf.spool_threads) + hoo.conf.spool_threads = 4; + + hoo.spool_thread = xzalloc(hoo.conf.spool_threads * sizeof(pthread_t)); + if (!hoo.spool_thread) { + hvfs_err(osd, "xzalloc() pthread_t failed\n"); + return -ENOMEM; + } + + sta = xzalloc(hoo.conf.spool_threads * sizeof(struct spool_thread_arg)); + if (!sta) { + hvfs_err(osd, "xzalloc() struct spool_thread_arg failed\n"); + err = -ENOMEM; + goto out_free; + } + + for (i = 0; i < hoo.conf.spool_threads; i++) { + (sta + i)->tid = i; + err = pthread_create(hoo.spool_thread + i, &attr, &spool_main, + sta + i); + if (err) + goto out; + } + +out: + return err; +out_free: + xfree(hoo.spool_thread); + goto out; +} + +void osd_spool_destroy(void) +{ + int i; + + hoo.spool_thread_stop = 1; + for (i = 0; i < hoo.conf.spool_threads; i++) { + sem_post(&spool_mgr.rin_sem); + } + for (i = 0; i < hoo.conf.spool_threads; i++) { + pthread_join(*(hoo.spool_thread + i), NULL); + } + sem_destroy(&spool_mgr.rin_sem); +} diff --git a/osd/storage.c b/osd/storage.c new file mode 100644 index 0000000..8ee0e8b --- /dev/null +++ b/osd/storage.c @@ -0,0 +1,374 @@ +/** + * Copyright (c) 2009 Ma Can + * + * + * Armed with EMACS. + * Time-stamp: <2012-08-14 17:26:42 macan> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "hvfs.h" +#include "xnet.h" +#include "osd.h" +#include "lib.h" + +static u32 g_session = 0; + +int osd_storage_dir_make_exist(char *path) +{ + int err; + + err = mkdir(path, 0755); + if (err) { + err = -errno; + if (errno == EEXIST) { + err = 0; + } else if (errno == EACCES) { + hvfs_err(osd, "Failed to create the dir %s, no permission.\n", + path); + } else { + hvfs_err(osd, "mkdir %s failed w/ %d\n", path, errno); + } + } + + return err; +} + +/* calculate the prefix from objid + */ +void osd_get_prefix(struct objid oid, char *prefix) +{ + MD5_CTX mdContext; + int idx[OSD_DEFAULT_PREFIX_LEN] = {1, 7, 11, 13, }, i; + + MD5Init (&mdContext); + MD5Update (&mdContext, &oid, sizeof(oid.uuid) + sizeof(oid.bid)); + MD5Final (&mdContext); + + for (i = 0; i < OSD_DEFAULT_PREFIX_LEN; i++) { + snprintf(&prefix[i << 1], 3, "%02x", mdContext.digest[idx[i]]); + } + prefix[OSD_DEFAULT_PREFIX_LEN << 1] = '\0'; +} + +void osd_get_obj_path(struct objid oid, char *path) +{ + char prefix[2 * OSD_DEFAULT_PREFIX_LEN + 1]; + + memset(prefix, 0, sizeof(prefix)); + osd_get_prefix(oid, prefix); + /* NOTE: + * + * if OSD_DEFAULT_PREFIX_LEN != 4, then fail! 
(HOW TO FIX: change 0.4s to + * sth else) + */ + ASSERT(OSD_DEFAULT_PREFIX_LEN == 4, osd); + sprintf(path, "%s/%.4s/%s/%lx.%x", hoo.conf.osd_home, prefix, + &prefix[OSD_DEFAULT_PREFIX_LEN], + oid.uuid, oid.bid); +} + +int osd_log_integrated(void) +{ + struct log_entry le; + loff_t offset = 0; + u64 begin_session = 0, end_session = 0; + int err = -ENOENT, bl, br; + + /* read in the content from last checkpoint position */ + do { + /* get the log_entry */ + bl = 0; + do { + br = pread(hoo.storage.objlog_fd, (void *)&le + bl, + sizeof(le) - bl, offset + bl); + if (br < 0) { + hvfs_err(osd, "read objlog file failed w/ %d offset %ld\n", + errno, offset + bl); + err = -errno; + goto out; + } else if (br == 0) { + /* it is ok to break here */ + goto out_check; + } + bl += br; + } while (bl < sizeof(le)); + + if (le.magic == LOG_BEGIN_MAGIC) { + begin_session = le.session; + } else if (le.magic == LOG_END_MAGIC) { + end_session = le.session; + } + offset += sizeof(le); + } while (1); + +out_check: + if (begin_session == end_session) { + if (end_session) + err = 0; + else if (!begin_session) { + /* there is no session pair */ + err = 0; + } else { + err = -ENOENT; + } + } +out: + if (err) { + hvfs_warning(osd, "OSD objlog integrated check failed w/ %d(%s)\n", + err, strerror(-err)); + } + + return err; +} + +int osd_log_redo(void) +{ + struct log_entry le; + loff_t offset = 0; + int err = -ENOENT, bl, br; + + /* read in the content from last checkpoint position */ + do { + /* get the log_entry */ + bl = 0; + do { + br = pread(hoo.storage.objlog_fd, (void *)&le + bl, + sizeof(le) - bl, offset + bl); + if (br < 0) { + hvfs_err(osd, "read objlog file failed w/ %d offset %ld\n", + errno, offset + bl); + err = -errno; + goto out; + } else if (br == 0) { + /* it is ok to break here */ + goto out; + } + bl += br; + } while (bl < sizeof(le)); + + if (le.magic == LOG_ENTRY_MAGIC) { + /* check if the ENTRY exists */ + /* FIXME: + * construct two lists: one for add, the other for del. 
+ */ + } + offset += sizeof(le); + } while (1); + +out: + if (err) { + hvfs_warning(osd, "OSD objlog redo failed w/ %d\n", err); + } + + return err; +} + +int osd_storage_is_clean() +{ + char path[256] = {0,}; + int err = 0; + + /* try to open the log file */ + sprintf(path, "%s/%lx/objlog", hoo.conf.osd_home, hoo.site_id); + + hoo.storage.objlog_fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (hoo.storage.objlog_fd < 0) { + hvfs_err(osd, "open() objlog file %s failed %d (%s)\n", + path, errno, strerror(errno)); + err = -errno; + goto out; + } + + /* check if the log file is integrated */ + err = osd_log_integrated(); + if (err) { + hvfs_err(osd, "objlog file %s is NOT integrated.\n", + path); + } + + hvfs_info(osd, "Begin redo the logs ...\n"); + err = osd_log_redo(); + if (err) { + hvfs_err(osd, "objlog file %s redo failed w/ %d\n", + path, err); + goto out; + } + + hvfs_info(osd, "objlog file is CLEAN!\n"); + +out: + return err; +} + +void __osd_log_rename(void) +{ + char opath[256], npath[256]; + int err = 0; + + sprintf(opath, "%s/%lx/objlog", hoo.conf.osd_home, hoo.site_id); + sprintf(npath, "%s/%lx/last-objlog", hoo.conf.osd_home, hoo.site_id); + + err = rename(opath, npath); + if (err) { + hvfs_err(osd, "rename objlog to last-objlog failed w/ %d\n", + errno); + goto out; + } + + /* close old file and open new file */ + err = open(opath, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (err < 0) { + hvfs_err(osd, "open file '%s' failed w/ %d\n", opath, errno); + } + close(hoo.storage.objlog_fd); + hoo.storage.objlog_fd = err; + +out: + return; +} + +void __osd_log_pair_write(struct log_entry *le) +{ + loff_t offset; + long bw, bl; + + xlock_lock(&hoo.storage.objlog_fd_lock); + offset = lseek(hoo.storage.objlog_fd, 0, SEEK_END); + if (offset < 0) { + hvfs_err(osd, "lseek to end of fd %d failed w/ %d\n", + hoo.storage.objlog_fd, errno); + goto out_unlock; + } + /* write the LOG_ENTRY */ + bl = 0; + do { + bw = pwrite(hoo.storage.objlog_fd, (void *)le + 
bl, + sizeof(*le) - bl, offset + bl); + if (bw <= 0) { + hvfs_err(osd, "pwrite to fd %d failed w/ %d\n", + hoo.storage.objlog_fd, errno); + goto out_unlock; + } + bl += bw; + } while (bl < sizeof(*le)); + offset += sizeof(*le); + +out_unlock: + xlock_unlock(&hoo.storage.objlog_fd_lock); +} + +/* write a magic pair begin in log file */ +void osd_startup_normal(void) +{ + struct log_entry lb; + + memset(&lb, 0, sizeof(lb)); + lb.magic = LOG_BEGIN_MAGIC; + + /* set up specific info */ + lb.ts = (u64)time(NULL); + lb.session = g_session; + + /* change to a new log file */ + __osd_log_rename(); + + /* do write */ + __osd_log_pair_write(&lb); +} + +/* Write a magic pair end in log file + */ +void osd_exit_normal(void) +{ + struct log_entry le; + + memset(&le, 0, sizeof(le)); + le.magic = LOG_END_MAGIC; + le.session = g_session; + + /* set up session info */ + le._end.addnr = atomic_read(&hoo.storage.lm.addnr); + le._end.delnr = atomic_read(&hoo.storage.lm.delnr); + le.ts = (u64)time(NULL); + + __osd_log_pair_write(&le); +} + +int osd_storage_init(void) +{ + char path[256] = {0,}; + int err = 0; + + /* set the fd limit firstly */ + struct rlimit rli = { + .rlim_cur = 65536, + .rlim_max = 70000, + }; + err = setrlimit(RLIMIT_NOFILE, &rli); + if (err) { + hvfs_err(osd, "setrlimit failed w/ %s\n", strerror(errno)); + hvfs_warning(osd, "%sStorage Server has FD limit! 
To overcome " + "this limit, please use a powerful UID to run this" + " process.%s\n", + HVFS_COLOR_RED, HVFS_COLOR_END); + } + + /* check the OSD site directory */ + sprintf(path, "%s/%lx", hoo.conf.osd_home, hoo.site_id); + err = osd_storage_dir_make_exist(path); + if (err) { + hvfs_err(osd, "dir %s do not exist.\n", path); + return -ENOTEXIST; + } + + /* setup the session id */ + g_session = lib_random(INT_MAX); + + /* setup the storage manager */ + xlock_init(&hoo.storage.objlog_fd_lock); + atomic_set(&hoo.storage.lm.addnr, 0); + atomic_set(&hoo.storage.lm.delnr, 0); + INIT_LIST_HEAD(&hoo.storage.lm.add); + INIT_LIST_HEAD(&hoo.storage.lm.del); + xlock_init(&hoo.storage.lm.add_lock); + xlock_init(&hoo.storage.lm.del_lock); + + INIT_LIST_HEAD(&hoo.storage.sm.head); + xlock_init(&hoo.storage.sm.lock); + + /* check whether this storage is clean */ + err = osd_storage_is_clean(); + if (err) { + hvfs_err(osd, "storage '%s' is_clean() failed w/ %d\n", + path, err); + goto out; + } + +out: + return err; +} + +void osd_storage_destroy(void) +{ + /* close the files */ + if (hoo.conf.lf_file) + fclose(hoo.conf.lf_file); + if (hoo.conf.pf_file) + fclose(hoo.conf.pf_file); +} + diff --git a/r2/Makefile b/r2/Makefile index 986b341..c37f770 100644 --- a/r2/Makefile +++ b/r2/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-07-20 14:13:39 macan> +# Time-stamp: <2012-08-10 14:00:04 macan> # # This is the makefile for HVFS project. 
# @@ -13,13 +13,13 @@ include ../Makefile.inc all : r2_lib %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(R2)/%.c, $<) r2_lib : $(R2_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libr2.a $(^:.c=.o) - @echo -e " " SL"\t" $(R2_SO) + @$(ECHO) -e " " SL"\t" $(R2_SO) @$(CC) -shared -Wl,-soname,libr2.so.1 -o $(LIB_PATH)/libr2.so.1.0 $(^:.c=.o) -lc -lrt -lpthread clean : diff --git a/r2/cli.c b/r2/cli.c index c337a95..2e719fe 100644 --- a/r2/cli.c +++ b/r2/cli.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-06-23 09:56:06 macan> + * Time-stamp: <2012-08-10 17:15:11 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -663,7 +663,7 @@ int cli_dynamic_del_site(struct ring_entry *re, u64 site_id, int force) struct xnet_group *cli_get_active_site(struct chring *r) { struct xnet_group *xg = NULL; - int i, err; + int i, __UNUSED__ err; for (i = 0; i < r->used; i++) { err = xnet_group_add(&xg, r->array[i].site_id); diff --git a/r2/dispatch.c b/r2/dispatch.c index 08237c9..d2cc4c8 100644 --- a/r2/dispatch.c +++ b/r2/dispatch.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-06-22 11:20:03 macan> + * Time-stamp: <2012-08-13 09:47:10 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -86,6 +86,12 @@ int root_dispatch(struct xnet_msg *msg) case HVFS_R2_INFO: err = root_do_info(msg); break; + case HVFS_R2_OREP: + err = root_do_objrep(msg); + break; + case HVFS_OSD_QUERY: + err = root_do_query_obj(msg); + break; default: hvfs_err(root, "R2 core dispatcher handle INVALID " "request <0x%lx %d>\n", diff --git a/r2/mgr.c b/r2/mgr.c index 115cd79..ddf2cd3 100644 --- a/r2/mgr.c +++ b/r2/mgr.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-06-29 06:16:58 macan> + * Time-stamp: <2012-08-08 11:01:26 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1403,7 +1403,7 @@ int root_mgr_lookup_create2(struct root_mgr *rm, u64 fsid, /* root_compact_hxi() * * This function compact the need info for a request client: - * mds/mdsl/client. The caller should supply the needed arguments. + * mds/mdsl/client/osd. The caller should supply the needed arguments. * * @site_id: requested site_id * @fsid: requested fsid @@ -1420,6 +1420,7 @@ int root_compact_hxi(u64 site_id, u64 fsid, u32 gid, union hvfs_x_info *hxi) return -EINVAL; if (HVFS_IS_CLIENT(site_id) | + HVFS_IS_OSD(site_id) | HVFS_IS_BP(site_id)) { /* we should reject if root->root_salt is -1UL */ /* Step 1: find site state in the site_mgr */ @@ -2004,7 +2005,9 @@ int root_read_hxi(u64 site_id, u64 fsid, union hvfs_x_info *hxi) * mkfs utility can create a new file system w/ a fsid. 
After reading the * root entry we can construct the site by ourself:) */ - if (HVFS_IS_MDS(site_id) || HVFS_IS_MDSL(site_id)) { + if (HVFS_IS_MDS(site_id) || + HVFS_IS_MDSL(site_id) || + HVFS_IS_OSD(site_id)) { err = root_mgr_lookup_create2(&hro.root, fsid, &root); if (err < 0) { hvfs_err(root, "lookup create entry %ld failed w/ %d\n", @@ -2143,6 +2146,25 @@ int root_read_hxi(u64 site_id, u64 fsid, union hvfs_x_info *hxi) goto out; } } + } else if (HVFS_IS_OSD(site_id)) { + struct hvfs_osd_info *holi = (struct hvfs_osd_info *)hxi; + + memcpy(hxi, &sd.hxi, sizeof(*holi)); + if (holi->gdt_salt != root->gdt_salt || + holi->root_salt != root->root_salt) { + hvfs_err(root, "Internal error, salt mismatch in holi and root\n"); + if (holi->root_salt == -1UL || + !holi->gdt_salt) { + holi->gdt_salt = root->gdt_salt; + holi->root_salt = root->root_salt; + } else if (holi->gdt_salt == root->gdt_salt) { + /* it means root changing, it is ok */ + holi->root_salt = root->root_salt; + } else { + err = -EFAULT; + goto out; + } + } } else if (HVFS_IS_AMC(site_id)) { struct hvfs_amc_info *ami = (struct hvfs_amc_info *)hxi; @@ -2256,6 +2278,16 @@ int root_create_hxi(struct site_entry *se) hci->root_uuid = root->root_uuid; hci->root_salt = root->root_salt; hci->group = se->gid; + } else if (HVFS_IS_OSD(se->site_id)) { + struct hvfs_osd_info *hoi = (struct hvfs_osd_info *)&se->hxi; + + memset(hoi, 0, sizeof(*hoi)); + hoi->state = HMI_STATE_CLEAN; + hoi->group = se->gid; + atomic64_set(&hoi->mi_bused, 0); + atomic64_set(&hoi->mi_bfree, -1UL); + atomic64_set(&hoi->mi_bwrite, 0); + atomic64_set(&hoi->mi_bread, 0); } else if (HVFS_IS_MDS(se->site_id)) { struct hvfs_mds_info *hmi = (struct hvfs_mds_info *)&se->hxi; diff --git a/r2/om.c b/r2/om.c new file mode 100644 index 0000000..9873121 --- /dev/null +++ b/r2/om.c @@ -0,0 +1,907 @@ +/** + * Copyright (c) 2012 Ma Can + * + * + * Armed with EMACS. 
/* Definition of object active ratio:
 *
 * ratio = active_objs / total_objs
 *
 */
/* Configuration of the object manager (OM).
 *
 * The HVFS_OM_GET_ENV_atof/atoi/option macros defined above fill these
 * fields from environment variables named "hvfs_root_om_<field>".
 * NOTE(review): where those macros are invoked, and where the
 * HVFS_OM_DEFAULT_* values are applied, is not visible in this part of the
 * file.
 */
struct hvfs_om_conf
{
#define HVFS_OM_DEFAULT_ACTIVE_RATIO    0.99
    double active_ratio;        /* object active ratio, see above */
#define HVFS_OM_DEFAULT_OBJ_HSIZE       (1024 * 1024) /* 16M <-> 1PB */
    u32 obj_hsize;              /* hash table size */
#define HVFS_OM_DEFAULT_OSD_HSIZE       1024
    u32 osd_hsize;              /* osd table size */

    /* bit flags; HVFS_OM_GET_ENV_option sets a bit when the matching
     * environment variable parses to a non-zero integer */
#define HVFS_OM_OPTION_STORE    0x01
    u32 option;
};
active; /* active objs */ + struct regular_hash_rw *obj_tab; /* obj hash table */ + struct regular_hash_rw *osd_tab; /* osd hash table */ + +#define HVFS_OM_INIT 0x01 +#define HVFS_OM_SAFEMODE 0x02 +#define HVFS_OM_RUNNING 0x03 +#define HVFS_OM_BACKUP 0x04 + /* the state machine + * + * INIT -> SAFEMODE <-> RUNNING + * INIT -> BACKUP <-> RUNNING + */ + u32 state; /* state of the manager */ + + /* object report queue */ + struct list_head queue; + xlock_t qlock; + sem_t qsem; + pthread_t om_thread; + u32 om_thread_stop:1; +}; + +struct osd_array +{ + int size; +#if HVFS_SITE_MAX <= (1 << 32) +#define OSD_ARRAY_MOD u32 + u32 *site; +#else +#define OSD_ARRAY_MOD u64 + u64 *site; +#endif +#define OSD_ARRAY_UNIT (sizeof(OSD_ARRAY_MOD)) +}; + +struct objid_array +{ +#define OBJID_ARRAY_UNIT_MAX (1024 * 1024) + int psize, asize; + struct objid *obj; +}; + +struct obj_entry +{ + struct hlist_node hlist; /* insert to obj_tab */ + struct objid id; + atomic_t ref; + atomic_t lock; /* lock for sites' array */ + struct osd_array sites; +}; + +struct osd_entry +{ + struct hlist_node hlist; /* insert to osd_tab */ + u64 site_id; + atomic_t ref, lock; /* lock for objs' array */ + struct objid_array objs; +}; + +static struct hvfs_obj_manager om; +static u32 g_target_type; + +static inline +u32 __om_hash(struct objid id) +{ + u32 u1 = JSHash((char *)&id.uuid, sizeof(id.uuid)); + u32 u2 = RSHash((char *)&id.bid, sizeof(id.bid)); + + return u1 ^ u2; +} + +static inline +u32 __om_osd_hash(u64 site) +{ + return JSHash((char *)&site, sizeof(site)); +} + +/* API: add the message to OM.queue + */ +int om_dispatch_objrep(struct xnet_msg *msg) +{ + xlock_lock(&om.qlock); + list_add_tail(&msg->list, &om.queue); + xlock_unlock(&om.qlock); + atomic64_inc(&hro.prof.osd.objrep_recved); + sem_post(&om.qsem); + + return 0; +} + +/* each get_obj caller must call put_obj */ +static +struct obj_entry *om_get_obj(struct objid id) +{ + struct obj_entry *oe = NULL; + struct regular_hash_rw *rh; + struct 
hlist_node *pos; + int i; + + i = __om_hash(id); + rh = om.obj_tab + i; + + i = 0; + xrwlock_rlock(&rh->lock); + hlist_for_each_entry(oe, pos, &rh->h, hlist) { + if (OBJID_EQUAL(oe->id, id)) { + atomic_inc(&oe->ref); + i = 1; + break; + } + } + xrwlock_runlock(&rh->lock); + if (!i) + oe = NULL; + + return oe; +} + +/* each get_osd caller must call put_osd */ +static +struct osd_entry *om_get_osd(u64 site) +{ + struct osd_entry *osd = NULL; + struct regular_hash_rw *rh; + struct hlist_node *pos; + int i; + + i = __om_osd_hash(site); + rh = om.osd_tab + i; + + i = 0; + xrwlock_rlock(&rh->lock); + hlist_for_each_entry(osd, pos, &rh->h, hlist) { + if (osd->site_id == site) { + atomic_inc(&osd->ref); + i = 1; + break; + } + } + xrwlock_runlock(&rh->lock); + if (!i) + osd = NULL; + + return osd; +} + +static +void om_put_obj(struct obj_entry *oe) +{ + if (atomic_dec_return(&oe->ref) == 0) { + if (oe->sites.size > 0 && oe->sites.site) + xfree(oe->sites.site); + xfree(oe); + } +} + +static +void om_put_osd(struct osd_entry *osd) +{ + if (atomic_dec_return(&osd->ref) == 0) { + if (osd->objs.psize > 0 && osd->objs.obj) + xfree(osd->objs.obj); + xfree(osd); + } +} + +static +int __om_insert_obj(struct obj_entry *oe) +{ + struct regular_hash_rw *rh; + struct obj_entry *tpos; + struct hlist_node *pos; + int i; + + i = __om_hash(oe->id); + rh = om.obj_tab + i; + + i = 0; + xrwlock_wlock(&rh->lock); + hlist_for_each_entry(tpos, pos, &rh->h, hlist) { + if (OBJID_EQUAL(tpos->id, oe->id)) { + i = 1; + break; + } + } + if (!i) + hlist_add_head(&oe->hlist, &rh->h); + xrwlock_wunlock(&rh->lock); + + if (i) { + return -EEXIST; + } + atomic64_inc(&om.active); + + return 0; +} + +static +int __om_insert_osd(struct osd_entry *osd) +{ + struct regular_hash_rw *rh; + struct osd_entry *tpos; + struct hlist_node *pos; + int i; + + i = __om_osd_hash(osd->site_id); + rh = om.osd_tab + i; + + i = 0; + xrwlock_wlock(&rh->lock); + hlist_for_each_entry(tpos, pos, &rh->h, hlist) { + if (tpos->site_id 
== osd->site_id) { + i = 1; + break; + } + } + if (!i) + hlist_add_head(&osd->hlist, &rh->h); + xrwlock_wunlock(&rh->lock); + + if (i) + return -EEXIST; + + return 0; +} + +static +void __om_remove_obj(struct objid id) +{ + struct regular_hash_rw *rh; + struct obj_entry *tpos = NULL; + struct hlist_node *pos, *n; + int i; + + i = __om_hash(id); + rh = om.obj_tab + i; + + i = 0; + xrwlock_wlock(&rh->lock); + hlist_for_each_entry_safe(tpos, pos, n, &rh->h, hlist) { + if (OBJID_EQUAL(tpos->id, id)) { + if (atomic_read(&tpos->ref) > 0) { + /* someone is dealing with current obj_entry */ + i = 1; + } + hlist_del_init(&tpos->hlist); + atomic64_dec(&om.active); + break; + } + } + xrwlock_wunlock(&rh->lock); + + if (!i) + xfree(tpos); +} + +static +void __om_remove_osd(u64 site) +{ + struct regular_hash_rw *rh; + struct osd_entry *tpos = NULL; + struct hlist_node *pos, *n; + int i; + + i = __om_osd_hash(site); + rh = om.osd_tab + i; + + i = 0; + xrwlock_wlock(&rh->lock); + hlist_for_each_entry_safe(tpos, pos, n, &rh->h, hlist) { + if (tpos->site_id == site) { + if (atomic_read(&tpos->ref) > 0) { + /* someone is dealing with current osd_entry */ + i = 1; + } + hlist_del_init(&tpos->hlist); + break; + } + } + xrwlock_wunlock(&rh->lock); + + if (!i) + xfree(tpos); +} + +/* add_or_del: 1=>add; -1=>del + */ +static +void __om_update_obj(struct obj_entry *oe, u64 site, int add_or_del) +{ + struct osd_array new; + int found = 0, i; + + if (add_or_del != 1 && add_or_del != -1) + return; + + for (i = 0; i < oe->sites.size; i++) { + if (oe->sites.site[i] == site) { + found = 1; + break; + } + } + +retry: + if (atomic_inc_return(&oe->lock) > 1) { + atomic_dec(&oe->lock); + sched_yield(); + goto retry; + } + + if (add_or_del < 0) { + if (!found) { + hvfs_err(root, "Del site %lx from objid %lx+%d failed," + " not found.\n", + site, oe->id.uuid, oe->id.bid); + return; + } else { + /* exchange with the last entry */ + OSD_ARRAY_MOD tmp; + + tmp = oe->sites.site[i]; + oe->sites.site[i] = 
oe->sites.site[oe->sites.size - 1]; + oe->sites.site[oe->sites.size - 1] = tmp; + } + } + + new.size = oe->sites.size + add_or_del; + new.site = xrealloc(oe->sites.site, new.size * OSD_ARRAY_UNIT); + if (!new.site) { + hvfs_err(root, "OM update objid %lx+%d for site %lx failed," + " no free memory.\n", + oe->id.uuid, oe->id.bid, site); + goto out; + } + + if (add_or_del > 0) { + new.site[oe->sites.size] = (OSD_ARRAY_MOD)site; + } + oe->sites.size = new.size; + oe->sites.site = new.site; + +out: + atomic_dec(&oe->lock); +} + +static +int __osd_array_realloc(struct osd_entry *oe) +{ + struct objid_array new; + int err = 0; + + ASSERT(oe->objs.psize >= oe->objs.asize, root); + + if (oe->objs.asize == oe->objs.psize) { + /* enlarge the buffer */ + if (oe->objs.psize > OBJID_ARRAY_UNIT_MAX) { + new.psize = oe->objs.psize + OBJID_ARRAY_UNIT_MAX; + } else { + new.psize = (oe->objs.psize << 1); + } + if (new.psize <= 0) + new.psize = 1024; /* default to 1024 entries */ + + new.obj = xrealloc(oe->objs.obj, new.psize * sizeof(struct objid)); + if (!new.obj) { + hvfs_err(root, "OM enlarge obj array for site %lx failed," + " no free memory.\n", + oe->site_id); + goto out; + } + } else if (oe->objs.asize < (oe->objs.psize >> 1)) { + /* shrink the buffer */ + new.psize = (oe->objs.psize >> 1); + + new.obj = xrealloc(oe->objs.obj, new.psize * sizeof(struct objid)); + if (!new.obj) { + hvfs_err(root, "OM shrink obj array for site %lx failed, " + "ignore.\n", oe->site_id); + goto out; + } + } else { + /* no need to enlarge or shrink */ + goto out; + } + + oe->objs.psize = new.psize; + oe->objs.obj = new.obj; + +out: + return err; +} + +/* add_or_del: 1=>add; -1=>del + */ +static +void __om_update_osd(struct osd_entry *osd, struct objid id, int add_or_del) +{ + int found = 0, i, err; + + if (add_or_del != 1 && add_or_del != -1) + return; + + for (i = 0; i < osd->objs.asize; i++) { + if (OBJID_EQUAL(osd->objs.obj[i], id)) { + found = 1; + break; + } + } + +retry: + if 
(atomic_inc_return(&osd->lock) > 1) { + atomic_dec(&osd->lock); + sched_yield(); + goto retry; + } + + if (add_or_del < 0) { + if (!found) { + hvfs_err(root, "Del objid %lx+%d from site %lx failed," + " not found.\n", + id.uuid, id.bid, osd->site_id); + return; + } else { + /* exchange with the last entry */ + struct objid tmp; + + tmp = osd->objs.obj[i]; + osd->objs.obj[i] = osd->objs.obj[osd->objs.asize - 1]; + osd->objs.obj[osd->objs.asize - 1] = tmp; + } + } + + err = __osd_array_realloc(osd); + if (err && add_or_del > 0) { + goto out; + } + + if (add_or_del > 0) { + osd->objs.obj[osd->objs.asize] = id; + osd->objs.asize++; + } else { + osd->objs.asize--; + } + +out: + atomic_dec(&osd->lock); +} + +/* add a objid to obj hash table, w/o site info + */ +static int om_add_obj(struct objid id) +{ + struct obj_entry *oe; + int err = 0; + + oe = xzalloc(sizeof(*oe)); + if (!oe) { + hvfs_err(root, "unable to allocate a new object entry.\n"); + return -ENOMEM; + } + + INIT_HLIST_NODE(&oe->hlist); + atomic_set(&oe->ref, 0); + atomic_set(&oe->lock, 0); + oe->id = id; + + /* try to add it to hash table */ + err = __om_insert_obj(oe); + if (err == -EEXIST) { + xfree(oe); + } + + return err; +} + +/* add a osd to osd hash table, w/o objid info + */ +static int om_add_osd(u64 site) +{ + struct osd_entry *oe; + int err = 0; + + oe = xzalloc(sizeof(*oe)); + if (!oe) { + hvfs_err(root, "unable to allocate a new osd entry.\n"); + return -ENOMEM; + } + + INIT_HLIST_NODE(&oe->hlist); + atomic_set(&oe->ref, 0); + atomic_set(&oe->lock, 0); + oe->site_id = site; + + /* try to add it to hash table */ + err = __om_insert_osd(oe); + if (err == -EEXIST) { + xfree(oe); + } + + return err; +} + +/* add this site to the objid, if objid not existed, insert it first. + * And insert this site to the site hash table w/ the objid! 
+ */ +static int om_add_obj_site(struct objid id, u64 site) +{ + struct obj_entry *oe; + struct osd_entry *osd; + int err = 0; + + /* Step 1: find and update the osd entry */ +retry0: + osd = om_get_osd(site); + if (osd) { + __om_update_osd(osd, id, 1); + om_put_osd(osd); + } else { + /* ok, create a new osd entry */ + err = om_add_osd(site); + if (err == -EEXIST) /* ignore EEXIST error */ + err = 0; + if (err) { + hvfs_err(root, "add new osd %lx failed w/ %d\n", + site, err); + return err; + } + goto retry0; + } + + /* try to find the object */ +retry: + oe = om_get_obj(id); + if (oe) { + /* ok, find it, then update it */ + __om_update_obj(oe, site, 1); + om_put_obj(oe); + } else { + /* ok, create a new obj entry */ + err = om_add_obj(id); + if (err == -EEXIST) /* ignore EEXIST error */ + err = 0; + if (err) { + hvfs_err(root, "add new obj %lx+%d failed w/ %d, leaving a " + "dangling obj in site table!\n", + id.uuid, id.bid, err); + return err; + } + goto retry; + } + + return err; +} + +/* delete the obj from obj_tab, for obj removing + */ +static int om_del_obj(struct objid id) +{ + __om_remove_obj(id); + + return 0; +} + +/* delete the site from object's osd array and id from osd's objid array + */ +static int om_del_obj_site(struct objid id, u64 site) +{ + struct obj_entry *oe; + struct osd_entry *osd; + + /* Step 1: delete from osd's objid array */ + osd = om_get_osd(site); + if (!osd) { + hvfs_warning(root, "Site %lx not found, continue deleting " + "from obj table.\n", site); + } else { + __om_update_osd(osd, id, -1); + om_put_osd(osd); + } + + /* Step 2: delete from object's osd array */ + oe = om_get_obj(id); + if (!oe) { + hvfs_err(root, "Object %lx+%d not found.\n", + id.uuid, id.bid); + return -ENOENT; + } + + __om_update_obj(oe, site, -1); + + om_put_obj(oe); + + return 0; +} + +/* delete the osd from osd_tab, for osd removing + * + * Note: for osd removing, we have to remove all the registered objects for + * this OSD. 
+ */ +static int om_del_osd(u64 site) +{ + struct osd_entry *oe; + int i, err; + + oe = om_get_osd(site); + if (!oe) { + hvfs_err(root, "Find site %lx in hash table failed!\n", site); + return -ENOENT; + } + + for (i = 0; i < oe->objs.asize; i++) { + err = om_del_obj_site(oe->objs.obj[i], site); + if (err) { + hvfs_warning(root, "Objid %lx+%d not found in site %lx, " + "ignore it\n", + oe->objs.obj[i].uuid, + oe->objs.obj[i].bid, site); + } + } + om_put_osd(oe); + + __om_remove_osd(site); + + return 0; +} + +/* API: query on object to find active OSD list + */ +struct osd_list *om_query_obj(struct objid id) +{ + struct obj_entry *oe; + struct osd_list *ol; + int i; + + oe = om_get_obj(id); + if (!oe) { + return ERR_PTR(-ENOENT); + } + /* lock the osd_array */ +retry: + if (atomic_inc_return(&oe->lock) > 1) { + atomic_dec(&oe->lock); + sched_yield(); + goto retry; + } + ol = xzalloc(sizeof(*ol) + oe->sites.size * sizeof(u64)); + if (ol) { + /* copy the osd array */ + for (i = 0; i < oe->sites.size; i++) { + ol->site[i] = oe->sites.site[i]; + } + ol->size = oe->sites.size; + } + atomic_dec(&oe->lock); + om_put_obj(oe); + + return ol; +} + +static inline +int __serv_request(void) +{ + struct xnet_msg *msg = NULL, *pos, *n; + struct obj_report_tx *ort; + int err = 0, i; + + xlock_lock(&om.qlock); + list_for_each_entry_safe(pos, n, &om.queue, list) { + list_del_init(&pos->list); + msg = pos; + break; + } + xlock_unlock(&om.qlock); + + if (!msg) + return -EHSTOP; + + /* ok, deal with the object report */ + if (msg->tx.len < sizeof(*ort)) { + hvfs_err(root, "Invalid OBJ REPORT %d received len %d from %lx\n", + msg->tx.reqno, msg->tx.len, msg->tx.ssite_id); + err = -EINVAL; + goto out; + } + + ort = msg->xm_data; + if ((ort->add_size + ort->rmv_size) * sizeof(struct objid) > msg->tx.len) { + hvfs_err(root, "Partial OBJ REPORT received (%ld,%d) from %lx\n", + (ort->add_size + ort->rmv_size) * sizeof(struct objid), + msg->tx.len, msg->tx.ssite_id); + err = -EINVAL; + goto 
out; + } + atomic64_inc(&hro.prof.osd.objrep_handled); + + if (ort->add_size < 0) { + /* remove old objects */ + om_del_osd(msg->tx.ssite_id); + ort->add_size = -ort->add_size; + } + + /* update report content to OM's obj/site table */ + for (i = 0; i < ort->add_size; i++) { + /* find the old object */ + err = om_add_obj_site(ort->ids[i], msg->tx.ssite_id); + if (err) { + hvfs_err(root, "add object %lx+%d site %lx failed.\n", + ort->ids[i].uuid, ort->ids[i].bid, msg->tx.ssite_id); + } + } + for (; i < ort->rmv_size + ort->add_size; i++) { + err = om_del_obj_site(ort->ids[i], msg->tx.ssite_id); + if (err) { + hvfs_err(root, "del object %lx+%d site %lx failed.\n", + ort->ids[i].uuid, ort->ids[i].bid, msg->tx.ssite_id); + } + } + + /* do not reply to OSD site */ +out: + xnet_free_msg(msg); + + return err; +} + +static void *om_main(void *arg) +{ + sigset_t set; + int err = 0; + + /* first, let us block the SIGALRM and SIGCHLD */ + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigaddset(&set, SIGCHLD); + pthread_sigmask(SIG_BLOCK, &set, NULL); /* oh, we do not care about the + * errs */ + while (!om.om_thread_stop) { + err = sem_wait(&om.qsem); + if (err == EINTR) + continue; + hvfs_debug(root, "OM thread wakeup to handle object reports.\n"); + /* trying to handle more and more IOs */ + while (1) { + err = __serv_request(); + if (err == -EHSTOP) + break; + else if (err) { + hvfs_err(root, "OM thread handle report w/ error %d\n", + err); + } + } + } + pthread_exit(0); +} + +int om_init(u32 type) +{ + pthread_attr_t attr; + char *value; + int err = 0, i; + + memset(&om, 0, sizeof(om)); + INIT_LIST_HEAD(&om.queue); + xlock_init(&om.qlock); + sem_init(&om.qsem, 0, 0); + g_target_type = type; + + /* get the configs from env */ + HVFS_OM_GET_ENV_atof(active_ratio, value); + HVFS_OM_GET_ENV_atoi(obj_hsize, value); + HVFS_OM_GET_ENV_atoi(osd_hsize, value); + HVFS_OM_GET_ENV_option(opt_store, STORE, value); + + /* set default values */ + if (!om.conf.active_ratio) + 
om.conf.active_ratio = HVFS_OM_DEFAULT_ACTIVE_RATIO; + if (!om.conf.obj_hsize) + om.conf.obj_hsize = HVFS_OM_DEFAULT_OBJ_HSIZE; + if (!om.conf.osd_hsize) + om.conf.osd_hsize = HVFS_OM_DEFAULT_OSD_HSIZE; + + /* init the hash tables */ + om.obj_tab = xzalloc(om.conf.obj_hsize * sizeof(struct regular_hash_rw)); + if (!om.obj_tab) { + hvfs_err(root, "OBJECT hash table allocation failed\n"); + err = -ENOMEM; + goto out; + } + for (i = 0; i < om.conf.obj_hsize; i++) { + INIT_HLIST_HEAD(&om.obj_tab[i].h); + xrwlock_init(&om.obj_tab[i].lock); + } + om.osd_tab = xzalloc(om.conf.osd_hsize * sizeof(struct regular_hash_rw)); + if (!om.osd_tab) { + hvfs_err(root, "OSD hash table allocation failed\n"); + err = -ENOMEM; + goto out; + } + for (i = 0; i < om.conf.osd_hsize; i++) { + INIT_HLIST_HEAD(&om.osd_tab[i].h); + xrwlock_init(&om.osd_tab[i].lock); + } + + /* init the om thread */ + err = pthread_attr_init(&attr); + if (err) { + hvfs_err(root, "init pthread attr failed w/ %d\n", err); + goto out; + } + err = pthread_attr_setstacksize(&attr, (1 << 20)); + if (err) { + hvfs_err(root, "set thread stack size to 1MB failed w/ %d\n", err); + goto out; + } + err = pthread_create(&om.om_thread, &attr, &om_main, NULL); + if (err) { + hvfs_err(root, "init OM thread failed w/ %d (%s)\n", + err, strerror(err)); + goto out; + } + +out: + return err; +} + +void om_destroy(void) +{ + om.om_thread_stop = 1; + sem_post(&om.qsem); + pthread_join(om.om_thread, NULL); + sem_destroy(&om.qsem); +} diff --git a/r2/profile.c b/r2/profile.c index 2de9c10..65826ab 100644 --- a/r2/profile.c +++ b/r2/profile.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
/* Fill in the counter names of the OSD profile.
 *
 * Each HVFS_PROFILE_NAME_ADDIN() call registers one name (presumably storing
 * it in hp->hpe[i] and advancing i -- confirm against the macro); hp->nr is
 * finally set to i. The names are later emitted in this order as the column
 * header of the ".osd" profiling file.
 */
void hvfs_osd_profile_setup(struct hvfs_profile_ex *hp)
{
    int i = 0;

    HVFS_PROFILE_NAME_ADDIN(hp, i, "timestamp");
    /* object I/O issued by clients */
    HVFS_PROFILE_NAME_ADDIN(hp, i, "client.objrnr");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "client.objwnr");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "client.objrbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "client.objwbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "ring.update");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "ring.size");
    /* object I/O issued by MDSL sites */
    HVFS_PROFILE_NAME_ADDIN(hp, i, "mdsl.objrnr");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "mdsl.objwnr");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "mdsl.objrbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "mdsl.objwbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "misc.reqin_total");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "misc.reqin_handle");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "xnet.msg_alloc");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "xnet.msg_free");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "xnet.inbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "xnet.outbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "xnet.active_links");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "storage.wbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "storage.rbytes");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "storage.wreq");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "storage.rreq");
    HVFS_PROFILE_NAME_ADDIN(hp, i, "storage.cpbytes");
    hp->nr = i;
}
&hro.hp_osd; + memset(hp, 0, sizeof(*hp)); + hvfs_osd_profile_setup(hp); + memset(fname, 0, sizeof(fname)); + snprintf(fname, 255, "%s.osd", hro.conf.profiling_file); + fp = fopen(fname, "w+"); + if (!fp) { + hvfs_err(xnet, "fopen() profiling file %s faield %d\n", + fname, errno); + return -EINVAL; + } + len = fwrite("## ##\n", 1, 6, fp); + if (len < 6) { + hvfs_err(xnet, "fwrite() profiling file %s failed %d\n", + fname, errno); + return -errno; + } + memset(data, 0, 4096); + len = sprintf(data, "@HVFS OSD PLOT DATA FILE :)\nlocal_ts"); + for (i = 0; i < hp->nr; i++) { + len += sprintf(data + len, " %s", hp->hpe[i].name); + } + len += sprintf(data + len, "\n"); + if (fwrite(data, 1, len, fp) < len) { + hvfs_err(xnet, "fwrite() profiling file %s failed %d\n", + fname, errno); + return -errno; + } + fflush(fp); + hro.hp_osd.fp = fp; + /* FIXME: Setup up bp profile */ /* FIXME: Setup up client profile */ @@ -285,6 +352,26 @@ int root_profile_update_mdsl(struct hvfs_profile *hp, return err; } +int root_profile_update_osd(struct hvfs_profile *hp, + struct xnet_msg *msg) +{ + int err = 0, i; + + if (hp->nr != hro.hp_osd.nr) { + hvfs_err(xnet, "Invalid OSD request from %lx, nr mismatch " + "%d vs %d\n", + msg->tx.ssite_id, hp->nr, hro.hp_osd.nr); + goto out; + } + + for (i = 0; i < hp->nr; i++) { + HVFS_PROFILE_VALUE_UPDATE(&hro.hp_osd, hp, i); + } + +out: + return err; +} + int root_profile_update_bp(struct hvfs_profile *hp, struct xnet_msg *msg) { @@ -338,6 +425,19 @@ void root_profile_flush(time_t cur) } fflush(hro.hp_mdsl.fp); + /* flush osd profile */ + memset(data, 0, sizeof(data)); + len = sprintf(data, "%ld", cur); + for (i = 0; i < hro.hp_osd.nr; i++) { + len += sprintf(data + len, " %ld", hro.hp_osd.hpe[i].value); + } + len += sprintf(data + len, "\n"); + if (fwrite(data, 1, len, hro.hp_osd.fp) < len) { + hvfs_err(xnet, "fwrite() profiling file OSD failed %d\n", + errno); + } + fflush(hro.hp_osd.fp); + /* FIXME: flush bp profile */ /* FIXME: flush client 
profile */ } @@ -393,7 +493,7 @@ int root_info_mdsl(u64 arg, void **buf) for (i = 0; i < hro.hp_mdsl.nr; i++) { p += sprintf(p, " -> %20s\t\t%ld\n", hro.hp_mdsl.hpe[i].name, hro.hp_mdsl.hpe[i].value); - } + } break; default: case HVFS_SYSINFO_MDSL_RATE: @@ -405,3 +505,58 @@ int root_info_mdsl(u64 arg, void **buf) out: return err; } + +int root_info_osd(u64 arg, void **buf) +{ + char *p; + int err = 0, i; + + p = xzalloc(4096 << 2); + if (!p) { + hvfs_err(root, "xzalloc() info osd buffer failed\n"); + err = -ENOMEM; + goto out; + } + *buf = (void *)p; + + switch (arg) { + default: + case HVFS_SYSINFO_OSD_RAW: + p += sprintf(p, "OSD RAW:\n"); + for (i = 0; i < hro.hp_osd.nr; i++) { + p += sprintf(p, " -> %20s\t\t%ld\n", hro.hp_osd.hpe[i].name, + hro.hp_osd.hpe[i].value); + } + break; + } + +out: + return err; +} + +int root_info_root(u64 arg, void **buf) +{ + char *p; + int err = 0; + + p = xzalloc(4096 << 2); + if (!p) { + hvfs_err(root, "xzalloc() info osd buffer failed\n"); + err = -ENOMEM; + goto out; + } + *buf = (void *)p; + + p += sprintf(p, " -> %20s\t\t%ld\n", "misc.reqin_total", + atomic64_read(&hro.prof.misc.reqin_total)); + p += sprintf(p, " -> %20s\t\t%ld\n", "misc.reqin_handle", + atomic64_read(&hro.prof.misc.reqin_handle)); + p += sprintf(p, " -> %20s\t\t%ld\n", "osd.objrep_recved", + atomic64_read(&hro.prof.osd.objrep_recved)); + p += sprintf(p, " -> %20s\t\t%ld\n", "osd.objrep_handled", + atomic64_read(&hro.prof.osd.objrep_handled)); + + +out: + return err; +} diff --git a/r2/root.c b/r2/root.c index f8137db..8c05bec 100644 --- a/r2/root.c +++ b/r2/root.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-22 17:51:22 macan> + * Time-stamp: <2012-08-13 09:11:03 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -477,6 +477,11 @@ int root_init(void) if (err) goto out_profile; + /* setup the object manager */ + err = om_init(HVFS_OM_TYPE_MASTER); + if (err) + goto out_om; + /* maks the SIGUSR1 signal for main thread */ { sigset_t set; @@ -490,6 +495,7 @@ int root_init(void) hro.state = HRO_STATE_RUNNING; hro.uptime = time(NULL); +out_om: out_profile: out_timers: out_spool: @@ -509,6 +515,7 @@ void root_destroy(void) hvfs_verbose(root, "OK, stop it now ...\n"); /* free something */ + om_destroy(); /* destroy the service thread pool */ root_spool_destroy(); diff --git a/r2/root.h b/r2/root.h index 15c678f..96673d9 100644 --- a/r2/root.h +++ b/r2/root.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2012-05-22 17:52:13 macan> + * Time-stamp: <2012-08-13 09:49:07 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -31,6 +31,7 @@ #include "mgr.h" #include "xnet.h" #include "profile.h" +#include "obj.h" struct root_conf { @@ -137,7 +138,7 @@ struct hvfs_root_object pthread_t timer_thread; /* profile section */ - struct hvfs_profile_ex hp_mds, hp_mdsl, hp_bp, hp_client; + struct hvfs_profile_ex hp_mds, hp_mdsl, hp_osd, hp_bp, hp_client; /* uptime */ time_t uptime; @@ -158,6 +159,8 @@ struct hvfs_sys_info #define HVFS_SYSINFO_SITE 1 #define HVFS_SYSINFO_MDS 2 #define HVFS_SYSINFO_MDSL 3 +#define HVFS_SYSINFO_OSD 4 +#define HVFS_SYSINFO_ROOT 5 #define HVFS_SYSINFO_ALL 100 u32 cmd; @@ -168,6 +171,7 @@ struct hvfs_sys_info #define HVFS_SYSINFO_SITE_CLIENT 3 #define HVFS_SYSINFO_SITE_BP 4 #define HVFS_SYSINFO_SITE_R2 5 +#define HVFS_SYSINFO_SITE_OSD 6 #define HVFS_SYSINFO_SITE_MASK 0x0f @@ -176,6 +180,9 @@ struct hvfs_sys_info #define 
HVFS_SYSINFO_MDSL_RATE 0 #define HVFS_SYSINFO_MDSL_RAW 1 + +#define HVFS_SYSINFO_OSD_RATE 0 +#define HVFS_SYSINFO_OSD_RAW 1 }; /* API Region */ @@ -234,8 +241,22 @@ int root_profile_update_bp(struct hvfs_profile *, struct xnet_msg *); int root_profile_update_client(struct hvfs_profile *, struct xnet_msg *); +int root_profile_update_osd(struct hvfs_profile *, + struct xnet_msg *); int root_setup_profile(void); void root_profile_flush(time_t); int root_info_mds(u64, void **); +int root_info_mdsl(u64, void **); +int root_info_root(u64, void **); + +/* om.c */ +struct osd_list *om_query_obj(struct objid); +int om_dispatch_objrep(struct xnet_msg *); +int om_init(u32); +void om_destroy(void); + +/* x2r.c */ +int root_do_objrep(struct xnet_msg *); +int root_do_query_obj(struct xnet_msg *); #endif diff --git a/r2/rprof.h b/r2/rprof.h index 310b17e..beffdfb 100644 --- a/r2/rprof.h +++ b/r2/rprof.h @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2010-05-07 16:46:26 macan> + * Time-stamp: <2012-08-10 10:42:54 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -46,6 +46,12 @@ struct root_misc_prof atomic64_t reqin_handle; /* # of handled requests */ }; +struct root_osd_prof +{ + atomic64_t objrep_recved; /* # of recved object reports */ + atomic64_t objrep_handled; /* # of handled object reports */ +}; + struct root_storage_prof { }; @@ -59,6 +65,7 @@ struct root_prof struct root_mdsl_prof mdsl; struct root_misc_prof misc; struct root_storage_prof storage; + struct root_osd_prof osd; struct xnet_prof *xnet; }; diff --git a/r2/x2r.c b/r2/x2r.c index 14fc0e4..3fd9166 100644 --- a/r2/x2r.c +++ b/r2/x2r.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-22 17:56:20 macan> + * Time-stamp: <2012-08-10 15:18:59 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,6 +34,7 @@ int __prepare_xnet_msg(struct xnet_msg *msg, struct xnet_msg **orpy) rpy = xnet_alloc_msg(XNET_MSG_NORMAL); if (!rpy) { hvfs_err(root, "xnet_alloc_msg() reply failed.\n"); + *orpy = NULL; return -ENOMEM; } #ifdef XNET_EAGER_WRITEV @@ -1302,6 +1303,8 @@ int root_do_profile(struct xnet_msg *msg) err = root_profile_update_bp(hp, msg); } else if (HVFS_IS_CLIENT(msg->tx.ssite_id)) { err = root_profile_update_client(hp, msg); + } else if (HVFS_IS_OSD(msg->tx.ssite_id)) { + err = root_profile_update_osd(hp, msg); } else { hvfs_err(root, "Invalid source site(%lx), type is ??\n", msg->tx.ssite_id); @@ -1354,10 +1357,17 @@ int root_do_info(struct xnet_msg *msg) break; case HVFS_SYSINFO_ALL: /* stick a uptime buffer */ - snprintf(tbuf, 32, "ROOT Server Uptime %ds\n", - time(NULL) - hro.uptime); + snprintf(tbuf, 32, "ROOT Server Uptime %lds\n", + (long)(time(NULL) - hro.uptime)); xnet_msg_add_sdata(rpy, tbuf, strlen(tbuf)); /* fall through */ + case HVFS_SYSINFO_ROOT: + err = root_info_root(hsi->arg0, &buf); + if (!err && buf) { + xnet_msg_add_sdata(rpy, buf, strlen(buf)); + } + if (hsi->cmd == HVFS_SYSINFO_ROOT) + break; case HVFS_SYSINFO_SITE: err = root_info_site(hsi->arg0, &buf); if (!err && buf) { @@ -1391,3 +1401,55 @@ int root_do_info(struct xnet_msg *msg) return err; } + +/* do_objrep recv the object report from osd site and move it to om.queue + * + * DO NOT TOUCH the message + */ +int root_do_objrep(struct xnet_msg *msg) +{ + return om_dispatch_objrep(msg); +} + +/* do_query_obj recv the cmd from a site and response with the corresponding + * osd list. 
+ * + * ABI: xmdata saves objid structure + * + * Return format: | tx.err: ?(-ENOENT) + */ +int root_do_query_obj(struct xnet_msg *msg) +{ + struct objid *id; + struct xnet_msg *rpy = NULL; + struct osd_list *ol; + int err = 0; + + if (msg->tx.len < sizeof(*id) || + !msg->xm_datacheck) { + hvfs_err(root, "Invalid query obj request from site %lx\n", + msg->tx.ssite_id); + err = -EINVAL; + goto out; + } + + err = __prepare_xnet_msg(msg, &rpy); + if (err) { + goto out; + } + + id = msg->xm_data; + + ol = om_query_obj(*id); + if (!IS_ERR(ol)) { + xnet_msg_add_sdata(rpy, ol, sizeof(*ol) + ol->size * sizeof(u64)); + } else + err = PTR_ERR(ol); + + __root_send_rpy(rpy, err); + +out: + xnet_free_msg(msg); + + return err; +} diff --git a/test/bdb/Makefile b/test/bdb/Makefile index a8a131a..6a49fde 100644 --- a/test/bdb/Makefile +++ b/test/bdb/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-04-06 16:33:25 macan> +# Time-stamp: <2012-08-10 14:00:43 macan> # # This is the makefile for HVFS project. # @@ -20,10 +20,10 @@ BDB_INC_PATH = $(BDB_MAIN)/include BDB_LIB_PATH = $(BDB_MAIN)/lib all : bdb_test - @echo "BDB test targets are ready." + @$(ECHO) "BDB test targets are ready." bdb_test : $(BDB_TEST) - @echo -e " " CC"\t" bdb_test.c + @$(ECHO) -e " " CC"\t" bdb_test.c @gcc bdb_test.c -o bdb_test.ut -I$(BDB_INC_PATH) -L$(BDB_LIB_PATH) -ldb clean : diff --git a/test/fuse/Makefile b/test/fuse/Makefile index dbfd4a3..31fd38e 100644 --- a/test/fuse/Makefile +++ b/test/fuse/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-12-29 15:39:17 macan> +# Time-stamp: <2012-08-10 14:01:14 macan> # # This is the makefile for HVFS project. 
# @@ -13,11 +13,11 @@ include ../../Makefile.inc all : $(TEST_FUSE_SOURCE:.c=.ut) %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $< -DUNIT_TEST -c %.ut : %.o $(XNET_LIB) $(MDSL_LIB) $(HVFS_LIB) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -o $@ $< -L$(LIB_PATH) -lhvfs -L$(BRANCH) \ -lbranch $(LFLAGS) -L$(XNET) -lxnet diff --git a/test/fuse/dbsearch.c b/test/fuse/dbsearch.c index b358bb5..cd8686b 100644 --- a/test/fuse/dbsearch.c +++ b/test/fuse/dbsearch.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2011-12-30 16:11:31 macan> + * Time-stamp: <2012-08-16 14:09:24 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +32,7 @@ #include #include #include +#ifdef USE_BDB #include "db.h" static char *hvfs_home = "/mnt/hvfs/testC"; @@ -401,7 +402,8 @@ void __hvfs_bench_fina_bdb(void) printf("Close the environment failed w %d\n", err); } sprintf(cmd, "rm -rf %s/*", hvfs_home); - system(cmd); + if (system(cmd) < 0) + printf("system(%s) failed!\n", cmd); } int __hvfs_bdb_set(int nr) @@ -568,7 +570,8 @@ int main(int argc, char *argv[]) fflush(stdout); { char key; - scanf("%c", &key); + if (scanf("%c", &key) <= 0) + return 0; } gettimeofday(&begin, NULL); @@ -626,3 +629,11 @@ int main(int argc, char *argv[]) return 0; } +#else +int main(int argc, char *argv[]) +{ + printf("Please enable BDB support!\n"); + + return 0; +} +#endif diff --git a/test/fuse/xattr.c b/test/fuse/xattr.c index b12d21b..831c149 100644 --- a/test/fuse/xattr.c +++ b/test/fuse/xattr.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-10-04 06:51:40 macan> + * Time-stamp: <2012-08-10 15:09:38 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +32,7 @@ #include #define HVFS_MDU_IF_LARGE 0x04000000 /* large file */ +#define __UNUSED__ __attribute__((unused)) struct base_dbs { @@ -85,9 +86,9 @@ int __native_optest(int argc, char *argv[]) { char buf[4096]; char *p = NULL, *s = NULL; - unsigned long itbid; + unsigned long __UNUSED__ itbid; ssize_t len; - off_t offset; + off_t __UNUSED__ offset; int err = 0; err = open("./xattr.native", O_CREAT, S_IRUSR | S_IWUSR); @@ -151,7 +152,12 @@ int __native_optest(int argc, char *argv[]) int __native_umftest(int argc, char *argv[]) { - char buf[4096]; + union __abc__ + { + char buf[4096]; + unsigned int ibuf[4096/sizeof(unsigned int)]; + } abc; + int err = 0; err = mkdir("./xattr.umf", 0777); @@ -162,8 +168,8 @@ int __native_umftest(int argc, char *argv[]) } /* setxattr */ - sprintf(buf, "pfs.native.0.umf.set.%d", HVFS_MDU_IF_LARGE); - err = setxattr("./xattr.umf", buf, NULL, 0, 0); + sprintf(abc.buf, "pfs.native.0.umf.set.%d", HVFS_MDU_IF_LARGE); + err = setxattr("./xattr.umf", abc.buf, NULL, 0, 0); if (err) { perror("setxattr('./xattr.umf'):"); err = errno; @@ -171,14 +177,14 @@ int __native_umftest(int argc, char *argv[]) } /* check the flag value */ - sprintf(buf, "pfs.native.0.umf.cat"); - err = getxattr("./xattr.umf", buf, buf, sizeof(buf)); + sprintf(abc.buf, "pfs.native.0.umf.cat"); + err = getxattr("./xattr.umf", abc.buf, abc.buf, sizeof(abc.buf)); if (err < 0) { perror("getxattr('./xattr.umf'):"); err = errno; goto out_rmdir; } - if (*(unsigned int *)buf & HVFS_MDU_IF_LARGE) { + if (abc.ibuf[0] & HVFS_MDU_IF_LARGE) { printf("OK to cat the LARGE flag.\n"); } else { printf("BAD, failed with LARGE flag.\n"); diff --git a/test/mds/Makefile b/test/mds/Makefile index a74cd3f..53ef754 100644 --- a/test/mds/Makefile +++ 
b/test/mds/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-11-06 22:00:25 macan> +# Time-stamp: <2012-08-10 13:54:36 macan> # # This is the makefile for HVFS project. # @@ -15,11 +15,11 @@ TEST_MDS_DEPSRC = $(XNET)/xnet.c all : $(TEST_MDS_SOURCE:.c=.ut) %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $< -DUNIT_TEST -c %.ut : %.o $(MDS_LIB) $(HVFS_LIB) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $(TEST_MDS_DEPSRC) -o $@ $< -L$(MDS) -lmds \ -L$(LIB_PATH) -lhvfs $(LFLAGS) diff --git a/test/mds/perr.c b/test/mds/perr.c new file mode 100644 index 0000000..202f1f9 --- /dev/null +++ b/test/mds/perr.c @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2009 Ma Can + * + * + * Armed with EMACS. + * Time-stamp: <2012-08-10 10:35:17 macan> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "hvfs.h" +#include "xtable.h" +#include "tx.h" +#include "xnet.h" +#include "mds.h" +#include "lib.h" + +int main(int argc, char *argv[]) +{ + int err = 0, i; + + if (argc <= 1) { + hvfs_info(mds, "Usage: %s [errno] [errno] ...\n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) { + err = atoi(argv[i]); + if (err < 0) + err = -err; + hvfs_info(mds, "errno %d : %s\n", err, strerror(err)); + } + + return 0; +} diff --git a/test/mdsl/Makefile b/test/mdsl/Makefile index a6f78cf..84ec532 100644 --- a/test/mdsl/Makefile +++ b/test/mdsl/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-11-06 22:01:38 macan> +# Time-stamp: <2012-08-10 14:01:38 macan> # # This is the makefile for HVFS project. # @@ -13,11 +13,11 @@ include ../../Makefile.inc all : $(TEST_MDSL_SOURCE:.c=.ut) %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $< -DUNIT_TEST -c %.ut : %.o $(XNET_LIB) $(MDSL_LIB) $(HVFS_LIB) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -o $@ $< -L$(MDSL) -lmdsl -L$(XNET) -lxnet\ -L$(LIB_PATH) -lhvfs $(LFLAGS) diff --git a/test/mdsl/bulktest.c b/test/mdsl/bulktest.c index 16266c9..9500174 100644 --- a/test/mdsl/bulktest.c +++ b/test/mdsl/bulktest.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2012-03-03 12:35:27 macan> + * Time-stamp: <2012-08-10 17:14:21 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -174,7 +174,10 @@ int main(int argc, char *argv[]) mdsl_verify(); /* drop the cache now */ - system("echo 3 > /proc/sys/vm/drop_caches"); + if (system("echo 3 > /proc/sys/vm/drop_caches") < 0) { + hvfs_err(mdsl, "drop cache failed! 
%s\n", strerror(errno)); + return errno; + } err = __test_bulk_load(duuid, column, &begin, &end); if (err) { @@ -187,7 +190,10 @@ int main(int argc, char *argv[]) (end.tv_usec - begin.tv_usec))); /* drop the cache now */ - system("echo 3 > /proc/sys/vm/drop_caches"); + if (system("echo 3 > /proc/sys/vm/drop_caches") < 0) { + hvfs_err(mdsl, "drop cache failed! %s\n", strerror(errno)); + return errno; + } err = __test_rand_load(duuid, column, &begin, &end); if (err) { diff --git a/test/python/client.py b/test/python/client.py index 361bce1..5121907 100644 --- a/test/python/client.py +++ b/test/python/client.py @@ -3,7 +3,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2011-10-10 04:57:43 macan> +# Time-stamp: <2012-08-07 18:20:59 macan> # # Armed with EMACS. # @@ -1345,7 +1345,7 @@ def do_analysestorage(self, line): site = 0 type = 1 max = 0 - len = 0 + length = 8 l = shlex.split(line) try: @@ -1371,8 +1371,12 @@ def do_analysestorage(self, line): site = HVFS_MDS(site) print "Analyse TXG log file for site: %x" % site + c_data = c_void_p(max) + c_len = c_long(length) err = api.hvfs_analyse_storage(c_long(site), c_int(type), - byref(max), byref(len)) + byref(c_data), byref(c_len)) + if type == 1: + max = c_data.value if err != 0: print "api.hvfs_analyse_storage() failed w/ %d" % err return diff --git a/test/result/buglist.txt b/test/result/buglist.txt new file mode 100644 index 0000000..b6440c1 --- /dev/null +++ b/test/result/buglist.txt @@ -0,0 +1,27 @@ +1. 
Under high memory usage (w/o memlimit) + +*** glibc detected *** /home/macan/Work/Pomegranate/test/xnet/mds.ut: double free or corruption (fasttop): 0x00007f55d8d56fa0 *** +======= Backtrace: ========= +/lib/x86_64-linux-gnu/libc.so.6(+0x7e626)[0x7f561f12e626] +/home/macan/Work/Pomegranate/test/xnet/mds.ut(__au_req_handle+0x98)[0x45c068] +/home/macan/Work/Pomegranate/test/xnet/mds.ut(async_update+0xc2)[0x45c222] +/lib/x86_64-linux-gnu/libpthread.so.0(+0x7e9a)[0x7f561f474e9a] +/lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x7f561f1a24bd] + +2. Under high memory usage (w/ memlimit) + +[ERR ] HVFS (/home/macan/Work/Pomegranate/mds/itb.c, 1238): get_free_itb[7f24349cb700]: Pause modify operations @ Fri Aug 17 10:01:27 2012 +[INFO] Entering new txg 3 (mp forced) +[INFO] OK, we reset the itimer to 125000 us. +[WARN] HVFS (/home/macan/Work/Pomegranate/mds/txg.c, 1229): txg_ddht_compact[7f24359d3700]: In txg 2 compact 0 dir delta(s) +[WARN] HVFS (/home/macan/Work/Pomegranate/mds/mds.c, 499): mds_cbht_evict_default[7f242e7fc700]: DO evict on clean ITB 0 txg 1 +[ERR ] HVFS (/home/macan/Work/Pomegranate/mds/itb.c, 1238): get_free_itb[7f242ffff700]: Pause modify operations @ Fri Aug 17 10:01:27 2012 +[INFO] Recv ^[[0;40;31mSIGSEGV/SIGBUS/SIGABRT^[[0m address not mapped to object @ addr 0x7f242c06d030 +[INFO] /home/macan/Work/Pomegranate/test/xnet/mds.ut(lib_segv+0x17) [0x4a7757] +[INFO] /home/macan/Work/Pomegranate/test/xnet/mds.ut(mds_sigaction_default+0x46) [0x448196] +[INFO] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7f24363a3cb0] +[INFO] /lib/x86_64-linux-gnu/libpthread.so.0(pthread_rwlock_unlock+0x7) [0x7f243639f8a7] +[INFO] /home/macan/Work/Pomegranate/test/xnet/mds.ut(txg_commit+0x3ab) [0x44ae5b] +[INFO] /lib/x86_64-linux-gnu/libpthread.so.0(+0x7e9a) [0x7f243639be9a] +[INFO] /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d) [0x7f24360c94bd] +[INFO] SIGSEGV info: signo 11 errno 0 code 1 diff --git a/test/xnet/Makefile b/test/xnet/Makefile index ae283fa..b5d458f 100644 --- 
a/test/xnet/Makefile +++ b/test/xnet/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2012-05-18 12:42:31 macan> +# Time-stamp: <2012-08-10 13:54:21 macan> # # This is the makefile for HVFS project. # @@ -12,24 +12,24 @@ include ../../Makefile.inc ifdef USE_FUSE DEPEND_LIBS = $(XNET_LIB) $(MDS_LIB) $(HVFS_LIB) $(MDSL_LIB) $(R2_LIB) \ - $(API_LIB) $(FUSE_LIB) $(BRANCH_LIB) + $(API_LIB) $(FUSE_LIB) $(BRANCH_LIB) $(OSD_LIB) EX_LFLAGS = -lpfuse else DEPEND_LIBS = $(XNET_LIB) $(MDS_LIB) $(HVFS_LIB) $(MDSL_LIB) $(R2_LIB) \ - $(API_LIB) $(BRANCH_LIB) + $(API_LIB) $(BRANCH_LIB) $(OSD_LIB) endif all : $(TEST_XNET_SOURCE:.c=.ut) %.o : %.c $(mds_h_depend_files) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) $< -DUSE_XNET_SIMPLE -DUNIT_TEST -c %.ut : %.o $(DEPEND_LIBS) - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -o $@ $< -L$(API) -lapi \ -L$(XNET) -lxnet -L$(MDS) -lmds \ - -L$(R2) -lr2 -L$(MDSL) -lmdsl \ + -L$(R2) -lr2 -L$(MDSL) -lmdsl -L$(OSD) -losd \ -L$(FUSE) $(EX_LFLAGS) -L$(API) -lapi -L$(R2) -lr2 \ -L$(BRANCH) -lbranch $(LFLAGS) -L$(LIB_PATH) -lhvfs $(LFLAGS) diff --git a/test/xnet/ausplit.c b/test/xnet/ausplit.c index f885233..5ad809b 100644 --- a/test/xnet/ausplit.c +++ b/test/xnet/ausplit.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2010-11-02 18:17:54 macan> + * Time-stamp: <2012-08-10 13:44:58 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -810,9 +810,8 @@ int main(int argc, char *argv[]) }; int err = 0; int type = 0; - int self, sport, i, j, thread; + int self, sport, i, j, thread, op; long entry; - int op, memonly, memlimit; char *value; char profiling_fname[256]; @@ -840,16 +839,6 @@ int main(int argc, char *argv[]) } else { thread = 1; } - value = getenv("memonly"); - if (value) { - memonly = atoi(value); - } else - memonly = 1; - value = getenv("memlimit"); - if (value) { - memlimit = atoi(value); - } else - memlimit = 0; pthread_barrier_init(&barrier, NULL, thread); diff --git a/test/xnet/m2m.c b/test/xnet/m2m.c index 05feb0d..191ae6f 100644 --- a/test/xnet/m2m.c +++ b/test/xnet/m2m.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2010-11-02 18:17:38 macan> + * Time-stamp: <2012-08-10 13:42:52 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -716,7 +716,7 @@ int main(int argc, char *argv[]) dh_remove(hmi.gdt_uuid); st_destroy(); mds_destroy(); - return 0; + return err; } #endif diff --git a/test/xnet/mds.c b/test/xnet/mds.c index e55e828..26b8f59 100644 --- a/test/xnet/mds.c +++ b/test/xnet/mds.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-18 11:59:35 macan> + * Time-stamp: <2012-08-10 13:43:47 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1102,7 +1102,7 @@ int main(int argc, char *argv[]) }; int err = 0; int self, sport = -1, i, j; - int memonly, memlimit, mode, plot_method; + int mode, plot_method; char *value; char *ring_ip = NULL; char profiling_fname[256], *log_home; @@ -1124,18 +1124,6 @@ int main(int argc, char *argv[]) ring_ip = argv[2]; } - value = getenv("memonly"); - if (value) { - memonly = atoi(value); - } else - memonly = 1; - - value = getenv("memlimit"); - if (value) { - memlimit = atoi(value); - } else - memlimit = 0; - value = getenv("mode"); if (value) { mode = atoi(value); diff --git a/test/xnet/mdsl.c b/test/xnet/mdsl.c index 3f572df..b7c93f9 100644 --- a/test/xnet/mdsl.c +++ b/test/xnet/mdsl.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. - * Time-stamp: <2012-05-18 11:59:46 macan> + * Time-stamp: <2012-08-07 17:15:40 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -568,6 +568,9 @@ int main(int argc, char *argv[]) mdsl_pre_init(); hmo.conf.prof_plot = plot_method; mdsl_config(); + + /* BUG-XXXX: we have set the site_id BEFORE mdsl_init() */ + hmo.site_id = HVFS_MDSL(self); err = mdsl_init(); if (err) { hvfs_err(xnet, "mdsl_init() failed %d\n", err); diff --git a/test/xnet/osd.c b/test/xnet/osd.c new file mode 100644 index 0000000..78d20fe --- /dev/null +++ b/test/xnet/osd.c @@ -0,0 +1,543 @@ +/** + * Copyright (c) 2009 Ma Can + * + * + * Armed with EMACS. 
+ * Time-stamp: <2012-08-07 17:25:02 macan> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "hvfs.h" +#include "xnet.h" +#include "osd.h" +#include "lib.h" +#include "ring.h" +#include "root.h" + +#ifdef UNIT_TEST +#define TYPE_MDS 0 +#define TYPE_CLIENT 1 +#define TYPE_MDSL 2 +#define TYPE_RING 3 +#define TYPE_OSD 4 + +u64 fsid = 0; + +char *ipaddr[] = { + "127.0.0.1", /* mds */ + "127.0.0.1", /* client */ + "127.0.0.1", /* mdsl */ + "127.0.0.1", /* ring */ + "127.0.0.1", /* osd */ +}; + +short port[5][5] = { + {8210, 8211, 8212, 8213, 8214,}, /* mds */ + {8412, 8413, 8414, 8415,}, /* client */ + {8810, 8811, 8812, 8813,}, /* mdsl */ + {8710, 8711, 8712, 8713,}, /* ring */ + {9200,}, /* osd */ +}; + +#define HVFS_TYPE(type, idx) ({ \ + u64 __sid = -1UL; \ + switch (type){ \ + case TYPE_MDS: \ + __sid = HVFS_MDS(idx); \ + break; \ + case TYPE_CLIENT: \ + __sid = HVFS_CLIENT(idx); \ + break; \ + case TYPE_MDSL: \ + __sid = HVFS_MDSL(idx); \ + break; \ + case TYPE_RING: \ + __sid = HVFS_RING(idx); \ + break; \ + case TYPE_OSD: \ + __sid = HVFS_OSD(idx); \ + break; \ + default:; \ + } \ + __sid; \ + }) + +static inline +u64 HVFS_TYPE_SEL(int type, int id) +{ + u64 site_id = -1UL; + + switch (type) { + case TYPE_MDS: + site_id = HVFS_MDS(id); + break; + case TYPE_CLIENT: + site_id = 
HVFS_CLIENT(id); + break; + case TYPE_MDSL: + site_id = HVFS_MDSL(id); + break; + case TYPE_RING: + site_id = HVFS_RING(id); + break; + case TYPE_OSD: + site_id = HVFS_OSD(id); + break; + default:; + } + + return site_id; +} + +int msg_wait() +{ + while (1) { + xnet_wait_any(hoo.xc); + } + return 0; +} + +struct chring *chring_tx_to_chring(struct chring_tx *ct) +{ + return (struct chring *)1; /* just reutrn a non-NULL value */ +} + +/* r2cli_do_reg() + * + * @gid: already right shift 2 bits + */ +int r2cli_do_reg(u64 request_site, u64 root_site, u64 fsid, u32 gid) +{ + struct xnet_msg *msg; + int err = 0; + + /* alloc one msg and send it to the peer site */ + msg = xnet_alloc_msg(XNET_MSG_NORMAL); + if (!msg) { + hvfs_err(xnet, "xnet_alloc_msg() failed\n"); + err = -ENOMEM; + goto out_nofree; + } + + xnet_msg_fill_tx(msg, XNET_MSG_REQ, XNET_NEED_REPLY, + hoo.xc->site_id, root_site); + xnet_msg_fill_cmd(msg, HVFS_R2_REG, request_site, fsid); +#ifdef XNET_EAGER_WRITEV + xnet_msg_add_sdata(msg, &msg->tx, sizeof(msg->tx)); +#endif + + /* send the reg request to root_site w/ requested siteid = request_site */ + msg->tx.reserved = gid; + +resend: + err = xnet_send(hoo.xc, msg); + if (err) { + hvfs_err(xnet, "xnet_send() failed\n"); + goto out; + } + + /* Reply ABI: + * @tx.arg0: network magic + */ + + /* this means we have got the reply, parse it! 
*/ + ASSERT(msg->pair, xnet); + if (msg->pair->tx.err == -ERECOVER) { + hvfs_err(xnet, "R2 notify a client recover process on site " + "%lx, do it.\n", request_site); + } else if (msg->pair->tx.err == -EHWAIT) { + hvfs_err(xnet, "R2 reply that another instance is still alive, " + "wait a moment and retry.\n"); + xnet_free_msg(msg->pair); + msg->pair = NULL; + goto resend; + } else if (msg->pair->tx.err) { + hvfs_err(xnet, "Reg site %lx failed w/ %d\n", request_site, + msg->pair->tx.err); + err = msg->pair->tx.err; + goto out; + } + + /* parse the register reply message */ + hvfs_info(xnet, "Begin parse the reg reply message\n"); + if (msg->pair->xm_datacheck) { + void *data = msg->pair->xm_data; + void *bitmap; + union hvfs_x_info *hxi; + struct chring_tx *ct; + struct root_tx *rt; + struct hvfs_site_tx *hst; + + /* parse hxi */ + err = bparse_hxi(data, &hxi); + if (err < 0) { + hvfs_err(root, "bparse_hxi failed w/ %d\n", err); + goto out; + } + memcpy(&hoi, hxi, sizeof(hoi)); + data += err; + /* parse ring */ + err = bparse_ring(data, &ct); + if (err < 0) { + hvfs_err(root, "bparse_ring failed w/ %d\n", err); + goto out; + } + + if (!chring_tx_to_chring(ct)) { + hvfs_err(root, "chring_tx 2 chring failed w/ %d\n", err); + goto out; + } + data += err; + err = bparse_ring(data, &ct); + if (err < 0) { + hvfs_err(root, "bparse_ring failed w/ %d\n", err); + goto out; + } + + if (!chring_tx_to_chring(ct)) { + hvfs_err(root, "chring_tx 2 chring failed w/ %d\n", err); + goto out; + } + data += err; + err = bparse_ring(data, &ct); + if (err < 0) { + hvfs_err(root, "bparse_ring failed w/ %d\n", err); + goto out; + } + if (!chring_tx_to_chring(ct)) { + hvfs_err(root, "chring_tx 2 chring failed w/ %d\n", err); + goto out; + } + data += err; + /* parse root_tx */ + err = bparse_root(data, &rt); + if (err < 0) { + hvfs_err(root, "bparse root failed w/ %d\n", err); + goto out; + } + data += err; + hvfs_info(root, "fsid %ld gdt_uuid %ld gdt_salt %lx " + "root_uuid %ld root_salt 
%lx\n", + rt->fsid, rt->gdt_uuid, rt->gdt_salt, + rt->root_uuid, rt->root_salt); + /* FIXME: we do not need to insert the dh entry */ + /* dh_insert(hoi.gdt_uuid, hoi.gdt_uuid, hoi.gdt_salt); */ + + /* parse bitmap */ + err = bparse_bitmap(data, &bitmap); + if (err < 0) { + hvfs_err(root, "bparse bitmap failed w/ %d\n", err); + goto out; + } + data += err; + /* FIXME: we do not need to insert the bitmap! */ + /* bitmap_insert2(hoi.gdt_uuid, 0, bitmap, err - sizeof(u32)); */ + + /* parse addr */ + err = bparse_addr(data, &hst); + if (err < 0) { + hvfs_err(root, "bparse addr failed w/ %d\n", err); + goto out; + } + /* add the site table to the xnet */ + err = hst_to_xsst(hst, err - sizeof(u32)); + if (err) { + hvfs_err(root, "hst to xsst failed w/ %d\n", err); + } + + /* set network magic */ + xnet_set_magic(msg->pair->tx.arg0); + } + hvfs_info(xnet, "End parse the reg reply message\n"); + +out: + xnet_free_msg(msg); +out_nofree: + + return err; +} + +/* r2cli_do_unreg() + * + * @gid: already right shift 2 bits + */ +int r2cli_do_unreg(u64 request_site, u64 root_site, u64 fsid, u32 gid) +{ + struct xnet_msg *msg; + union hvfs_x_info *hxi; + int err = 0; + + hxi = (union hvfs_x_info *)&hoi; + + /* alloc one msg and send it to the perr site */ + msg = xnet_alloc_msg(XNET_MSG_NORMAL); + if (!msg) { + hvfs_err(xnet, "xnet_alloc_msg() failed\n"); + err = -ENOMEM; + goto out_nofree; + } + + xnet_msg_fill_tx(msg, XNET_MSG_REQ, XNET_NEED_REPLY, + hoo.xc->site_id, root_site); + xnet_msg_fill_cmd(msg, HVFS_R2_UNREG, request_site, fsid); +#ifdef XNET_EAGER_WRITEV + xnet_msg_add_sdata(msg, &msg->tx, sizeof(msg->tx)); +#endif + xnet_msg_add_sdata(msg, hxi, sizeof(*hxi)); + + /* send te unreeg request to root_site w/ requested siteid = request_site */ + msg->tx.reserved = gid; + + err = xnet_send(hoo.xc, msg); + if (err) { + hvfs_err(xnet, "xnet_send() failed\n"); + goto out; + } + + /* this means we have got the reply, parse it! 
*/ + ASSERT(msg->pair, xnet); + if (msg->pair->tx.err) { + hvfs_err(xnet, "Unreg site %lx failed w/ %d\n", request_site, + msg->pair->tx.err); + err = msg->pair->tx.err; + goto out; + } + +out: + xnet_free_msg(msg); +out_nofree: + return err; +} + +/* r2cli_do_hb() + * + * @gid: already right shift 2 bits + */ +static +int r2cli_do_hb(u64 request_site, u64 root_site, u64 fsid, u32 gid) +{ + struct xnet_msg *msg; + union hvfs_x_info *hxi; + int err = 0; + + hxi = (union hvfs_x_info *)&hoi; + + /* alloc one msg and send it to the peer site */ + msg = xnet_alloc_msg(XNET_MSG_NORMAL); + if (!msg) { + hvfs_err(xnet, "xnet_alloc_msg() failed\n"); + err = -ENOMEM; + goto out_nofree; + } + + xnet_msg_fill_tx(msg, XNET_MSG_REQ, 0, + hoo.xc->site_id, root_site); + xnet_msg_fill_cmd(msg, HVFS_R2_HB, request_site, fsid); +#ifdef XNET_EAGER_WRITEV + xnet_msg_add_sdata(msg, &msg->tx, sizeof(msg->tx)); +#endif + xnet_msg_add_sdata(msg, hxi, sizeof(*hxi)); + + msg->tx.reserved = gid; + + err = xnet_send(hoo.xc, msg); + if (err) { + hvfs_err(xnet, "xnet_send() failed\n"); + goto out; + } +out: + xnet_free_msg(msg); +out_nofree: + + return err; +} + +void osd_cb_exit(void *arg) +{ + int err = 0; + + err = r2cli_do_unreg(hoo.xc->site_id, HVFS_RING(0), fsid, 0); + if (err) { + hvfs_err(xnet, "unreg self %lx w/ r2 %x failed w/ %d\n", + hoo.xc->site_id, HVFS_RING(0), err); + return; + } +} + +void osd_cb_hb(void *arg) +{ + u64 ring_site; + int err = 0; + + ring_site = osd_select_ring(&hoo); + err = r2cli_do_hb(hoo.xc->site_id, ring_site, fsid, 0); + if (err) { + hvfs_err(xnet, "hb %lx w/ r2 %x failed w/ %d\n", + hoo.xc->site_id, HVFS_RING(0), err); + } +} + +void osd_cb_addr_table_update(void *arg) +{ + struct hvfs_site_tx *hst; + void *data = arg; + int err = 0; + + hvfs_info(xnet, "Update address table ...\n"); + + err = bparse_addr(data, &hst); + if (err < 0) { + hvfs_err(xnet, "bparse_addr failed w/ %d\n", err); + goto out; + } + + err = hst_to_xsst(hst, err - sizeof(u32)); + if 
(err) { + hvfs_err(xnet, "hst to xsst failed w/ %d\n", err); + goto out; + } + +out: + return; +} + +int main(int argc, char *argv[]) +{ + struct xnet_type_ops ops = { + .buf_alloc = NULL, + .buf_free = NULL, + .recv_handler = osd_spool_dispatch, + .dispatcher = osd_dispatch, + }; + int err = 0; + int self, sport = -1, plot_method; + char *value; + char *ring_ip = NULL; + char profiling_fname[256], *log_home; + + hvfs_info(xnet, "OSD Unit Testing...\n"); + hvfs_info(xnet, "Usage %s id ring_ip self_port\n", argv[0]); + + if (argc < 2) { + hvfs_err(xnet, "Self ID is not provided.\n"); + err = EINVAL; + goto out; + } else { + self = atoi(argv[1]); + hvfs_info(xnet, "Self type+ID is osd:%d.\n", self); + if (argc == 4) { + ring_ip = argv[2]; + sport = atoi(argv[3]); + } else if (argc == 3) + ring_ip = argv[2]; + } + + value = getenv("fsid"); + if (value) { + fsid = atoi(value); + } else + fsid = 0; + + value = getenv("plot"); + if (value) { + plot_method = atoi(value); + } else + plot_method = OSD_PROF_PLOT; + + value = getenv("LOG_DIR"); + if (value) { + log_home = strdup(value); + } else + log_home = NULL; + + st_init(); + osd_pre_init(); + hoo.conf.prof_plot = plot_method; + osd_config(); + + /* BUG-XXXX: we have set the site_id BEFORE mdsl_init() */ + hoo.site_id = HVFS_OSD(self); + err = osd_init(); + if (err) { + hvfs_err(xnet, "osd_init() failed %d\n", err); + goto out; + } + + /* init misc configrations */ + hoo.prof.xnet = &g_xnet_prof; + + /* prepare the ring address */ + if (!ring_ip) { + xnet_update_ipaddr(HVFS_RING(0), 1, &ipaddr[3], + (short *)(&port[3][0])); + if (sport == -1) + sport = port[TYPE_OSD][0]; + } else { + xnet_update_ipaddr(HVFS_RING(0), 1, &ring_ip, + (short *)(&port[3][0])); + if (sport == -1) + sport = port[TYPE_OSD][0]; + } + + /* setup the profiling file */ + if (!log_home) + log_home = "."; + + memset(profiling_fname, 0, sizeof(profiling_fname)); + sprintf(profiling_fname, "%s/CP-BACK-osd.%d", log_home, self); + hoo.conf.pf_file = 
fopen(profiling_fname, "w+"); + if (!hoo.conf.pf_file) { + hvfs_err(xnet, "fopen() profiling file %s faield %d\n", + profiling_fname, errno); + return EINVAL; + } + + self = HVFS_OSD(self); + + hoo.xc = xnet_register_type(0, sport, self, &ops); + if (IS_ERR(hoo.xc)) { + err = PTR_ERR(hoo.xc); + return err; + } + + hoo.site_id = self; + + hoo.cb_exit = osd_cb_exit; + hoo.cb_hb = osd_cb_hb; + hoo.cb_addr_table_update = osd_cb_addr_table_update; + + /* use ring info to init the osd */ + err = r2cli_do_reg(self, HVFS_RING(0), fsid, 0); + if (err) { + hvfs_err(xnet, "reg self %x w/ r2 %x failed w/ %d\n", + self, HVFS_RING(0), err); + goto out; + } + + osd_verify(); + + hvfs_info(xnet, "OSD is UP for serving requests now.\n"); + + //SET_TRACING_FLAG(osd, HVFS_DEBUG); + msg_wait(); + + osd_destroy(); + xnet_unregister_type(hoo.xc); +out: + return err; +} +#endif diff --git a/test/xnet/r2cli.c b/test/xnet/r2cli.c index 4568ffe..e84fc1e 100644 --- a/test/xnet/r2cli.c +++ b/test/xnet/r2cli.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2011-07-20 16:45:31 macan> + * Time-stamp: <2012-08-10 17:39:46 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,6 +26,7 @@ #include "root.h" #include "lib.h" #include "ring.h" +#include "mds.h" #ifdef UNIT_TEST #define TYPE_MDS 0 @@ -463,6 +464,7 @@ int main(int argc, char *argv[]) int type = 0; int err = 0; int self, sport = -1, op, fsid; + char profiling_fname[256], *log_home; hvfs_info(xnet, "R2 Unit Test Client running...\n"); hvfs_info(xnet, "type 0/1/2/3 => MDS/CLIENT/MDSL/RING\n"); @@ -492,6 +494,12 @@ int main(int argc, char *argv[]) fsid = 0; } + value = getenv("LOG_DIR"); + if (value) { + log_home = strdup(value); + } else + log_home = NULL; + if (argc < 2) { hvfs_err(xnet, "Self ID is not provided.\n"); err = EINVAL; @@ -513,16 +521,32 @@ int main(int argc, char *argv[]) } st_init(); - root_pre_init(); -// SET_TRACING_FLAG(root, HVFS_DEBUG | HVFS_VERBOSE); - err = root_init(); + mds_pre_init(); + + /* init misc configurations */ + hmo.prof.xnet = &g_xnet_prof; + hmo.conf.prof_plot = 1; + //SET_TRACING_FLAG(root, HVFS_DEBUG | HVFS_VERBOSE); + + err = mds_init(10); if (err) { hvfs_err(xnet, "root_init() failed w/ %d\n", err); goto out; } - - /* init misc configurations */ - hro.prof.xnet = &g_xnet_prof; + hmo.gossip_thread_stop = 1; + + /* setup the profiling file */ + if (!log_home) + log_home = "."; + + memset(profiling_fname, 0, sizeof(profiling_fname)); + sprintf(profiling_fname, "%s/CP-BACK-r2cli.%d", log_home, self); + hmo.conf.pf_file = fopen(profiling_fname, "w+"); + if (!hmo.conf.pf_file) { + hvfs_err(xnet, "fopen() profiling file %s failed %d\n", + profiling_fname, errno); + return EINVAL; + } if (sport == -1) sport = port[type][self]; @@ -535,7 +559,7 @@ int main(int argc, char *argv[]) } hro.site_id = self; - root_verify(); + mds_verify(); /* prepare the init site table now */ if (!ring_ip) @@ -581,11 +605,11 @@ int 
main(int argc, char *argv[]) goto out; } - root_destroy(); + mds_destroy(); xnet_unregister_type(hro.xc); return 0; out: - root_destroy(); + mds_destroy(); return err; } #endif diff --git a/triggers/Makefile b/triggers/Makefile index 7622bfd..c064aa2 100644 --- a/triggers/Makefile +++ b/triggers/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-11-22 00:21:43 macan> +# Time-stamp: <2012-08-10 14:00:16 macan> # # This is the makefile for HVFS project. # @@ -13,14 +13,14 @@ include ../Makefile.inc all : triggers %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -c $(patsubst %.c, $(TRIGGERS)/%.c, $<) triggers : $(TRIGGERS_SOURCE:.c=.so) - @echo "Build triggers done." + @$(ECHO) "Build triggers done." %.so : %.o - @echo -e " " SL"\t" $@ + @$(ECHO) -e " " SL"\t" $@ @$(CC) -shared -Wl,-soname,lib.so.1 -o $(TRIGGERS)/$@ $^ -lc -lrt clean : diff --git a/xnet/Makefile b/xnet/Makefile index d83f25f..878f6ed 100644 --- a/xnet/Makefile +++ b/xnet/Makefile @@ -2,7 +2,7 @@ # Copyright (c) 2009 Ma Can # # -# Time-stamp: <2010-07-20 14:13:11 macan> +# Time-stamp: <2012-08-10 14:00:27 macan> # # This is the makefile for HVFS project. # @@ -13,13 +13,13 @@ include ../Makefile.inc all : xnet_lib %.o : %.c - @echo -e " " CC"\t" $@ + @$(ECHO) -e " " CC"\t" $@ @$(CC) $(CFLAGS) -DUSE_XNET_SIMPLE -c $(patsubst %.c, $(XNET)/%.c, $<) xnet_lib : $(XNET_AR_SOURCE:.c=.o) - @echo -e " " AR"\t" $@ + @$(ECHO) -e " " AR"\t" $@ @$(AR) rcs libxnet.a $(^:.c=.o) - @echo -e " " SL"\t" $(XNET_SO) + @$(ECHO) -e " " SL"\t" $(XNET_SO) @$(CC) -shared -Wl,-soname,libxnet.so.1 -o $(LIB_PATH)/libxnet.so.1.0 $(^:.c=.o) -lc -lrt -lpthread clean : diff --git a/xnet/xnet_simple.c b/xnet/xnet_simple.c index f5dadbe..f72367d 100644 --- a/xnet/xnet_simple.c +++ b/xnet/xnet_simple.c @@ -3,7 +3,7 @@ * * * Armed with EMACS. 
- * Time-stamp: <2012-05-18 11:37:37 macan> + * Time-stamp: <2012-08-06 15:05:15 macan> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -570,7 +570,7 @@ int __xnet_handle_tx(int fd) } sem_post(&req->event); } else if (msg->tx.type == XNET_MSG_CMD) { - /* just receive the data */ + /* same as NOP message, just receive the next msg. */ } else if (msg->tx.type == XNET_MSG_NOP) { hvfs_debug(xnet, "recv NOP message, just receive the next msg.\n"); }