From e2320d9fb830e91edf80f9226cdd60ccbe2e63b9 Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 17:53:15 +0200 Subject: [PATCH 1/6] Add config_path to Services --- flintrock/services.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/flintrock/services.py b/flintrock/services.py index 8515ed11..7fa76b4a 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -107,9 +107,10 @@ def health_check( class HDFS(FlintrockService): - def __init__(self, *, version, download_source): + def __init__(self, *, version, download_source, config_path): self.version = version self.download_source = download_source + self.config_path = config_path self.manifest = {'version': version, 'download_source': download_source} def install( @@ -149,13 +150,18 @@ def configure( cluster: FlintrockCluster): # TODO: os.walk() through these files. template_paths = [ - 'hadoop/conf/masters', - 'hadoop/conf/slaves', - 'hadoop/conf/hadoop-env.sh', - 'hadoop/conf/core-site.xml', - 'hadoop/conf/hdfs-site.xml', + 'masters', + 'slaves', + 'hadoop-env.sh', + 'core-site.xml', + 'hdfs-site.xml', ] + # TODO: throw error if config folder / file can't be found + config_dir = self.config_path + if config_dir is None: + config_dir = os.path.join(THIS_DIR, "templates/hadoop/conf") + for template_path in template_paths: ssh_check_output( client=ssh_client, @@ -164,7 +170,7 @@ def configure( """.format( f=shlex.quote( get_formatted_template( - path=os.path.join(THIS_DIR, "templates", template_path), + path=os.path.join(config_dir, template_path), mapping=generate_template_mapping( cluster=cluster, hadoop_version=self.version, @@ -217,6 +223,7 @@ def __init__( version: str=None, hadoop_version: str, download_source: str=None, + config_path: str=None, git_commit: str=None, git_repository: str=None ): @@ -230,6 +237,7 @@ def __init__( self.version = version self.hadoop_version = hadoop_version self.download_source = download_source + self.config_path = config_path self.git_commit = git_commit self.git_repository = git_repository @@ -310,10 +318,15 @@ def configure( cluster: FlintrockCluster): template_paths = [ - 'spark/conf/spark-env.sh', - 'spark/conf/slaves', - 'spark/conf/spark-defaults.conf', + 'spark-env.sh', + 'slaves', + 'spark-defaults.conf', ] + + config_dir = self.config_path + if config_dir is None: + config_dir = os.path.join(THIS_DIR, "templates/spark/conf") + for template_path in template_paths: ssh_check_output( client=ssh_client, @@ -322,7 +335,7 @@ def configure( """.format( f=shlex.quote( get_formatted_template( - path=os.path.join(THIS_DIR, "templates", template_path), + path=os.path.join(config_dir, template_path), mapping=generate_template_mapping( cluster=cluster, spark_executor_instances=self.spark_executor_instances, From e7badc5a481b40540367c1681bbbd0041518e752 Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 19:30:53 +0200 Subject: [PATCH 2/6] Added {service}_config_path to click --- flintrock/flintrock.py | 10 +++++++++- flintrock/services.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index 04948499..41286712 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -218,6 +218,7 @@ def cli(cli_context, config, provider, debug): help="URL to download Hadoop from.", default='http://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-{v}/hadoop-{v}.tar.gz?as_json', show_default=True) +@click.option('--hdfs-config-path') @click.option('--install-spark/--no-install-spark', default=True) @click.option('--spark-executor-instances', default=1, help="How many executor instances per worker.") @@ -239,6 +240,7 @@ def cli(cli_context, config, provider, debug): help="Git repository to clone Spark from.", default='https://github.com/apache/spark', show_default=True) +@click.option('--spark-config-path') @click.option('--assume-yes/--no-assume-yes', default=False) @click.option('--ec2-key-name') @click.option('--ec2-identity-file', @@ -281,12 +283,14 @@ def launch( install_hdfs, hdfs_version, hdfs_download_source, + hdfs_config_path, install_spark, spark_executor_instances, spark_version, spark_git_commit, spark_git_repository, spark_download_source, + spark_config_path, assume_yes, ec2_key_name, ec2_identity_file, @@ -351,7 +355,9 @@ def launch( check_external_dependency('ssh-keygen') if install_hdfs: - hdfs = HDFS(version=hdfs_version, download_source=hdfs_download_source) + hdfs = HDFS(version=hdfs_version, + download_source=hdfs_download_source, + config_path=hdfs_config_path) services += [hdfs] if install_spark: if spark_version: @@ -360,6 +366,7 @@ def launch( version=spark_version, hadoop_version=hdfs_version, download_source=spark_download_source, + config_path=spark_config_path ) elif spark_git_commit: logger.warning( @@ -373,6 +380,7 @@ def launch( git_commit=spark_git_commit, git_repository=spark_git_repository, hadoop_version=hdfs_version, + config_path=spark_config_path ) services += [spark] diff --git a/flintrock/services.py b/flintrock/services.py index 7fa76b4a..bda14610 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -179,7 +179,7 @@ def configure( spark_version='', spark_executor_instances=0, ))), - p=shlex.quote(template_path))) + p=shlex.quote(os.path.join("hadoop/conf/",template_path)))) # TODO: Convert this into start_master() and split master- or slave-specific # stuff out of configure() into configure_master() and configure_slave(). @@ -342,7 +342,7 @@ def configure( hadoop_version=self.hadoop_version, spark_version=self.version or self.git_commit, ))), - p=shlex.quote(template_path))) + p=shlex.quote(os.path.join("spark/conf/", template_path)))) # TODO: Convert this into start_master() and split master- or slave-specific # stuff out of configure() into configure_master() and configure_slave(). From cc0757c4558d08ed19e6d8fd6c1346bdae6de20a Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 19:47:12 +0200 Subject: [PATCH 3/6] Rename config_path to template_dir, add field to YAML template --- flintrock/config.yaml.template | 10 ++++++++++ flintrock/flintrock.py | 14 +++++++------- flintrock/services.py | 24 ++++++++++++------------ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template index 460552a0..2d45cb9e 100644 --- a/flintrock/config.yaml.template +++ b/flintrock/config.yaml.template @@ -9,12 +9,22 @@ services: # - must be a tar.gz file # download-source: "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz" # executor-instances: 1 + # template-dir: # folder containing spark configuration files: + # - slaves + # - spark-env.sh + # - spark-defaults.conf hdfs: version: 2.7.3 # optional; defaults to download from a dynamically selected Apache mirror # - must contain a {v} template corresponding to the version # - must be a .tar.gz file # download-source: "https://www.example.com/files/hadoop/{v}/hadoop-{v}.tar.gz" + # template-dir: # path to the folder containing the hadoop configuration files + # - masters + # - slaves + # - hadoop-env.sh + # - hdfs-site.xml + # - core-site.xml provider: ec2 diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index 41286712..c538f07e 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -218,7 +218,7 @@ def cli(cli_context, config, provider, debug): help="URL to download Hadoop from.", default='http://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-{v}/hadoop-{v}.tar.gz?as_json', show_default=True) -@click.option('--hdfs-config-path') +@click.option('--hdfs-template-dir') @click.option('--install-spark/--no-install-spark', default=True) @click.option('--spark-executor-instances', default=1, help="How many executor instances per worker.") @@ -240,7 +240,7 @@ def cli(cli_context, config, provider, debug): help="Git repository to clone Spark from.", default='https://github.com/apache/spark', show_default=True) -@click.option('--spark-config-path') +@click.option('--spark-template-dir') @click.option('--assume-yes/--no-assume-yes', default=False) @click.option('--ec2-key-name') @click.option('--ec2-identity-file', @@ -283,14 +283,14 @@ def launch( install_hdfs, hdfs_version, hdfs_download_source, - hdfs_config_path, + hdfs_template_dir, install_spark, spark_executor_instances, spark_version, spark_git_commit, spark_git_repository, spark_download_source, - spark_config_path, + spark_template_dir, assume_yes, ec2_key_name, ec2_identity_file, @@ -357,7 +357,7 @@ def launch( if install_hdfs: hdfs = HDFS(version=hdfs_version, download_source=hdfs_download_source, - config_path=hdfs_config_path) + template_dir=hdfs_template_dir) services += [hdfs] if install_spark: if spark_version: @@ -366,7 +366,7 @@ def launch( version=spark_version, hadoop_version=hdfs_version, download_source=spark_download_source, - config_path=spark_config_path + template_dir=spark_template_dir ) elif spark_git_commit: logger.warning( @@ -380,7 +380,7 @@ def launch( git_commit=spark_git_commit, git_repository=spark_git_repository, hadoop_version=hdfs_version, - config_path=spark_config_path + template_dir=spark_template_dir ) services += [spark] diff --git a/flintrock/services.py b/flintrock/services.py index bda14610..4a9fa51a 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -107,10 +107,10 @@ def health_check( class HDFS(FlintrockService): - def __init__(self, *, version, download_source, config_path): + def __init__(self, *, version, download_source, template_dir): self.version = version self.download_source = download_source - self.config_path = config_path + self.template_dir = template_dir self.manifest = {'version': version, 'download_source': download_source} def install( @@ -158,9 +158,9 @@ def configure( ] # TODO: throw error if config folder / file can't be found - config_dir = self.config_path - if config_dir is None: - config_dir = os.path.join(THIS_DIR, "templates/hadoop/conf") + template_dir = self.template_dir + if template_dir is None: + template_dir = os.path.join(THIS_DIR, "templates/hadoop/conf") for template_path in template_paths: ssh_check_output( @@ -170,7 +170,7 @@ def configure( """.format( f=shlex.quote( get_formatted_template( - path=os.path.join(config_dir, template_path), + path=os.path.join(template_dir, template_path), mapping=generate_template_mapping( cluster=cluster, hadoop_version=self.version, @@ -223,7 +223,7 @@ def __init__( version: str=None, hadoop_version: str, download_source: str=None, - config_path: str=None, + template_dir: str=None, git_commit: str=None, git_repository: str=None ): @@ -237,7 +237,7 @@ def __init__( self.version = version self.hadoop_version = hadoop_version self.download_source = download_source - self.config_path = config_path + self.template_dir = template_dir self.git_commit = git_commit self.git_repository = git_repository @@ -323,9 +323,9 @@ def configure( 'spark-defaults.conf', ] - config_dir = self.config_path - if config_dir is None: - config_dir = os.path.join(THIS_DIR, "templates/spark/conf") + template_dir = self.template_dir + if template_dir is None: + template_dir = os.path.join(THIS_DIR, "templates/spark/conf") for template_path in template_paths: ssh_check_output( @@ -335,7 +335,7 @@ def configure( """.format( f=shlex.quote( get_formatted_template( - path=os.path.join(config_dir, template_path), + path=os.path.join(template_dir, template_path), mapping=generate_template_mapping( cluster=cluster, spark_executor_instances=self.spark_executor_instances, From 2431d035bd050683ec59f263fff99709dc5f9ae8 Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 20:22:01 +0200 Subject: [PATCH 4/6] Formatting --- flintrock/flintrock.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index c538f07e..14d855b6 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -355,9 +355,11 @@ def launch( check_external_dependency('ssh-keygen') if install_hdfs: - hdfs = HDFS(version=hdfs_version, - download_source=hdfs_download_source, - template_dir=hdfs_template_dir) + hdfs = HDFS( + version=hdfs_version, + download_source=hdfs_download_source, + template_dir=hdfs_template_dir + ) services += [hdfs] if install_spark: if spark_version: From 29e33fd696c33d139b5d977c2b66a34684f48c00 Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 21:11:12 +0200 Subject: [PATCH 5/6] Flake8 compliance --- flintrock/services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flintrock/services.py b/flintrock/services.py index 4a9fa51a..00717e6d 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -179,7 +179,7 @@ def configure( spark_version='', spark_executor_instances=0, ))), - p=shlex.quote(os.path.join("hadoop/conf/",template_path)))) + p=shlex.quote(os.path.join("hadoop/conf/", template_path)))) # TODO: Convert this into start_master() and split master- or slave-specific # stuff out of configure() into configure_master() and configure_slave(). From 4df4ed0387e1b4fe9f0d7e1160b3a547aa885679 Mon Sep 17 00:00:00 2001 From: Nicolas Cherel Date: Mon, 3 Jul 2017 22:02:45 +0200 Subject: [PATCH 6/6] Added template_dir to manifests --- flintrock/services.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flintrock/services.py b/flintrock/services.py index 00717e6d..f0bf79e9 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -111,7 +111,10 @@ def __init__(self, *, version, download_source, template_dir): self.version = version self.download_source = download_source self.template_dir = template_dir - self.manifest = {'version': version, 'download_source': download_source} + self.manifest = { + 'version': version, + 'download_source': download_source, + 'template_dir': template_dir} def install( self, @@ -246,6 +249,7 @@ def __init__( 'spark_executor_instances': spark_executor_instances, 'hadoop_version': hadoop_version, 'download_source': download_source, + 'template_dir': template_dir, 'git_commit': git_commit, 'git_repository': git_repository}