diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template
index 460552a0..2d45cb9e 100644
--- a/flintrock/config.yaml.template
+++ b/flintrock/config.yaml.template
@@ -9,12 +9,22 @@ services:
     #   - must be a tar.gz file
     # download-source: "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz"
     # executor-instances: 1
+    # template-dir:  # path to the folder containing the Spark configuration files:
+    #   - slaves
+    #   - spark-env.sh
+    #   - spark-defaults.conf
   hdfs:
     version: 2.7.3
     # optional; defaults to download from a dynamically selected Apache mirror
     #   - must contain a {v} template corresponding to the version
     #   - must be a .tar.gz file
     # download-source: "https://www.example.com/files/hadoop/{v}/hadoop-{v}.tar.gz"
+    # template-dir:  # path to the folder containing the Hadoop configuration files:
+    #   - masters
+    #   - slaves
+    #   - hadoop-env.sh
+    #   - hdfs-site.xml
+    #   - core-site.xml
 
 provider: ec2
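For reference, a minimal sketch of what a `config.yaml` using the new `template-dir` keys could look like. The `/home/me/...` paths are hypothetical placeholders and the versions are illustrative; each directory must contain the files listed in the comments above:

```yaml
services:
  spark:
    version: 2.0.0
    # custom Spark templates: slaves, spark-env.sh, spark-defaults.conf
    template-dir: /home/me/flintrock-templates/spark/conf
  hdfs:
    version: 2.7.3
    # custom Hadoop templates: masters, slaves, hadoop-env.sh,
    # hdfs-site.xml, core-site.xml
    template-dir: /home/me/flintrock-templates/hadoop/conf

provider: ec2
```

The code changes below wire these keys through the CLI and into the `HDFS` and `Spark` service classes.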
diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py
index 04948499..14d855b6 100644
--- a/flintrock/flintrock.py
+++ b/flintrock/flintrock.py
@@ -218,6 +218,7 @@ def cli(cli_context, config, provider, debug):
               help="URL to download Hadoop from.",
               default='http://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-{v}/hadoop-{v}.tar.gz?as_json',
               show_default=True)
+@click.option('--hdfs-template-dir')
 @click.option('--install-spark/--no-install-spark', default=True)
 @click.option('--spark-executor-instances', default=1,
               help="How many executor instances per worker.")
@@ -239,6 +240,7 @@ def cli(cli_context, config, provider, debug):
               help="Git repository to clone Spark from.",
               default='https://github.com/apache/spark',
               show_default=True)
+@click.option('--spark-template-dir')
 @click.option('--assume-yes/--no-assume-yes', default=False)
 @click.option('--ec2-key-name')
 @click.option('--ec2-identity-file',
@@ -281,12 +283,14 @@ def launch(
         install_hdfs,
         hdfs_version,
         hdfs_download_source,
+        hdfs_template_dir,
         install_spark,
         spark_executor_instances,
         spark_version,
         spark_git_commit,
         spark_git_repository,
         spark_download_source,
+        spark_template_dir,
         assume_yes,
         ec2_key_name,
         ec2_identity_file,
@@ -351,7 +355,11 @@ def launch(
     check_external_dependency('ssh-keygen')
 
     if install_hdfs:
-        hdfs = HDFS(version=hdfs_version, download_source=hdfs_download_source)
+        hdfs = HDFS(
+            version=hdfs_version,
+            download_source=hdfs_download_source,
+            template_dir=hdfs_template_dir
+        )
         services += [hdfs]
     if install_spark:
         if spark_version:
@@ -360,6 +368,7 @@ def launch(
                 version=spark_version,
                 hadoop_version=hdfs_version,
                 download_source=spark_download_source,
+                template_dir=spark_template_dir
             )
         elif spark_git_commit:
             logger.warning(
@@ -373,6 +382,7 @@ def launch(
                 git_commit=spark_git_commit,
                 git_repository=spark_git_repository,
                 hadoop_version=hdfs_version,
+                template_dir=spark_template_dir
             )
         services += [spark]
 
diff --git a/flintrock/services.py b/flintrock/services.py
index 8515ed11..f0bf79e9 100644
--- a/flintrock/services.py
+++ b/flintrock/services.py
@@ -107,10 +107,14 @@ def health_check(
 
 
 class HDFS(FlintrockService):
-    def __init__(self, *, version, download_source):
+    def __init__(self, *, version, download_source, template_dir):
         self.version = version
         self.download_source = download_source
-        self.manifest = {'version': version, 'download_source': download_source}
+        self.template_dir = template_dir
+        self.manifest = {
+            'version': version,
+            'download_source': download_source,
+            'template_dir': template_dir}
 
     def install(
             self,
@@ -149,13 +153,18 @@ def configure(
             ssh_client: paramiko.client.SSHClient,
             cluster: FlintrockCluster):
         # TODO: os.walk() through these files.
         template_paths = [
-            'hadoop/conf/masters',
-            'hadoop/conf/slaves',
-            'hadoop/conf/hadoop-env.sh',
-            'hadoop/conf/core-site.xml',
-            'hadoop/conf/hdfs-site.xml',
+            'masters',
+            'slaves',
+            'hadoop-env.sh',
+            'core-site.xml',
+            'hdfs-site.xml',
         ]
+        # TODO: Raise an error if the template folder or one of its files can't be found.
+        template_dir = self.template_dir
+        if template_dir is None:
+            template_dir = os.path.join(THIS_DIR, "templates/hadoop/conf")
+
         for template_path in template_paths:
             ssh_check_output(
                 client=ssh_client,
@@ -164,7 +173,7 @@ def configure(
                 """.format(
                     f=shlex.quote(
                         get_formatted_template(
-                            path=os.path.join(THIS_DIR, "templates", template_path),
+                            path=os.path.join(template_dir, template_path),
                             mapping=generate_template_mapping(
                                 cluster=cluster,
                                 hadoop_version=self.version,
@@ -173,7 +182,7 @@ def configure(
                                 spark_version='',
                                 spark_executor_instances=0,
                             ))),
-                p=shlex.quote(template_path)))
+                p=shlex.quote(os.path.join("hadoop/conf/", template_path))))
 
     # TODO: Convert this into start_master() and split master- or slave-specific
     # stuff out of configure() into configure_master() and configure_slave().
@@ -217,6 +226,7 @@ def __init__(
             version: str=None,
             hadoop_version: str,
             download_source: str=None,
+            template_dir: str=None,
             git_commit: str=None,
             git_repository: str=None
     ):
@@ -230,6 +240,7 @@ def __init__(
         self.version = version
         self.hadoop_version = hadoop_version
         self.download_source = download_source
+        self.template_dir = template_dir
         self.git_commit = git_commit
         self.git_repository = git_repository
@@ -238,6 +249,7 @@ def __init__(
             'spark_executor_instances': spark_executor_instances,
             'hadoop_version': hadoop_version,
             'download_source': download_source,
+            'template_dir': template_dir,
             'git_commit': git_commit,
             'git_repository': git_repository}
@@ -310,10 +322,15 @@ def configure(
             ssh_client: paramiko.client.SSHClient,
             cluster: FlintrockCluster):
         template_paths = [
-            'spark/conf/spark-env.sh',
-            'spark/conf/slaves',
-            'spark/conf/spark-defaults.conf',
+            'spark-env.sh',
+            'slaves',
+            'spark-defaults.conf',
         ]
+
+        template_dir = self.template_dir
+        if template_dir is None:
+            template_dir = os.path.join(THIS_DIR, "templates/spark/conf")
+
         for template_path in template_paths:
             ssh_check_output(
                 client=ssh_client,
@@ -322,14 +339,14 @@ def configure(
                 """.format(
                     f=shlex.quote(
                         get_formatted_template(
-                            path=os.path.join(THIS_DIR, "templates", template_path),
+                            path=os.path.join(template_dir, template_path),
                             mapping=generate_template_mapping(
                                 cluster=cluster,
                                 spark_executor_instances=self.spark_executor_instances,
                                 hadoop_version=self.hadoop_version,
                                 spark_version=self.version or self.git_commit,
                             ))),
-                p=shlex.quote(template_path)))
+                p=shlex.quote(os.path.join("spark/conf/", template_path))))
 
     # TODO: Convert this into start_master() and split master- or slave-specific
     # stuff out of configure() into configure_master() and configure_slave().
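The same directories can be supplied on the command line through the new `--hdfs-template-dir` and `--spark-template-dir` options on `launch`, e.g. something like `flintrock launch my-cluster --hdfs-template-dir ./hadoop-conf --spark-template-dir ./spark-conf` (cluster name and paths hypothetical). When no template dir is given, `template_dir` is `None` and both services fall back to the bundled `templates/hadoop/conf` and `templates/spark/conf` folders, so existing configurations behave exactly as before. Custom templates are rendered through the same `get_formatted_template()` call as the bundled ones, and can therefore use the same placeholders.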