User templates for hadoop / spark #202

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions flintrock/config.yaml.template
@@ -9,12 +9,22 @@ services:
# - must be a tar.gz file
# download-source: "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz"
# executor-instances: 1
# template-dir: # folder containing spark configuration files:
Owner:
To make this consistent with the other examples, I'd put the comment above the template-dir line and instead put an example path as template-dir's value.

# - slaves
# - spark-env.sh
# - spark-defaults.conf
hdfs:
version: 2.7.3
# optional; defaults to download from a dynamically selected Apache mirror
# - must contain a {v} template corresponding to the version
# - must be a .tar.gz file
# download-source: "https://www.example.com/files/hadoop/{v}/hadoop-{v}.tar.gz"
# template-dir: # path to the folder containing the hadoop configuration files
Owner:
Same comment here as on the Spark template dir.

# - masters
# - slaves
# - hadoop-env.sh
# - hdfs-site.xml
# - core-site.xml

provider: ec2

12 changes: 11 additions & 1 deletion flintrock/flintrock.py
@@ -218,6 +218,7 @@ def cli(cli_context, config, provider, debug):
help="URL to download Hadoop from.",
default='http://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-{v}/hadoop-{v}.tar.gz?as_json',
show_default=True)
@click.option('--hdfs-template-dir')
Owner:
The default for this should be under Flintrock's configuration dir.
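One way to read that suggestion, sketched below; the `get_config_dir()` helper and the exact path layout are illustrative assumptions, not Flintrock's actual API:

```python
import os

import click


def get_config_dir():
    # Hypothetical helper: resolve Flintrock's configuration directory.
    return os.environ.get(
        'FLINTROCK_CONFIG_DIR',
        os.path.join(os.path.expanduser('~'), '.config', 'flintrock'))


@click.command()
@click.option(
    '--hdfs-template-dir',
    default=os.path.join(get_config_dir(), 'templates', 'hadoop', 'conf'),
    show_default=True,
    help="Directory containing the Hadoop configuration file templates.")
def launch(hdfs_template_dir):
    click.echo(hdfs_template_dir)
```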

@click.option('--install-spark/--no-install-spark', default=True)
@click.option('--spark-executor-instances', default=1,
help="How many executor instances per worker.")
@@ -239,6 +240,7 @@ def cli(cli_context, config, provider, debug):
help="Git repository to clone Spark from.",
default='https://github.com/apache/spark',
show_default=True)
@click.option('--spark-template-dir')
Owner:
The default for this should be under Flintrock's configuration dir.

@click.option('--assume-yes/--no-assume-yes', default=False)
@click.option('--ec2-key-name')
@click.option('--ec2-identity-file',
@@ -281,12 +283,14 @@ def launch(
install_hdfs,
hdfs_version,
hdfs_download_source,
hdfs_template_dir,
install_spark,
spark_executor_instances,
spark_version,
spark_git_commit,
spark_git_repository,
spark_download_source,
spark_template_dir,
assume_yes,
ec2_key_name,
ec2_identity_file,
@@ -351,7 +355,11 @@ def launch(
check_external_dependency('ssh-keygen')

if install_hdfs:
hdfs = HDFS(version=hdfs_version, download_source=hdfs_download_source)
hdfs = HDFS(
version=hdfs_version,
download_source=hdfs_download_source,
template_dir=hdfs_template_dir
)
services += [hdfs]
if install_spark:
if spark_version:
@@ -360,6 +368,7 @@
version=spark_version,
hadoop_version=hdfs_version,
download_source=spark_download_source,
template_dir=spark_template_dir
)
elif spark_git_commit:
logger.warning(
@@ -373,6 +382,7 @@
git_commit=spark_git_commit,
git_repository=spark_git_repository,
hadoop_version=hdfs_version,
template_dir=spark_template_dir
)
services += [spark]

45 changes: 31 additions & 14 deletions flintrock/services.py
@@ -107,10 +107,14 @@ def health_check(


class HDFS(FlintrockService):
def __init__(self, *, version, download_source):
def __init__(self, *, version, download_source, template_dir):
self.version = version
self.download_source = download_source
self.manifest = {'version': version, 'download_source': download_source}
self.template_dir = template_dir
self.manifest = {
'version': version,
'download_source': download_source,
'template_dir': template_dir}
Owner:
This raises an important issue: The point of the manifest is to be a self-contained representation of how the cluster is configured. Any instance of Flintrock should be able to manage a cluster solely using the information in the manifest and from EC2.

With this change, however, Flintrock needs access to the specified local directory to get the correct templates. So if one instance of Flintrock launches a cluster, another instance of Flintrock on a different machine won't be able to expand or restart it correctly because the manifest only has the paths to the templates on some non-cluster machine and not the templates themselves.

Realistically, I would guess that most people using Flintrock do so from a single machine. But I'm not comfortable breaking the implicit guarantee that any instance of Flintrock at version X on any machine can fully manage clusters launched by Flintrock at version X. And I know there are some teams using Flintrock where this would matter.

To address this issue, we need to put the full contents of all the templates in the manifest (or somehow otherwise on the cluster) and use that in the add-slaves, remove-slaves, and start commands as necessary.
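A minimal sketch of that idea; the function names and manifest shape below are illustrative assumptions, not the PR's code. The point is that the manifest carries the template text itself rather than a path that only exists on the launching machine:

```python
import os

# Template file names HDFS configuration needs (mirrors the PR's template_paths).
HDFS_TEMPLATES = ['masters', 'slaves', 'hadoop-env.sh', 'core-site.xml', 'hdfs-site.xml']


def embed_templates(template_dir, template_names):
    """Read each template file and return its contents keyed by file name."""
    contents = {}
    for name in template_names:
        with open(os.path.join(template_dir, name)) as f:
            contents[name] = f.read()
    return contents


def build_hdfs_manifest(version, download_source, template_dir):
    # 'templates' holds full file contents, so add-slaves, remove-slaves, and
    # start can re-render configuration from the manifest alone, on any machine.
    return {
        'version': version,
        'download_source': download_source,
        'templates': embed_templates(template_dir, HDFS_TEMPLATES),
    }
```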

Owner:
A different approach we could take here is to note the template directory and some kind of hash of the contents in the manifest. If someone wants to expand an existing Flintrock cluster without having the same template files that were used during launch, Flintrock will display a warning but try to continue anyway.

This kind of forgiving behavior is technically dangerous, but in practice it should work well. And the warning should make it clear to users that they are responsible for any borked operations since they are not using the same files that were used during launch.

We should probably use this kind of approach for the other places where Flintrock references local files during launch, like --ec2-user-data.
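A sketch of that alternative, again with illustrative names rather than Flintrock's API: record a digest of the template contents at launch, then warn (but continue) when a later operation sees different local files:

```python
import hashlib
import os
import warnings


def hash_template_dir(template_dir, template_names):
    """Compute a stable digest of the named template files' contents."""
    digest = hashlib.sha256()
    for name in sorted(template_names):
        digest.update(name.encode('utf-8'))
        with open(os.path.join(template_dir, name), 'rb') as f:
            digest.update(f.read())
    return digest.hexdigest()


def check_templates_unchanged(template_dir, template_names, manifest_digest):
    """Warn, but don't fail, if local templates differ from those used at launch."""
    current = hash_template_dir(template_dir, template_names)
    if current != manifest_digest:
        warnings.warn(
            "Templates in {d} differ from the ones recorded at launch; "
            "continuing anyway, but any resulting misconfiguration is on "
            "the user.".format(d=template_dir))
    return current == manifest_digest
```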


def install(
self,
@@ -149,13 +153,18 @@ def configure(
cluster: FlintrockCluster):
# TODO: os.walk() through these files.
template_paths = [
'hadoop/conf/masters',
'hadoop/conf/slaves',
'hadoop/conf/hadoop-env.sh',
'hadoop/conf/core-site.xml',
'hadoop/conf/hdfs-site.xml',
'masters',
'slaves',
'hadoop-env.sh',
'core-site.xml',
'hdfs-site.xml',
]

# TODO: throw error if config folder / file can't be found
template_dir = self.template_dir
if template_dir is None:
template_dir = os.path.join(THIS_DIR, "templates/hadoop/conf")

for template_path in template_paths:
ssh_check_output(
client=ssh_client,
@@ -164,7 +173,7 @@
""".format(
f=shlex.quote(
get_formatted_template(
path=os.path.join(THIS_DIR, "templates", template_path),
path=os.path.join(template_dir, template_path),
mapping=generate_template_mapping(
cluster=cluster,
hadoop_version=self.version,
Expand All @@ -173,7 +182,7 @@ def configure(
spark_version='',
spark_executor_instances=0,
))),
p=shlex.quote(template_path)))
p=shlex.quote(os.path.join("hadoop/conf/", template_path))))

# TODO: Convert this into start_master() and split master- or slave-specific
# stuff out of configure() into configure_master() and configure_slave().
@@ -217,6 +226,7 @@ def __init__(
version: str=None,
hadoop_version: str,
download_source: str=None,
template_dir: str=None,
git_commit: str=None,
git_repository: str=None
):
Expand All @@ -230,6 +240,7 @@ def __init__(
self.version = version
self.hadoop_version = hadoop_version
self.download_source = download_source
self.template_dir = template_dir
self.git_commit = git_commit
self.git_repository = git_repository

@@ -238,6 +249,7 @@ def __init__(
'spark_executor_instances': spark_executor_instances,
'hadoop_version': hadoop_version,
'download_source': download_source,
'template_dir': template_dir,
'git_commit': git_commit,
'git_repository': git_repository}

@@ -310,10 +322,15 @@ def configure(
cluster: FlintrockCluster):

template_paths = [
'spark/conf/spark-env.sh',
'spark/conf/slaves',
'spark/conf/spark-defaults.conf',
'spark-env.sh',
'slaves',
'spark-defaults.conf',
]

template_dir = self.template_dir
if template_dir is None:
template_dir = os.path.join(THIS_DIR, "templates/spark/conf")

for template_path in template_paths:
ssh_check_output(
client=ssh_client,
@@ -322,14 +339,14 @@
""".format(
f=shlex.quote(
get_formatted_template(
path=os.path.join(THIS_DIR, "templates", template_path),
path=os.path.join(template_dir, template_path),
mapping=generate_template_mapping(
cluster=cluster,
spark_executor_instances=self.spark_executor_instances,
hadoop_version=self.hadoop_version,
spark_version=self.version or self.git_commit,
))),
p=shlex.quote(template_path)))
p=shlex.quote(os.path.join("spark/conf/", template_path))))

# TODO: Convert this into start_master() and split master- or slave-specific
# stuff out of configure() into configure_master() and configure_slave().