From 8e0d655ae5f1258e3b881f90aa67854a03d8de38 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 13:52:15 -0500
Subject: [PATCH 1/6] remove deprecated links from mkdocs.yml (SOFTWARE-5162)

---
 mkdocs.yml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 11d5c8c06..e2df71069 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -62,21 +62,6 @@ nav:
     - 'Troubleshooting Gratia': 'other/troubleshooting-gratia.md'
     - 'Install Transfer Log Filebeats': 'other/schedd-filebeats.md'
     - 'Install OSG Token Renewal Service': 'other/osg-token-renewer.md'
-  - Deprecated:
-    - POSIX Storage:
-      - 'Install GridFTP Server': 'data/gridftp.md'
-      - 'Install Load Balanced GridFTP': 'data/load-balanced-gridftp.md'
-    - HDFS:
-      - 'HDFS Overview': 'data/hadoop-overview.md'
-      - 'Install HDFS': 'data/install-hadoop.md'
-    - Security:
-      - 'LCMAPS VOMS authentication': 'security/lcmaps-voms-authentication.md'
-      - 'User Certificates': 'security/user-certs.md'
-    - 'Install GSI-enabled SSH': 'other/gsissh.md'
-    - 'Install RSV': 'monitoring/install-rsv.md'
-    - 'Advanced RSV Configuration': 'monitoring/advanced-rsv-configuration.md'
-    - 'Manage RSV via rsv-control': 'monitoring/rsv-control.md'
-    - 'RSV GlideinWMS Tester': 'monitoring/install-rsv-gwms-tester.md'
   - Access Point:
     - 'Install an OSPool Access Point': 'submit/osg-flock.md'
     - 'Acceptable Use Policy': 'submit/ap-ospool-aup.md'

From 31b130ca58dec5f259d9b36723593d77b1494989 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 14:03:38 -0500
Subject: [PATCH 2/6] drop deprecated docs (SOFTWARE-5162)

---
 docs/data/gridftp.md                          |  308 -----
 docs/data/hadoop-overview.md                  |  111 --
 docs/data/install-hadoop.md                   | 1014 -----------------
 docs/data/load-balanced-gridftp.md            |  371 ------
 docs/monitoring/advanced-rsv-configuration.md |  190 ---
 docs/monitoring/install-rsv-gwms-tester.md    |  220 ----
 docs/monitoring/install-rsv.md                |  358 ------
 docs/monitoring/rsv-control.md                |  319 ------
 docs/other/gsissh.md                          |  156 ---
 docs/security/lcmaps-voms-authentication.md   |  486 --------
 docs/security/user-certs.md                   |  207 ----
 11 files changed, 3740 deletions(-)
 delete mode 100644 docs/data/gridftp.md
 delete mode 100644 docs/data/hadoop-overview.md
 delete mode 100644 docs/data/install-hadoop.md
 delete mode 100644 docs/data/load-balanced-gridftp.md
 delete mode 100644 docs/monitoring/advanced-rsv-configuration.md
 delete mode 100644 docs/monitoring/install-rsv-gwms-tester.md
 delete mode 100644 docs/monitoring/install-rsv.md
 delete mode 100644 docs/monitoring/rsv-control.md
 delete mode 100644 docs/other/gsissh.md
 delete mode 100644 docs/security/lcmaps-voms-authentication.md
 delete mode 100644 docs/security/user-certs.md

diff --git a/docs/data/gridftp.md b/docs/data/gridftp.md
deleted file mode 100644
index a0f9bb553..000000000
--- a/docs/data/gridftp.md
+++ /dev/null
@@ -1,308 +0,0 @@
title: Installing and Maintaining a GridFTP Server

Installing and Maintaining a GridFTP Server
===========================================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

About This Guide
----------------

This page explains how to install the stand-alone Globus GridFTP server, which is an extension of the File Transfer Protocol (FTP) for grid computing.
The aim of GridFTP is to provide more reliable, high-performance file transfers.

The `osg-gridftp` package contains the components necessary to set up a stand-alone GridFTP server, plus tools used to monitor and report its performance.
A stand-alone GridFTP server might be used under the following circumstances:

- You are serving VOs that use storage heavily (CMS, ATLAS, CDF, and D0) and your site has more than 250 cores
- Your site will be managing more than 50 TB of disk space
- You want a simple front-end to a filesystem allowing access over the WAN, for example NFS

!!! note
    This document is for a standalone GridFTP server on top of POSIX storage. We have two specialized documents
    for Hadoop Distributed File System (HDFS) and XRootD based storage:

    - [Install and configure a GridFTP server on top of HDFS.](install-hadoop.md#gridftp-configuration)
    - [Install and configure a GridFTP server on top of XRootD.](xrootd/install-storage-element.md#optional-installing-a-gridftp-server)

Before Starting
---------------

Before starting the installation process, you will need to fulfill these prerequisites:

- Ensure the host has [a supported operating system](../release/supported_platforms.md)
- Obtain root access to the host
- Prepare [the required Yum repositories](../common/yum.md)
- Install [CA certificates](../common/ca.md)
- SSL Certificate: The GridFTP service uses a host certificate at `/etc/grid-security/hostcert.pem` and an accompanying key at `/etc/grid-security/hostkey.pem`
- Network ports: GridFTP listens on TCP port 2811 and on the range of ports configured by the `GLOBUS_TCP_PORT_RANGE` environment variable (see the Networking table in the Reference section).

Installing GridFTP
------------------

First, you will need to install the GridFTP meta-package:

```console
root@host # yum install osg-gridftp
```

Configuring GridFTP
-------------------

### Configuring authentication

To configure which virtual organizations and users are allowed to use your GridFTP server, follow the instructions in
[the LCMAPS VOMS plugin document](../security/lcmaps-voms-authentication.md#configuring-the-lcmaps-voms-plugin).

### Set port ranges

As mentioned above, GridFTP uses port 2811 for control communication as well as a range of ports for data transfers.
This range of ports has to be defined by setting the variable `GLOBUS_TCP_PORT_RANGE` within the configuration file
`/etc/sysconfig/globus-gridftp-server`, as shown in the next example.
This range has to be open within your firewall for inbound communication.

    :::file
    export GLOBUS_TCP_PORT_RANGE=50000,51000


### Optional configuration

#### Setting transfer limits for GridFTP-HDFS

To set a limit on the total or per-user number of transfers, create `/etc/sysconfig/gridftp-hdfs` and set the following configuration:

    :::file hl_lines="3"
    export GRIDFTP_TRANSFER_LIMIT="80"
    export GRIDFTP_DEFAULT_USER_TRANSFER_LIMIT="50"
    export GRIDFTP_<username>_USER_TRANSFER_LIMIT="40"

In the above configuration:

- There would be no more than 80 transfers going at a time, across all users.
- By default, any single user can have no more than 50 transfers at a time.
- The `<username>` user has a more stringent limit of 40 transfers at a time.


!!! note
    These limits are per GridFTP server. If you have several GridFTP servers at your site, you may want to divide these limits by the number of GridFTP servers.

#### Modifying the environment

Environment variables are stored in `/etc/sysconfig/globus-gridftp-server`, which is sourced on service startup.
If you want to change LCMAPS log levels or GridFTP port ranges, you can edit them there.

```shell
#Uncomment and modify for firewalls
#export GLOBUS_TCP_PORT_RANGE=min,max
#export GLOBUS_TCP_SOURCE_RANGE=min,max
```

Note that the variables `GLOBUS_TCP_PORT_RANGE` and `GLOBUS_TCP_SOURCE_RANGE` can be set here to allow GridFTP to navigate around firewall rules (these affect the inbound and outbound ports, respectively).

To troubleshoot LCMAPS authorization, you can add the following to `/etc/sysconfig/globus-gridftp-server` and choose a higher debug level:

``` file
# level 0: no messages, 1: errors, 2: also warnings, 3: also notices,
# 4: also info, 5: maximum debug
LCMAPS_DEBUG_LEVEL=2
```

Output goes to `/var/log/messages` by default. Do not set the logging level to 5 on any production system, as that may cause the system to slow down significantly or become unresponsive.

#### Configuring a multi-homed server

The GridFTP server uses control connections, data connections, and IPC connections. By default it listens on all interfaces, but this can be changed by editing the configuration file `/etc/gridftp.conf`.

To use a single interface, you can set `hostname` to the hostname or IP address to use:

```text
hostname IP-TO-USE
```

You can also set the `control_interface`, `data_interface`, and `ipc_interface` separately. On systems that have multiple network interfaces, you may want to associate data transfers with the fastest NIC available. This can be done in the GridFTP server by setting `data_interface`:

```text
control_interface IP-TO-USE
data_interface IP-TO-USE
ipc_interface IP-TO-USE
```

For more options available for the GridFTP server, read the comments in the configuration file (`/etc/gridftp.conf`) or
see the [GridFTP manual](https://gridcf.org/gct-docs/latest/gridftp/admin/index.html).


Monitoring
-------------------

### Enabling GridFTP transfer probe

The OSG monitoring of GridFTP is carried out by the GridFTP Gratia probe, which is installed by the `gratia-probe-gridftp-transfer` package.
Assuming you installed GridFTP using the `osg-gridftp` or `osg-se-hadoop-gridftp` RPM, this package will
already be installed.

Here are the most relevant file and directory locations:

| Purpose | Needs Editing? | Location |
|---------------------|----------------|------------------------------------------|
| Probe Configuration | Yes | /etc/gratia/gridftp-transfer/ProbeConfig |
| Probe Executables | No | /usr/share/gratia/gridftp-transfer |
| Log files | No | /var/log/gratia |
| Temporary files | No | /var/lib/gratia/tmp |

The RPM installs the Gratia probe into the system crontab, but does not configure it. The configuration of the probe is controlled by the file

    /etc/gratia/gridftp-transfer/ProbeConfig

This is usually one XML node spread over multiple lines. Note that comments (#) have no effect in this file. You will need to edit the following:

| Attribute | Needs Editing | Value |
|---------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
| ProbeName | Maybe | This should be set to "gridftp-transfer:`<hostname>`", where `<hostname>` is the fully-qualified domain name of your GridFTP host. |
| CollectorHost | Maybe | Set to the hostname and port of the central collector. By default it sends to the OSG collector. See below. |
| SiteName | Yes | Set to the resource group name of your site as registered in OIM. |
| GridftpLogDir | Yes | Set to `/var/log`, or wherever your current GridFTP logs are located. |
| Grid | Maybe | Set to "ITB" if this is a test resource; otherwise, leave as OSG. |
| UserVOMapFile | No | This should be set to `/var/lib/osg/user-vo-map`; see below for information about this file. |
| SuppressUnknownVORecords | Maybe | Set to 1 to suppress any records that can't be matched to a VO; 0 is strongly recommended. |
| SuppressNoDNRecords | Maybe | Set to 1 to suppress records that can't be matched to a DN; 0 is strongly recommended. |
| EnableProbe | Yes | Set to 1 to enable the probe. |

### Selecting a collector host ###

The collector is the central server which logs the GridFTP transfers into a database. There are usually two options:

1. **OSG Transfer Collector**: This is the primary collector for transfers in the OSG. Use `CollectorHost="gratia-osg-prod.opensciencegrid.org:80"`.
1. **OSG-ITB Transfer Collector**: This is the test collector for transfers in the OSG. Use `CollectorHost="gratia-osg-itb.opensciencegrid.org:80"`.


Managing GridFTP
----------------

In addition to the GridFTP service itself, there are a number of supporting services in your installation. The specific services are:

| Software | Service name | Notes |
|:----------|:--------------------------------------|:-------------------------------------------------------------------------------------------------|
| Fetch CRL | `fetch-crl-boot` and `fetch-crl-cron` | See the [CA documentation](../common/ca.md#managing-certificate-revocation-lists) for more info |
| Gratia | `gratia-probes-cron` | Accounting software |
| GridFTP | `globus-gridftp-server` | |


Start the services in the order listed and stop them in reverse order. As a reminder, here are common service commands (all run as `root`):

| To... | Run the command... |
| :-------------------------------------- | :----------------------------- |
| Start a service | `systemctl start <service>` |
| Stop a service | `systemctl stop <service>` |
| Enable a service to start on boot | `systemctl enable <service>` |
| Disable a service from starting on boot | `systemctl disable <service>` |


Validation
------------------

### GridFTP


1. Acquire a [user certificate](../security/user-certs.md)
1. Find your subject DN:

        :::console
        user@host # openssl x509 -in <your certificate> -noout -subject

1. [Map your DN](../security/lcmaps-voms-authentication.md#mapping-users) to a non-root user.
1. As the non-root user, generate your proxy:

        :::console
        user@host # voms-proxy-init

1. Create a test file to be transferred:

        :::console
        user@host # echo "Hello World!" > /tmp/hello_world

1. Transfer the file we just created:

        :::console
        user@host # globus-url-copy file:///tmp/hello_world gsiftp://yourhost.yourdomain/tmp/hello_world

1. To verify that authentication is working, remove your proxy and execute the last command again; this time it should fail:

        :::console
        user@host # voms-proxy-destroy
        user@host # globus-url-copy file:///tmp/hello_world gsiftp://yourhost.yourdomain/tmp/hello_world

!!! warning
    Keep in mind that when invoked as root, `globus-url-copy` will attempt to use the host certificate instead of your user certificate, which could produce confusing results.
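To confirm a transfer on the server side, you can also inspect the tail of the GridFTP transfer and authorization logs (file locations are listed in the [Reference](#reference) section below):

    :::console
    root@host # tail -n 20 /var/log/gridftp.log
    root@host # tail -n 20 /var/log/gridftp-auth.log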
!!! note
    If the binary `globus-url-copy` is not available on your system, you can get it by installing `globus-gass-copy-progs`:

        :::console
        root@host # yum install globus-gass-copy-progs


### Gratia Probe

1. Run the Gratia probe once by hand to check for functionality:

        :::console
        root@host # /usr/share/gratia/gridftp-transfer/gridftp-transfer_meter

1. Look in the log files under `/var/log/gratia/` and make sure there are no error messages printed.
   Look for any abnormal termination and [report it](#getting-help) if it is a non-trivial site issue.



Getting Help
------------

For assistance, please use [this page](../common/help.md).

Reference
---------

- [GridFTP administration manual](https://gridcf.org/gct-docs/latest/gridftp/admin/index.html)
- [GridFTP tutorial](http://www.mcs.anl.gov/~mlink/tutorials/GridFTPTutorialHandout.pdf)

### Configuration and Log Files

| Service/Process | Configuration File | Description |
|:----------------|:--------------------------------------------------------|:----------------------------------------------------------------|
| GridFTP | `/etc/sysconfig/globus-gridftp-server` | Environment variables for GridFTP and LCMAPS |
| | `/usr/share/osg/sysconfig/globus-gridftp-server-plugin` | Where environment variables for the GridFTP plugin are included |
| Gratia Probe | `/etc/gratia/gridftp-transfer/ProbeConfig` | GridFTP Gratia probe configuration |
| Gratia Probe | `/etc/cron.d/gratia-probe-gridftp-transfer.cron` | Cron tab file |

| Service/Process | Log File | Description |
|:----------------|:----------------------------|:--------------------------|
| GridFTP | `/var/log/gridftp.log` | GridFTP transfer log |
| | `/var/log/gridftp-auth.log` | GridFTP authorization log |
| Gratia probe | `/var/log/gratia` | |

### Certificates

| Certificate | User that owns certificate | Path to certificate |
|:-----------------|:---------------------------|:------------------------------------------------------------------------|
| Host certificate | `root` | `/etc/grid-security/hostcert.pem` and `/etc/grid-security/hostkey.pem` |

See [these instructions](../security/host-certs.md) to request a service certificate.

Make sure you have installed the [CA certificates](../common/ca.md).

### Users

For this package to function correctly, you will have to create the users needed for grid operation. Any Unix username that can be mapped by LCMAPS VOMS should be created on the GridFTP host.

For example, VOs newly added to the LCMAPS VOMS configuration will not be able to transfer files until the corresponding Unix user account is created.

### Networking

| Service Name | Protocol | Port Number | Inbound | Outbound | Comment |
|:------------------------|:---------|:--------------------------|:--------|:---------|:-------------------------------------------|
| GridFTP data channels | tcp | `GLOBUS_TCP_PORT_RANGE` | X | | A contiguous range of ports is necessary. |
| GridFTP data channels | tcp | `GLOBUS_TCP_SOURCE_RANGE` | | X | A contiguous range of ports is necessary. |
| GridFTP control channel | tcp | 2811 | X | | |

diff --git a/docs/data/hadoop-overview.md b/docs/data/hadoop-overview.md
deleted file mode 100644
index b72231c20..000000000
--- a/docs/data/hadoop-overview.md
+++ /dev/null
@@ -1,111 +0,0 @@
title: Hadoop Overview

Hadoop Overview
===============

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
- See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details. - -Hadoop Introduction -------------------- - -Hadoop is a data processing framework. -The framework has two main parts - job scheduling and a distributed file system, -the Hadoop Distributed File System (HDFS). - -We currently utilize HDFS as a general-purpose file system. For this document, -we'll use the words "Hadoop" and "HDFS" interchangeably, but it's nice to know -the distinction. - - -The HDFS file system has several features, some of which differ a bit from a typical -file system: - -- Each file is broken up into 64 MB or 128 MB chunks (user configurable) - - These chunks are stored on data nodes and served up from there; - - The central namenode manages block locations, the namespace information, and block placement policies. -- HDFS provides a subset of POSIX semantics: - - Random-access reads and non-random-access writes are fully supported. - - Fsync and appends (after the file has been initially closed) are experimental and not available to OSG-based installs. - - Rewriting closed files is not supported - -Hadoop SE Components --------------------- - -We broadly break down the server components of the Hadoop SE into three categories: HDFS core, Grid extensions, and HDFS -auxiliary. -The components in each of these categories are outlined below: - -- HDFS Core: - - Namenode: The core metadata server of Hadoop. This is the most critical piece of the system, and there can only be one of these. This stores both the file system image and the file system journal. The namenode keeps all of the filesystem layout information (files, blocks, directories, permissions, etc) and the block locations. The filesystem layout is persisted on disk and the block locations are kept solely in memory. When a client opens a file, the namenode tells the client the locations of all the blocks in the file; the client then no longer needs to communicate with the namenode for data transfer. - - Datanode: This node stores copies of the blocks in HDFS. They communicate with the namenode to perform "housekeeping" such as creating new replicas, transferring blocks between datanodes, and deleting excess blocks. They also communicate with the clients to transfer data. To reach the best scalability, there should be as many datanodes as possible. -- Grid extensions - - Globus GridFTP: The standard GridFTP from Globus. We use a plug-in module (using the Globus Direct Storage Interface) that allows the GridFTP process to use the HDFS C-bindings directly. - - Gratia probe: Gratia is an accounting system that records batch system and transfer records to a database. The records are collected by a client program called a "probe" which runs on the GridFTP or XRootD server. The probe parses the GridFTP or XRootD logs and generates transfer records. - - XRootD server plugin: XRootD is an extremely flexible and powerful data server popular in the high energy physics community. There exists a HDFS plugin for XRootD; integrating with XRootD provides a means to export HDFS securely outside the local cluster, as another XRootD plugin provides GSI-based authentication and authorization. -- HDFS auxiliary: - - "Secondary Namenode": Perhaps more aptly called a "checkpoint server". This server downloads the file system image and journal from the namenode, merges the two together, and uploads the new file system image up to the namenode. 
This is done on a different server in order to reduce the memory footprint of the namenode.
    - Hadoop Balancer: This is a script (unlike the others, which are daemons) that runs on the namenode. It requests transfers of random blocks between the datanodes. This works until all datanodes have approximately the same percentage of free space. Well-balanced datanodes are necessary for having a healthy cluster.

In addition to the server components, there are two client components:

- FUSE: This allows HDFS to be mounted as a filesystem on the worker nodes. FUSE is a Linux kernel module that allows kernel I/O calls to be translated into a call to a userspace program. In this case, a program called fuse_dfs translates the POSIX calls into HDFS C-binding calls.
- Hadoop Command Line Client: This command line client exposes a lot of the Unix-like calls without mounting FUSE, plus access to the non-POSIX calls (such as setting quotas and file replication levels). For example, "hadoop fs -ls /" is equivalent to "ls /mnt/hadoop" if /mnt/hadoop is the mount point of HDFS.

Hardware Recommendations
--------------------

- Namenode: We recommend at least 8GB of RAM (the minimum is 2GB of RAM), preferably 16GB or more. A rough rule of thumb is 1GB per 100TB of raw disk space; the actual requirement is around 1GB per million objects (files, directories, and blocks). The CPU requirement is any modern multi-core server CPU. Typically, the namenode will only use 2-5% of your CPU.
    As this is a single point of failure, the **most important** requirement is reliable hardware rather than high-performance hardware. We suggest a node with redundant power supplies and at least 2 hard drives.
- Secondary namenode: This node needs the same amount of RAM as the namenode for merging namespaces. It does not need to be high performance or high reliability.
- Datanode: Each datanode should plan to dedicate about 1-1.5 GB of RAM to HDFS. A general rule of thumb is to dedicate 1 CPU to HDFS per 5TB of disk capacity under heavy load; clusters with moderate load (i.e., mostly sequential workflows) will need less. At idle, HDFS will consume almost no CPU.

Sizing Your Cluster
---------------------

The minimal installation would involve 5 nodes:

- hadoop-name: The namenode for the Hadoop system.
- hadoop-name2: This will run the HDFS secondary namenode.
- hadoop-data1, hadoop-data2: Two HDFS datanodes. They will hold data for the system, so they should have sizable hard drives. As the Hadoop installation grows to many terabytes, this will be the only class of nodes one adds.
- hadoop-grid: Runs the Globus GridFTP server.

If desired, hadoop-name and hadoop-name2 may be virtualized.
Prior to installation, DNS / host name resolution **must** work.
That is, you should be able to resolve all the Hadoop servers either through DNS or /etc/hosts.
Because of the grid software, hadoop-grid **must** have reverse DNS working.

Larger clusters have the same basic components but with more HDFS datanodes and GridFTP servers.
Adding HDFS datanodes increases the capacity and the number of IOPS the cluster can provide.
Additional GridFTP servers will increase the data transfer rates to locations outside your data center.
As your cluster increases in size, virtualized namenodes may need to be moved to physical hardware.

Hadoop Security
---------------

HDFS has Unix-like user/group authorization, but no strict authentication.
**HDFS should use a secure internal network which only non-malicious users are able to access**.
For users with access to the local cluster, it is not difficult to bypass authentication.

[The default ports are listed here](http://www.cloudera.com/blog/2009/08/14/hadoop-default-ports-quick-reference/).

There are some ways to improve the security of your cluster:

- Keep the namenode behind a firewall. One possibility is to run Hadoop entirely on the private subnet of a cluster.
- Use firewalls to protect the HDFS ports (the defaults for the datanode are 50010 and 50075; for the namenode, 50070 and 9000).
- For clusters utilizing FUSE, one can block outgoing connections to the HDFS ports except for user root. This means that only root-owned processes (such as FUSE-DFS) will be able to access Hadoop.
    - This is sufficient for grid environments, but does not protect one in the case where the attacker has physical access to the network switch.
- There exists another option, currently untested. It is possible to limit all HDFS socket connections to SSL-based sockets. Using this to only allow known hosts to connect to HDFS, and only allowing FUSE-DFS to connect on those known hosts, one might be able to satisfy even fairly stringent security folks (but not paranoid ones).

There are three options to export your data outside your cluster:

- Globus GridFTP.
- XRootD.
- HTTP and HTTPS. OSG utilizes the HTTP(S) protocol implementation built into the XRootD server.


References
----------

- [Hadoop Architecture](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)

diff --git a/docs/data/install-hadoop.md b/docs/data/install-hadoop.md
deleted file mode 100644
index e5d427010..000000000
--- a/docs/data/install-hadoop.md
+++ /dev/null
@@ -1,1014 +0,0 @@
title: Installing and Maintaining HDFS

Installing and Maintaining HDFS
===============================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

[Hadoop Distributed File System](http://hadoop.apache.org/) (HDFS) is a scalable, reliable distributed file system developed in the Apache project. It is based on the map-reduce framework and the design of the Google file system. The OSG distribution of Hadoop includes all components needed to operate a multi-terabyte storage site.

The purpose of this document is to provide Hadoop-based Storage Element administrators the information on how to prepare,
install, and validate OSG storage based on the Hadoop Distributed File System (HDFS).
The OSG supports a patched version of HDFS from Cloudera's CDH5 distribution.

!!! note
    The OSG only supports HDFS on EL7 hosts.

Before Starting
---------------

Before starting the installation process, consider the following points (consulting [the Reference section below](#references) as needed):

- **User IDs:** If they do not exist already, the installation will create the Linux users `hdfs` and `zookeeper` on all nodes
  as well as `hadoop` and `mapred` on the NameNodes
- **Firewall:** In the OSG, HDFS is intended to run as an internal service without any direct, external access to any of the nodes.
  For more information on the ports used for communication between the various HDFS nodes, see the
  [Cloudera documentation](https://www.cloudera.com/documentation/cdh/5-0-x/CDH5-Installation-Guide/cdh5ig_ports_cdh5.html); a firewall sketch follows this list.
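For example, if you choose to firewall the HDFS ports rather than run on a fully private network, something along the following lines could be used. This sketch assumes `firewalld` (the EL7 default) and a hypothetical cluster subnet of `192.168.1.0/24`; the ports are the defaults mentioned in the [Hadoop Security](hadoop-overview.md#hadoop-security) section:

    :::console
    root@host # # allow HDFS traffic only from the cluster subnet
    root@host # firewall-cmd --permanent --new-zone=hdfs
    root@host # firewall-cmd --permanent --zone=hdfs --add-source=192.168.1.0/24
    root@host # # NameNode ports (web UI and filesystem metadata)
    root@host # firewall-cmd --permanent --zone=hdfs --add-port=50070/tcp --add-port=9000/tcp
    root@host # # DataNode ports (data transfer, IPC, and HTTP)
    root@host # firewall-cmd --permanent --zone=hdfs --add-port=50010/tcp --add-port=50020/tcp --add-port=50075/tcp
    root@host # firewall-cmd --reload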
As with all OSG software installations, there are some one-time (per host) steps to prepare in advance:

- Ensure the host has [a supported operating system](../release/supported_platforms.md)
- Obtain root access to the host
- Prepare the [required Yum repositories](../common/yum.md)

Designing Your HDFS Cluster
---------------------------

There are several important components to an HDFS installation:

- **NameNode**: The NameNode functions as the directory server and coordinator of the HDFS cluster.
  It houses all the meta-data for the hadoop cluster.
- **Secondary NameNode (optional)**: This is a secondary machine that periodically merges updates to the HDFS file
  system back into the `fsimage`.
  It must share a directory with the primary NameNode to exchange filesystem checkpoints.
  An HDFS installation with a Secondary NameNode dramatically improves startup and restart times.
- **DataNode**: You will have many DataNodes. Each DataNode stores large blocks of files for the hadoop cluster.
- **Client**: This is a documentation shorthand that refers to any machine with the hadoop client commands or a
  [FUSE](https://en.wikipedia.org/wiki/Filesystem_in_Userspace) mount.

Installing HDFS
---------------

An OSG HDFS installation consists of HDFS and other support software (e.g., Gratia accounting).
To simplify installation, OSG provides convenience RPMs that install all required software.

1. Clean the yum cache:

        :::console
        root@host # yum clean all --enablerepo=*

1. Update software:

        :::console
        root@host # yum update

    This command will update **all** packages.

1. Install the relevant packages based on the node you are installing:

    | If you are installing a(n)... | Then run the following command... |
    | :---------------------------- | :-------------------------------- |
    | Primary NameNode | `yum install osg-se-hadoop-namenode` |
    | Secondary NameNode | `yum install osg-se-hadoop-secondarynamenode` |
    | DataNode | `yum install osg-se-hadoop-datanode` |


Upgrading HDFS
--------------

This section will guide you through the process of upgrading an HDFS 2.0.0 installation from OSG 3.3 to HDFS 2.6.0
from OSG 3.5.

!!! warning
    The upgrade process will involve downtime for your HDFS cluster. Please plan accordingly.

!!! note
    The OSG only offers HDFS 2.6.0 for EL7 hosts.

The upgrade process occurs in several steps:

1. [Preparing for the upgrade](#preparing-for-the-upgrade)
1. [Updating to OSG 3.5](#updating-to-osg-35)
1. [Upgrading the Primary NameNode](#upgrading-the-primary-namenode)
1. [Upgrading the DataNodes](#upgrading-the-datanodes)
1. [Upgrading the Secondary NameNode](#upgrading-the-secondary-namenode)
1. [Finalizing the upgrade](#finalizing-the-upgrade)

### Preparing for the upgrade ###

Before upgrading, back up your configuration data and HDFS metadata.

1. Put your Primary NameNode into safe mode:

        :::console
        root@primary-namenode # hdfs dfsadmin -safemode enter
        Safe mode is ON

1. Save a clean copy of your HDFS namespace:

        :::console
        root@primary-namenode # hdfs dfsadmin -saveNamespace
        Save namespace successful

1. Shut down the HDFS services on all of your HDFS nodes (see [this section](#running-services) for instructions).

1. On the Primary NameNode, verify that your NameNode service is off:

        :::console
        root@primary-namenode # /etc/init.d/hadoop-hdfs-namenode status

    This command should indicate that your NameNode service is not running.
1. Find the location of the directory with the HDFS metadata:

        :::console
        root@primary-namenode # grep -C1 dfs.namenode.name.dir /etc/hadoop/conf/hdfs-site.xml

    And look for the value of `dfs.namenode.name.dir`:

        :::xml
        <property>
            <name>dfs.namenode.name.dir</name>
            <value>file:///var/lib/dfs/nn,file:///home/hadoop/dfs/nn</value>
        </property>

1. Back up the directory that appears in the output using your backup method of choice.
   If more than one directory appears in the list (as in the example above), choose the most convenient directory.
   All of the directories in the list will have the same contents.

### Updating to OSG 3.5 ###

Once your HDFS services have been turned off and the HDFS metadata has been backed up, update each node to OSG 3.5 by
following the instructions in [this section](../release/updating-to-osg-35.md).

### Upgrading the Primary NameNode ###

To upgrade your Primary NameNode, update all relevant packages, then run the upgrade command.

1. Clear the yum cache:

        :::console
        root@primary-namenode # yum clean all --enablerepo=*

1. Update the HDFS RPMs:

        :::console
        root@primary-namenode # yum update osg-se-hadoop-namenode --enablerepo=osg-upcoming

1. Run the upgrade command:

        :::console
        root@primary-namenode # /etc/init.d/hadoop-hdfs-namenode upgrade

    This will start the upgrade process for the HDFS metadata on your Primary NameNode.
    You can follow the process by running:

        :::console
        root@primary-namenode # tail -f /var/log/hadoop-hdfs/hadoop-hdfs-namenode-<hostname>.log

### Upgrading the DataNodes ###

Once the Primary NameNode has completed its upgrade process, start the process of upgrading each of your DataNodes.

1. Clear the yum cache:

        :::console
        root@datanode # yum clean all --enablerepo=*

1. Update the HDFS RPMs:

        :::console
        root@datanode # yum update osg-se-hadoop-datanode --enablerepo=osg-upcoming

1. Start the DataNode service:

        :::console
        root@datanode # /etc/init.d/hadoop-hdfs-datanode start

1. After all the DataNodes have been brought back up, the Primary NameNode should exit safe mode automatically.
   On the Primary NameNode, run the following command to verify that it is no longer in safe mode:

        :::console
        root@primary-namenode # hdfs dfsadmin -safemode get
        Safe mode is OFF



### Upgrading the Secondary NameNode ###

!!! note
    This section only applies to sites with a Secondary NameNode.
    If you do not run a Secondary NameNode, skip to the [next section](#finalizing-the-upgrade).

Once the Primary NameNode has exited safe mode, start the process of upgrading your Secondary NameNode.

1. Clear the yum cache:

        :::console
        root@secondary-namenode # yum clean all --enablerepo=*

1. Update the HDFS RPMs:

        :::console
        root@secondary-namenode # yum update osg-se-hadoop-secondarynamenode --enablerepo=osg-upcoming

1. Start the Secondary NameNode service:

        :::console
        root@secondary-namenode # /etc/init.d/hadoop-hdfs-secondarynamenode start

### Finalizing the upgrade ###

1. Verify that the HDFS cluster is running correctly by following the instructions in [this section](#validation).

1. Finalize the upgrade from the Primary NameNode:

        :::console
        root@primary-namenode # hdfs dfsadmin -finalizeUpgrade
        Finalize upgrade successful

Configuring HDFS
----------------

!!! note
    Needed by: Hadoop NameNode, Hadoop DataNodes, Hadoop client, GridFTP

Hadoop configuration is needed by every node in the hadoop cluster.
However, in most cases, you can do the configuration once and copy it to all nodes in the cluster (possibly using your favorite configuration management tool). Special configuration for the various components is given in the sections below.

Hadoop configuration is stored in `/etc/hadoop/conf`. However, by default, these files are mostly blank. OSG provides a sample configuration in `/etc/hadoop/conf.osg` with the most common values filled in. You will need to copy these files into `/etc/hadoop/conf` before they become active. Please let us know if there are any common values that should be added or changed across the whole grid. You will likely need to modify `hdfs-site.xml` and `core-site.xml`. Review all the settings in these files; listed below are common settings to modify:

| File | Setting | Example | Comments |
|-----------------|----------------------------|----------------------------------|-------------------------------------------------------------------------------------------|
| `core-site.xml` | fs.default.name | hdfs://namenode.domain.tld.:9000 | This is the address of the NameNode |
| `core-site.xml` | hadoop.tmp.dir | /data/scratch | Scratch temp directory used by Hadoop |
| `core-site.xml` | hadoop.log.dir | /var/log/hadoop-hdfs | Log directory used by Hadoop |
| `core-site.xml` | dfs.umaskmode | 002 | umask for permissions used by default |
| `hdfs-site.xml` | dfs.block.size | 134217728 | Block size: 128MB by default |
| `hdfs-site.xml` | dfs.replication | 2 | Default replication factor. Generally the same as dfs.replication.min/max |
| `hdfs-site.xml` | dfs.datanode.du.reserved | 100000000 | How much free space hadoop will reserve for non-Hadoop usage |
| `hdfs-site.xml` | dfs.datanode.handler.count | 20 | Number of server threads for DataNodes. Increase if you have many more client connections |
| `hdfs-site.xml` | dfs.namenode.handler.count | 40 | Number of server threads for NameNodes. Increase if you need more connections |
| `hdfs-site.xml` | dfs.http.address | namenode.domain.tld.:50070 | Web address for the dfs health monitoring page |

See the Apache Hadoop configuration reference for more parameters to configure.

!!! note
    NameNodes must have a `/etc/hosts_exclude` file present.

#### Special NameNode instructions for brand new installs

If this is a new installation (**and only if this is a brand new installation**), you should run the following command as the `hdfs` user (otherwise, be sure to `chown` your storage directory to `hdfs` after running):

``` console
hadoop namenode -format
```

This will initialize the storage directory on your NameNode.

### (optional) FUSE Client Configuration ###

A FUSE mount is required on any node where you would like to use standard POSIX-like commands on the Hadoop filesystem. FUSE (or "Filesystem in Userspace") is a way to access HDFS using typical UNIX directory commands (i.e., POSIX-like access). Note that not all advanced functions of a fully POSIX-compliant file system are necessarily available.

FUSE is typically installed as part of this installation, but if you are running a customized or non-standard system, make sure that the fuse kernel module is installed and loaded with `modprobe fuse`.

You can have FUSE mounted at boot time by adding the following line to `/etc/fstab`:

``` file
hadoop-fuse-dfs#<mount point> fuse server=<namenode>,port=9000,rdbuffer=131072,allow_other 0 0
```

Be sure to change `<mount point>` and `<namenode>` to match your local configuration. To match the help documents, we recommend using `/mnt/hadoop` as your mount point.
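For illustration, with a hypothetical NameNode named `namenode.example.com` and the recommended `/mnt/hadoop` mount point, the line would read:

``` file
hadoop-fuse-dfs#/mnt/hadoop fuse server=namenode.example.com,port=9000,rdbuffer=131072,allow_other 0 0
```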
Once your `/etc/fstab` is updated, mount FUSE by running:

``` console
root@host # mkdir /mnt/hadoop
root@host # mount /mnt/hadoop
```

When mounting the HDFS FUSE mount, you will see the following harmless warnings printed to the screen:

``` console
# mount /mnt/hadoop
INFO fuse_options.c:162 Adding FUSE arg /mnt/hadoop
INFO fuse_options.c:110 Ignoring option allow_other
```

If you have trouble mounting FUSE, refer to [Running FUSE in Debug Mode](#running-fuse-in-debug-mode) in the Troubleshooting section.

### Creating VO and User Areas ###

!!! note
    Grid users are needed by GridFTP nodes. VO areas are common to all nodes.

For this package to function correctly, you will have to create the users needed for grid operation. Any user that can be authenticated should be created.

For grid-mapfile users, each line of the grid-mapfile is a certificate/user pair. Each user in this file should be created on the server.

Note that these users must be kept in sync with the authentication method.

Prior to starting basic day-to-day operations, it is important to create dedicated areas for each VO and/or user. This is similar to user management in simple UNIX filesystems. Create (and maintain) usernames and groups with UIDs and GIDs on **all nodes**. These are maintained in basic system files such as `/etc/passwd` and `/etc/group`.

!!! note
    The examples below assume a FUSE mount at `/mnt/hadoop`. Alternatively, the equivalent `hadoop fs` commands could be used.

For clean HDFS operations and filesystem management:

(a) Create top-level VO subdirectories under `/mnt/hadoop`.

Example:

``` console
root@host # mkdir /mnt/hadoop/cms
root@host # mkdir /mnt/hadoop/dzero
root@host # mkdir /mnt/hadoop/sbgrid
root@host # mkdir /mnt/hadoop/fermigrid
root@host # mkdir /mnt/hadoop/cmstest
root@host # mkdir /mnt/hadoop/osg
```

(b) Create individual top-level user areas, under each VO area, as needed.

``` console
root@host # mkdir -p /mnt/hadoop/cms/store/user/tanyalevshina
root@host # mkdir -p /mnt/hadoop/cms/store/user/michaelthomas
root@host # mkdir -p /mnt/hadoop/cms/store/user/brianbockelman
root@host # mkdir -p /mnt/hadoop/cms/store/user/douglasstrain
root@host # mkdir -p /mnt/hadoop/cms/store/user/abhisheksinghrana
```

(c) Adjust username:group ownership of each area.

``` console
root@host # chown -R cms:cms /mnt/hadoop/cms
root@host # chown -R sam:sam /mnt/hadoop/dzero

root@host # chown -R michaelthomas:cms /mnt/hadoop/cms/store/user/michaelthomas
```

### GridFTP Configuration ###

gridftp-hdfs reads the Hadoop configuration file to learn how to talk to Hadoop.
By now, you should have followed the instructions for installing Hadoop detailed in the previous sections and
created the proper users and directories.

The default settings in `/etc/gridftp.conf` along with `/etc/gridftp.d/gridftp-hdfs.conf` are used by the init.d script
and should be OK for most installations.
The file `/etc/gridftp-hdfs/gridftp-debug.conf` is used by `/usr/bin/gridftp-hdfs-standalone` for starting the
GridFTP server in a testing mode.
Any additional config files under `/etc/gridftp.d` will be used for both the init.d and standalone GridFTP servers.
`/etc/sysconfig/gridftp-hdfs` contains additional site-specific environment variables that are used by the gridftp-hdfs
DSI module in both the init.d and standalone GridFTP server.
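As a point of reference, a minimal sketch of `/etc/sysconfig/gridftp-hdfs` might look like the following; all values here are illustrative, and the options are described in the table below:

``` file
# convert FUSE paths in incoming URLs to native Hadoop paths
export GRIDFTP_HDFS_MOUNT_POINT=/mnt/hadoop
# spill reordering buffers to a roomier scratch area than /tmp
export TMPDIR=/data/gridftp-tmp
# raise the per-process open-files limit for busy transfer servers
ulimit -n 16384
```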
Some of the environment variables that can be used in `/etc/sysconfig/gridftp-hdfs` include:

| Option Name | Needs Editing? | Description |
|-----------------------------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `GRIDFTP_HDFS_REPLICA_MAP` | No | File containing a list of paths and replica values for setting the default number of replicas for specific file paths |
| `GRIDFTP_BUFFER_COUNT` | No | The number of 1MB memory buffers used to reorder data streams before writing them to Hadoop |
| `GRIDFTP_FILE_BUFFER_COUNT` | No | The number of 1MB file-based buffers used to reorder data streams before writing them to Hadoop |
| `GRIDFTP_SYSLOG` | No | Set this to 1 if you want to send transfer activity data to syslog (only used for the HadoopViz application) |
| `GRIDFTP_HDFS_CHECKSUMS` | Maybe | List of checksum calculations to perform on-the-fly (default: `"MD5,ADLER32,CRC32,CKSUM,CVMFS"`) |
| `GRIDFTP_HDFS_MOUNT_POINT` | Maybe | The location of the FUSE mount point used during the Hadoop installation. Defaults to /mnt/hadoop. This is needed so that gridftp-hdfs can convert FUSE paths in the incoming URL to native Hadoop paths. **Note:** this does not imply you need FUSE mounted on GridFTP nodes! |
| `GRIDFTP_LOAD_LIMIT` | No | GridFTP will refuse to start new transfers if the load on the GridFTP host is higher than this number; defaults to 20. |
| `TMPDIR` | Maybe | The temp directory where the file-based buffers are stored. Defaults to /tmp. |

`/etc/sysconfig/gridftp-hdfs` is also a good place to increase per-process resource limits. For example, many installations will require more than the default number of open files (`ulimit -n`).

Lastly, you will need to configure an authentication mechanism for GridFTP.

#### Configuring authentication ####

For information on how to configure authentication for your GridFTP installation, please refer to the [configuring authentication section of the GridFTP guide](gridftp.md#configuring-authentication).

### GridFTP Gratia Transfer Probe Configuration ###

!!! note
    Needed by GridFTP nodes only.

See the [GridFTP documentation](gridftp.md#enabling-gridftp-transfer-probe) for configuration details.

### Hadoop Storage Probe Configuration ###

!!! note
    This is only needed by the Hadoop NameNode.

Here are the most relevant file and directory locations:

| Purpose | Needs Editing? | Location |
|---------------------|----------------|---------------------------------------------------------|
| Probe Configuration | Yes | /etc/gratia/hadoop-storage/ProbeConfig |
| Probe Executable | No | /usr/share/gratia/hadoop-storage/hadoop\_storage\_probe |
| Log files | No | /var/log/gratia |
| Temporary files | No | /var/lib/gratia/tmp |

The RPM installs the Gratia probe into the system crontab, but does not configure it. The configuration of the probe is controlled by two files:

    /etc/gratia/hadoop-storage/ProbeConfig
    /etc/gratia/hadoop-storage/storage.cfg

#### ProbeConfig ####

This is usually one XML node spread over multiple lines. Note that comments (#) have no effect in this file.
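Abridged, and with hypothetical attribute values, that node looks something like this:

``` file
<ProbeConfiguration
    CollectorHost="gratia-osg-prod.opensciencegrid.org:80"
    SiteName="MY_SITE_SE"
    Grid="OSG"
    EnableProbe="1"
/>
```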
You will need to edit the following: - -| Attribute | Needs Editing | Value | -|---------------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------| -| CollectorHost | Maybe | Set to the hostname and port of the central collector. By default it sends to the OSG collector. You probably do not want to change it. | -| SiteName | Yes | Set to the resource group name of your SE as registered in OIM. | -| Grid | Maybe | Set to "ITB" if this is a test resource; otherwise, leave as OSG. | -| EnableProbe | Yes | Set to 1 to enable the probe. | - -#### storage.cfg #### - -This file controls which paths in HDFS should be monitored. This is in the Windows INI format. - -**Note: for the current version of the storage.cfg, there is an error, and you may need to delete the "probe/" subdirectory for the ProbeConfig location** - -``` file -ProbeConfig = /etc/gratia/probe/hadoop-storage/ProbeConfig -``` - -For each logical "area" (arbitrarily defined by you), specify both a given name and a list of paths that belong to that area. Unix globs are accepted. - -To configure an area named "CMS /store" that monitors the space usage in the paths /user/cms/store/\*, one would add the following to the storage.cfg file. - -``` file -[Area CMS /store] -Name = CMS /store -Path = /user/cms/store/* -Trim = /user/cms -``` - -For each such area, add a section to your configuration file. - -##### Example file ##### - -Below is a configuration file that includes three distinct areas. Note that you shouldn't have to touch the \[Gratia\] section if you edited the ProbeConfig above: - -``` file -[Gratia] -gratia_location = /opt/vdt/gratia -ProbeConfig = %(gratia_location)s/probe/hadoop-storage/ProbeConfig - -[Area /store] -Name = CMS /store -Path = /store/* - -[Area /store/user] -Name = CMS /store/user -Path = /store/user/* - -[Area /user] -Name = Hadoop /user -Path = /user/* -``` - -\***NOTE These lines in the \[gratia\] section are wrong and need to be changed to the following by hand for now until the rpm is updated:** - -``` file -gratia_location = /etc/gratia -ProbeConfig = %(gratia_location)s/hadoop-storage/ProbeConfig -``` - -Running Services ----------------- - - -Start the services in the order listed and stop them in reverse order. As a reminder, here are common service commands (all run as `root`): - -| To... | Run the command... | -| :-------------------------------------- | :-------------------------------------------- | -| Start a service | `systemctl start ` | -| Stop a service | `systemctl stop ` | -| Enable a service to start on boot | `systemctl enable ` | -| Disable a service from starting on boot | `systemctl disable ` | - - -The relevant service for each node is as follows: - -| Node | Service | -| :----------------- | :---------------------------- | -| Primary NameNode | hadoop-hdfs-namenode | -| Secondary NameNode | hadoop-hdfs-secondarynamenode | -| DataNode | hadoop-hdfs-datanode | -| GridFTP | globus-gridftp-server | - - - -Validation ----------- - -The first thing you may want to do after installing and starting your primary NameNode is to verify that the web interface works. In your web browser go to: - -``` file -http://:50070/dfshealth.jsp -``` - -Change `` for the hostname of your Primary NameNode. Get familiar with Hadoop commands. -Run hadoop with no arguments to see the list of commands. - -
- Show detailed ouput -

-``` console -user$ hadoop -Usage: hadoop [--config confdir] COMMAND -where COMMAND is one of: - namenode -format format the DFS filesystem - secondarynamenode run the DFS secondary namenode - namenode run the DFS namenode - datanode run a DFS datanode - dfsadmin run a DFS admin client - mradmin run a Map-Reduce admin client - fsck run a DFS filesystem checking utility - fs run a generic filesystem user client - balancer run a cluster balancing utility - fetchdt fetch a delegation token from the NameNode - jobtracker run the MapReduce job Tracker node - pipes run a Pipes job - tasktracker run a MapReduce task Tracker node - job manipulate MapReduce jobs - queue get information regarding JobQueues - version print the version - jar run a jar file - distcp copy file or directories recursively - archive -archiveName NAME -p * create a hadoop archive - oiv apply the offline fsimage viewer to an fsimage - classpath prints the class path needed to get the - Hadoop jar and the required libraries - daemonlog get/set the log level for each daemon - or - CLASSNAME run the class named CLASSNAME -Most commands print help when invoked w/o parameters. -``` -

-
- -For a list of supported filesystem commands: - -
- Show 'hadoop fs' detailed ouput -

-``` console -user$ hadoop fs -Usage: java FsShell - [-ls ] - [-lsr ] - [-df []] - [-du ] - [-dus ] - [-count[-q] ] - [-mv ] - [-cp ] - [-rm [-skipTrash] ] - [-rmr [-skipTrash] ] - [-expunge] - [-put ... ] - [-copyFromLocal ... ] - [-moveFromLocal ... ] - [-get [-ignoreCrc] [-crc] ] - [-getmerge [addnl]] - [-cat ] - [-text ] - [-copyToLocal [-ignoreCrc] [-crc] ] - [-moveToLocal [-crc] ] - [-mkdir ] - [-setrep [-R] [-w] ] - [-touchz ] - [-test -[ezd] ] - [-stat [format] ] - [-tail [-f] ] - [-chmod [-R] PATH...] - [-chown [-R] [OWNER][:[GROUP]] PATH...] - [-chgrp [-R] GROUP PATH...] - [-help [cmd]] - -Generic options supported are --conf specify an application configuration file --D use value for given property --fs specify a namenode --jt specify a job tracker --files specify comma separated files to be copied to the map reduce cluster --libjars specify comma separated jar files to include in the classpath. --archives specify comma separated archives to be unarchived on the compute machines. - -The general command line syntax is -bin/hadoop command [genericOptions] [commandOptions] -``` -

-
- -An online guide is also available at [Apache Hadoop commands manual](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CommandsManual.html). You can use Hadoop commands to perform filesystem operations with more consistency. - -Example, to look into the internal hadoop namespace: - -``` console -user$ hadoop fs -ls / -Found 1 items -drwxrwxr-x - engage engage 0 2011-07-25 06:32 /engage -``` - -Example, to adjust ownership of filesystem areas (there is usually no need to specify the mount itself `/mnt/hadoop` in Hadoop commands): - -``` console -root@host # hadoop fs -chown -R engage:engage /engage -``` - -Example, compare `hadoop fs` command vs. using FUSE mount: - -``` console -user$ hadoop fs -ls /engage -Found 3 items --rw-rw-r-- 2 engage engage 733669376 2011-06-15 16:55 /engage/CentOS-5.6-x86_64-LiveCD.iso --rw-rw-r-- 2 engage engage 215387183 2011-06-15 16:28 /engage/condor-7.6.1-x86_rhap_5-stripped.tar.gz --rw-rw-r-- 2 engage engage 9259360 2011-06-15 16:32 /engage/glideinWMS_v2_5_1.tgz - -user$ ls -l /mnt/hadoop/engage -total 935855 --rw-rw-r-- 1 engage engage 733669376 Jun 15 16:55 CentOS-5.6-x86_64-LiveCD.iso --rw-rw-r-- 1 engage engage 215387183 Jun 15 16:28 condor-7.6.1-x86_rhap_5-stripped.tar.gz --rw-rw-r-- 1 engage engage 9259360 Jun 15 16:32 glideinWMS_v2_5_1.tgz -``` - -### GridFTP Validation ### - -!!! note - The commands used to verify GridFTP below assume you have access to a node where you can first generate a valid proxy using `voms-proxy-init` or `grid-proxy-init`. Obtaining grid credentials is beyond the scope of this document. - -``` console -user$ globus-url-copy file:///home/users/jdost/test.txt gsiftp://devg-7.t2.ucsd.edu:2811/mnt/hadoop/engage/test.txt -``` - -If you are having troubles running GridFTP refer to [Starting GridFTP in Standalone Mode](#starting-gridftp-in-standalone-mode) in the Troubleshooting section. - -Troubleshooting ---------------- - -### Hadoop ### - -To view all of the currently configured settings of Hadoop from the web interface, enter the following url in your browser: - -``` file -http://:50070/conf -``` - -Change `` for the hostname of your Primary NameNode. You will see the entire configuration in XML -format, for example: - -
<details>
<summary>Expand XML configuration</summary>

``` file
<configuration>
<property><name>fs.s3n.impl</name><value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value></property>
<property><name>mapred.task.cache.levels</name><value>2</value></property>
<property><name>map.sort.class</name><value>org.apache.hadoop.util.QuickSort</value></property>
<property><name>hadoop.tmp.dir</name><value>/data1/hadoop//scratch</value></property>
<property><name>hadoop.native.lib</name><value>true</value></property>
<property><name>fs.default.name</name><value>hdfs://nagios.t2.ucsd.edu:9000</value></property>
<property><name>dfs.block.size</name><value>134217728</value></property>
<property><name>dfs.data.dir</name><value>/data1/hadoop//data</value></property>
<property><name>dfs.name.dir</name><value>${hadoop.tmp.dir}/dfs/name</value></property>
<property><name>dfs.name.edits.dir</name><value>${dfs.name.dir}</value></property>
<property><name>dfs.replication</name><value>2</value></property>
<property><name>dfs.replication.min</name><value>1</value></property>
<property><name>dfs.replication.max</name><value>32</value></property>
<property><name>dfs.datanode.du.reserved</name><value>10000000000</value></property>
<property><name>dfs.permissions</name><value>true</value></property>
<property><name>dfs.permissions.supergroup</name><value>root</value></property>
<property><name>dfs.http.address</name><value>nagios:50070</value></property>
<property><name>dfs.secondary.http.address</name><value>gratia-1:50090</value></property>
<property><name>dfs.hosts.exclude</name><value>/etc/hadoop-0.20/conf/hosts_exclude</value></property>
<property><name>hadoop.log.dir</name><value>/var/log/hadoop</value></property>
<property><name>fs.checkpoint.dir</name><value>/var/hadoop/checkpoint-a</value></property>
<property><name>fs.checkpoint.period</name><value>3600</value></property>
<property><name>fs.checkpoint.size</name><value>67108864</value></property>
[...]
</configuration>
```
</details>
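
The same configuration dump can also be fetched from the command line, which is handy on a headless host. This is a sketch assuming `curl` and `grep` are available and that `<namenode>` is replaced as above:

``` console
user$ curl -s http://<namenode>:50070/conf | grep dfs.replication
```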
Please refer to the [Apache Hadoop documentation](https://hadoop.apache.org/docs/r2.6.0/) for answers to common questions and concerns.

### FUSE ###

#### Notes on Building a FUSE Module ####

If you are running a custom kernel, then be sure to enable the `fuse` module with `CONFIG_FUSE_FS=m` in your kernel config. Building and installing a `fuse` kernel module for your custom kernel is beyond the scope of this document.

#### Running FUSE in Debug Mode ####

To start the FUSE mount in debug mode, you can run the FUSE mount command by hand:

``` console
root@host # /usr/bin/hadoop-fuse-dfs /mnt/hadoop -o rw,server=<namenode>,port=9000,rdbuffer=131072,allow_other -d
```

Replace `<namenode>` with the hostname of your Primary NameNode. Debug output will be printed to stderr, which
you will probably want to redirect to a file. Most FUSE-related problems can be tackled by reading through the stderr output and looking for error messages.

### GridFTP ###

#### Starting GridFTP in Standalone Mode ####

If you would like to test the gridftp-hdfs server in a debug standalone mode, you can run the command:

``` console
root@host # gridftp-hdfs-standalone
```

The standalone server runs on port 5002, handles a single GridFTP request, and will log output to stdout/stderr.

### File Locations ###

| Component | File Type                 | Location                                                              | Needs editing?                    |
|-----------|---------------------------|-----------------------------------------------------------------------|-----------------------------------|
| Hadoop    | Log files                 | `/var/log/hadoop/*`                                                   | No                                |
|           | PID files                 | `/var/run/hadoop/*.pid`                                               | No                                |
|           | init scripts              | `/etc/init.d/hadoop`                                                  | No                                |
|           | init script config file   | `/etc/sysconfig/hadoop`                                               | Yes                               |
|           | runtime config files      | `/etc/hadoop/conf/*`                                                  | Maybe                             |
|           | System binaries           | `/usr/bin/hadoop`                                                     | No                                |
|           | JARs                      | `/usr/lib/hadoop/*`                                                   | No                                |
|           | runtime config files      | `/etc/hosts_exclude`                                                  | Yes, must be present on NameNodes |
| GridFTP   | Transfer log              | `/var/log/gridftp.log`                                                | No                                |
|           | Authentication log        | `/var/log/gridftp-auth.log`                                           | No                                |
|           | LCMAPS auth error log     | `/var/log/messages`                                                   | No                                |
|           | init.d script             | `/etc/init.d/globus-gridftp-server`                                   | No                                |
|           | runtime config files      | `/etc/gridftp-hdfs/*`, `/etc/sysconfig/gridftp-hdfs`                  | Maybe                             |
|           | System binaries           | `/usr/bin/gridftp-hdfs-standalone`, `/usr/sbin/globus-gridftp-server` | No                                |
|           | System libraries          | `/usr/lib64/libglobus_gridftp_server_hdfs.so*`                        | No                                |
|           | LCMAPS VOMS configuration | `/etc/lcmaps.db`                                                      | Yes                               |
|           | CA certificates           | `/etc/grid-security/certificates/*`                                   | No                                |

### Known Issues ###

#### Replicas ####

If files written via GridFTP should be stored with a different number of HDFS replicas, change the following line in `/usr/share/gridftp-hdfs/gridftp-hdfs-environment`:

``` file
export GRIDFTP_HDFS_REPLICAS=2
```

#### copyFromLocal java IOException ####

When trying to copy a local file into Hadoop, you may come across the following Java exception:
<details>
<summary>Show detailed java exception</summary>

``` console
11/06/24 11:10:50 WARN hdfs.DFSClient: Error Recovery for block null bad datanode[0]
nodes == null
11/06/24 11:10:50 WARN hdfs.DFSClient: Could not get block locations. Source file
"/osg/ddd" - Aborting...
copyFromLocal: java.io.IOException: File /osg/ddd could only be replicated to 0
nodes, instead of 1
11/06/24 11:10:50 ERROR hdfs.DFSClient: Exception closing file /osg/ddd :
org.apache.hadoop.ipc.RemoteException: java.io.IOException: File /osg/ddd could only
be replicated to 0 nodes, instead of 1
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:1415)
        at org.apache.hadoop.hdfs.server.namenode.NameNode.addBlock(NameNode.java:588)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
        at java.lang.reflect.Method.invoke(Method.java:597)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:528)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1319)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1315)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:396)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1063)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1313)
```
</details>
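
Before digging into configuration, it can help to confirm that any DataNodes are actually live and reporting capacity. A quick check, assuming the era-appropriate `hadoop` admin command shown elsewhere in this document:

``` console
root@host # hadoop dfsadmin -report
```

If the report shows zero available DataNodes (or zero remaining capacity), the exception above is expected.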
This can occur if you try to install a DataNode on a machine with less than 10GB of disk space available. This threshold can be changed by lowering the value of the following property in `/usr/lib/hadoop-0.20/conf/hdfs-site.xml`:

``` file
<property>
  <name>dfs.datanode.du.reserved</name>
  <value>10000000000</value>
</property>
```

Hadoop always requires this amount of disk space to be available for non-HDFS usage on the machine.

Getting Help
------------

To get assistance, please use [this page](../common/help.md).

References
----------

- [Using Hadoop as a Grid Storage Element](https://iopscience.iop.org/article/10.1088/1742-6596/180/1/012047), *Journal of Physics Conference Series, 2009*.
- [Hadoop Distributed File System for the Grid](http://osg-docdb.opensciencegrid.org/0009/000911/001/Hadoop.pdf), *IEEE Nuclear Science Symposium, 2009*.

### Users ###

This installation will create the following users unless they already exist.

| User        | Comment                                            |
|:------------|:---------------------------------------------------|
| `hadoop`    | Runs the NameNode services                         |
| `hdfs`      | Used by Hadoop to store data blocks and meta-data  |
| `mapred`    |                                                    |
| `zookeeper` |                                                    |

For this package to function correctly, you will also have to create the users needed for grid operation: any user that can be authenticated should have an account.
diff --git a/docs/data/load-balanced-gridftp.md b/docs/data/load-balanced-gridftp.md
deleted file mode 100644
index 3263f2e1e..000000000
--- a/docs/data/load-balanced-gridftp.md
+++ /dev/null
@@ -1,371 +0,0 @@
title: Load Balancing GridFTP

Load Balancing GridFTP
======================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

GridFTP is designed for high throughput data transfers and in many cases can handle all of the transfers for a site. However, in some cases it may be useful to run multiple GridFTP servers to distribute the load. For such sites, we recommend using a [load balancer](https://en.wikipedia.org/wiki/Load_balancing_(computing)) to distribute requests and present the appearance of a single high-throughput GridFTP server.

One general-purpose technology for implementing a load balancer on Linux is [Linux Virtual Server](http://www.linuxvirtualserver.org/whatis.html) (LVS). To use it with GridFTP, a single load balancer listens on a virtual IP address, monitors the health of the set of real GridFTP servers, and forwards requests to available ones. Optionally, there can be one or more inactive, backup load balancers that can activate and take over the virtual IP address in case the primary load balancer fails, resulting in a system that is more resilient to failure. LVS is implemented by the [IP Virtual Server](http://www.linuxvirtualserver.org/software/ipvs.html) kernel module, which can be managed by userspace services on the load balancers such as [keepalived](http://www.keepalived.org).

This guide explains how to install, configure, run, test, and troubleshoot the `keepalived` service on a load balancing host for a set of [GridFTP](gridftp.md) servers.
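
As a quick sanity check once everything is running (the configuration itself is covered below), you can list the kernel's IP Virtual Server table directly. This assumes the `ipvsadm` administration tool is installed, which this guide does not otherwise require:

``` console
root@host # ipvsadm --list --numeric
```

Each healthy GridFTP server should appear as a forwarding destination under the virtual IP address.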
Before Starting
---------------

Before starting the installation process, consider the following requirements:

- There must be a shared file system for file propagation across GridFTP servers
- You must have reserved a virtual IP address and associated virtual hostname

As with all OSG software installations, there are some one-time (per host) steps to prepare in advance:

- Ensure the host has [a supported operating system](../release/supported_platforms.md)
- Obtain root access to each host

Designing Your Load-Balanced GridFTP System
-------------------------------------------

Before beginning the installation process, you will need to plan the overall architecture of your load-balanced GridFTP system: the number of GridFTP servers, the type of shared file system to run on the GridFTP servers, whether or not backup load balancers are required, and hardware requirements.

### GridFTP servers

The number of GridFTP servers that you should run is determined first and foremost by the expected GridFTP transfer load at your site and the speed of the links available to each server. For example, if you expect a 20Gbps peak transfer load and have 10Gb links with 80–90% efficiency, you would need a minimum of 4 GridFTP servers: 3 to satisfy your desired throughput + 1 for failover or growth.

#### Shared file system

The number of GridFTP servers can also be determined by your hardware needs and by your choice of shared file system. If you choose a POSIX-based shared file system, plan for machines with more cores, or more GridFTP hosts to distribute the CPU load. If you are running [GridFTP with Hadoop](install-hadoop.md#gridftp-configuration), plan for machines with more memory, or more GridFTP hosts to distribute the memory load.

!!! note
    If you determine that you need only a single GridFTP host, you do not need load balancing. Instead, follow the [standalone-GridFTP installation guide](gridftp.md).

### Load balancer(s)

In the recommended direct routing mode, load balancers simply rewrite the initial packets of a given request, so the hardware requirements are minimal. When choosing load balancer hosts, aim for stability. If your chosen host is unstable, or if you do not want to introduce downtime for operating system or hardware updates, at least one additional load balancer will be needed as a backup.

Preparing the GridFTP Servers
-----------------------------

Before adding your GridFTP hosts to the load-balanced system, each host requires the following:

* GridFTP software
* Special host certificates
* Load-balancing configuration

### Acquiring host certificate(s)

When authenticating with a GridFTP server, clients verify that the server's host certificate matches the hostname of the server. In the case of a load-balanced GridFTP system, clients contact the GridFTP server through the virtual hostname, so the GridFTP server will have to present a certificate containing the virtual hostname as well as the GridFTP server's hostname. Use the [OSG host certificate reference](../security/host-certs/overview.md) for more information on how to request these types of certificates. Additionally, a special procedure is available to acquire [Let's Encrypt certificates](#with-lets-encrypt) for the load-balanced GridFTP system.

If your GridFTP servers are also running XRootD, you will need unique certificates for each GridFTP server. Otherwise, you can request a single certificate that can be shared among the GridFTP servers.
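
Whichever of the following options you choose, you can verify which names a certificate will present by inspecting its subject alternative names with `openssl`. This sketch assumes the shared-certificate path used in the next section; substitute the path to your own certificate:

``` console
user@host $ openssl x509 -in /etc/grid-security/gridftp/gridftp-hostcert.pem -noout -text | grep -A 1 'Subject Alternative Name'
```

Both the virtual hostname and the relevant GridFTP server hostnames should appear in the output.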
#### Without XRootD

The single shared certificate must have the hostname associated with the load-balanced GridFTP system as its [common name](https://en.wikipedia.org/wiki/X.509_attribute_certificate) and each GridFTP server's hostname listed as [subject alternative names](https://en.wikipedia.org/wiki/Subject_Alternative_Name).

1. Request and generate the shared certificate:

        :::console
        user@host $ osg-cert-request --hostname <VIRTUAL-HOSTNAME> \
                      --country <COUNTRY> \
                      --state <STATE> \
                      --locality <LOCALITY> \
                      --organization <ORGANIZATION> \
                      --altname <GRIDFTP-SERVER-1> \
                      --altname <GRIDFTP-SERVER-2>

2. Take the resulting CSR and get it signed by the appropriate authority.
   Most institutions can use InCommon as outlined [here](../security/host-certs/incommon.md).
3. Create a directory to contain the shared certificate:

        :::console
        root@host # mkdir /etc/grid-security/gridftp

4. Place the shared certificate-key pair in the newly created directory:

        :::console
        root@host # mv <PATH-TO-CERTIFICATE> /etc/grid-security/gridftp/gridftp-hostcert.pem
        root@host # mv <PATH-TO-KEY> /etc/grid-security/gridftp/gridftp-hostkey.pem

5. Edit `/etc/sysconfig/globus-gridftp-server` to identify the shared certificate-key pair:

        export X509_USER_CERT=/etc/grid-security/gridftp/gridftp-hostcert.pem
        export X509_USER_KEY=/etc/grid-security/gridftp/gridftp-hostkey.pem

#### With XRootD

XRootD requires that the certificate's [common name](https://en.wikipedia.org/wiki/X.509_attribute_certificate) refers specifically to the host it resides on. To ensure each GridFTP server can authenticate using the virtual hostname, add it as the [subject alternative name](https://en.wikipedia.org/wiki/Subject_Alternative_Name) for each certificate.

1. Create a list of GridFTP server hostnames in `load-balanced-hosts.txt`:

        <GRIDFTP-SERVER-1>
        <GRIDFTP-SERVER-2>
        [...]

2. Submit a batch request for the per-GridFTP server certificates:

        :::console
        user@host $ osg-gridadmin-cert-request -f load-balanced-hosts.txt

3. Copy the resulting certificates and keys to their corresponding GridFTP servers in `/etc/grid-security/hostcert.pem` and `/etc/grid-security/hostkey.pem`, respectively.

#### With Let's Encrypt

The certificate provided to the clients needs to have the virtual host address of the load balancer, as well as the hostname of each of the worker nodes. Additionally, Let's Encrypt contacts the requested domains to verify ownership, so each requested domain must be able to respond to HTTP requests at the same time. The easiest method for this is to use a shared directory for Let's Encrypt's `certbot` to install the secrets.

The procedure to acquire Let's Encrypt certificates for multiple hosts is as follows:

1. Create or use a shared directory that each of the data transfer nodes can read, for example a simple NFS share. The steps for creating an NFS shared directory are outside the scope of this guide; here, the shared directory will be referred to as `/mnt/nfsshare`.

2. Install `httpd` on each of the data transfer nodes:

        :::console
        root@host $ yum install httpd

    Create a webroot directory within the shared directory on one of the nodes:

        :::console
        root@host $ mkdir /mnt/nfsshare/webroot

3. Configure `httpd` to export the same webroot on each of the data transfer nodes:

        <VirtualHost *:80>
            DocumentRoot "/mnt/nfsshare/webroot"
            <Directory "/mnt/nfsshare/webroot">
                Require all granted
            </Directory>
        </VirtualHost>

4. Configure `keepalived` to virtualize port 80 to at least one of your data transfer nodes.
    Add to your configuration:

        virtual_server <VIRTUAL-IP-ADDRESS> 80 {
            delay_loop 10
            lb_algo wlc
            lb_kind DR
            protocol tcp

            real_server <DATA-NODE-IP-ADDRESS> 80 {
                TCP_CHECK {
                    connect_timeout 3
                    connect_port 80
                }
            }
        }

5. Run `certbot` with the webroot options on only one of the data nodes. The first domain on the command line should be the virtual hostname:

        root@host $ certbot certonly -w /mnt/nfsshare/webroot -d <VIRTUAL-HOSTNAME> -d <HOSTNAME-1> -d <HOSTNAME-2> ...

For XRootD certificates, the real hostname of the XRootD node is required to be the first hostname in the `certbot` command. You may run the `certbot` command several times on the same host, replacing the `VIRTUAL_HOSTNAME` with the real hostname of the XRootD servers and placing the `VIRTUAL_HOSTNAME` in the list of other domains in the certificate.

### Installing GridFTP

Whether you are starting from scratch or adding more GridFTP servers to your load-balanced GridFTP system, follow the documentation for [installing a standalone GridFTP server](gridftp.md) for each of your intended GridFTP servers (skipping the certificate request step). For hosts with GridFTP already installed, skip this section.

### Configuring your GridFTP servers

Each GridFTP server requires changes to its IP configuration and potentially its arptables:

- [Adding your virtual IP address](#adding-your-virtual-ip-address)
- [Disabling ARP](#disabling-arp), if your GridFTP servers are on the same network segment as the virtual IP

#### Adding your virtual IP address

Use the virtual IP address of your load balancer(s) as the secondary IP of each of your GridFTP servers.

1. Add the virtual IP using the `ip` tool:

        :::console
        root@host # ip addr add <VIRTUAL-IP-ADDRESS>/<PREFIX> dev <NETWORK-INTERFACE>

2. To persist the virtual IP changes across reboots, edit `/etc/rc.d/rc.local` and add the same command as used above.
3. Make sure that `/etc/rc.d/rc.local` is executable:

        :::console
        root@host # chmod u+x /etc/rc.d/rc.local

#### Disabling ARP

If your GridFTP servers and load balancer(s) are on the same network segment, you will have to disable ARP on the GridFTP servers to avoid [ARP race conditions](http://kb.linuxvirtualserver.org/wiki/ARP_Issues_in_LVS/DR_and_LVS/TUN_Clusters). Otherwise, skip to [the section on preparing keepalived](#preparing-keepalived-load-balancers).

1. Install the arptables software:

        :::console
        root@host # yum install arptables

2. Disable ARP:

        :::console
        root@host # arptables -F
        root@host # arptables -A IN -d <VIRTUAL-IP-ADDRESS> -j DROP
        root@host # arptables -A OUT -s <VIRTUAL-IP-ADDRESS> -j mangle --mangle-ip-s <REAL-IP-ADDRESS>

3. Save the ARP tables so that they survive reboots:

        :::console
        root@host # arptables-save > /etc/sysconfig/arptables

Preparing Keepalived Load Balancer(s)
-------------------------------------

### Installing Keepalived

Whether you run a single load balancer, or have one active load balancer and some inactive backups, each load balancer host must have the `keepalived` software installed, configured, and running.

!!! note
    Do not install `keepalived` on the GridFTP servers themselves.

The `keepalived` package is available from standard operating system repositories. Install it on each load balancer host using the following commands:

1. Clean the yum cache:

        :::console
        root@host # yum clean all --enablerepo=*

2. Update software:

        :::console
        root@host # yum update

    This command will update **all** packages.
3. Install the `keepalived` package:

        :::console
        root@host # yum install keepalived

### Required configuration

On the primary load balancer, edit `/etc/keepalived/keepalived.conf`:

``` file
global_defs {
    router_id <ROUTER-ID>
}

vrrp_instance VI_gridftp {
    state MASTER
    interface <NETWORK-INTERFACE>
    virtual_router_id <VIRTUAL-ROUTER-ID>
    priority 100
    virtual_ipaddress {
        <VIRTUAL-IP-ADDRESS>/<PREFIX> dev <NETWORK-INTERFACE>
    }
}

virtual_server <VIRTUAL-IP-ADDRESS> 2811 {
    delay_loop 10
    lb_algo wlc
    lb_kind DR
    protocol tcp

    real_server <GRIDFTP-SERVER-1-ADDRESS> {
        TCP_CHECK {
            connect_timeout 3
            connect_port 2811
        }
    }
    real_server <GRIDFTP-SERVER-2-ADDRESS> {
        TCP_CHECK {
            connect_timeout 3
            connect_port 2811
        }
    }
    [...]
}
```

!!! note
    Use the same `<VIRTUAL-IP-ADDRESS>` throughout the configuration of your load-balanced GridFTP system.

!!! note
    In the `virtual_server` section, write one `real_server` subsection for each GridFTP server behind the load balancer.

### Optional configuration

The following configuration steps are optional and will likely not be required for setting up a small cluster of GridFTP hosts. If you do not need any of the following special configurations, skip to [the section on using keepalived](#using-keepalived).

- [Adding backup load balancers](#adding-backup-load-balancers)
- [Enabling e-mail notifications](#enabling-e-mail-notifications)

#### Adding backup load balancers

If you need to add backup load balancers, copy `/etc/keepalived/keepalived.conf` from your primary load balancer and change the `state` and `priority` attributes under your `vrrp_instance VI_gridftp` section:

!!! note
    Priority specifies the order of preferred load balancer fallback, where a larger value corresponds to a higher preference.

``` file
vrrp_instance VI_gridftp {
    state BACKUP
    interface <NETWORK-INTERFACE>
    virtual_router_id <VIRTUAL-ROUTER-ID>
    priority <PRIORITY>
    virtual_ipaddress {
        <VIRTUAL-IP-ADDRESS>/<PREFIX> dev <NETWORK-INTERFACE>
    }
}
```

#### Enabling e-mail notifications

To receive e-mails when the state of your load-balanced system changes, update the `global_defs` section of `/etc/keepalived/keepalived.conf` for each of your load balancer nodes:

``` file
notification_email {
    <EMAIL-ADDRESS-1>
    <EMAIL-ADDRESS-2>
    [...]
}
notification_email_from <FROM-ADDRESS>
smtp_server <SMTP-SERVER>
smtp_connect_timeout 60
router_id <ROUTER-ID>
```

Using Your Load Balanced GridFTP System
---------------------------------------

### Using GridFTP

On the GridFTP servers, arptables is the only additional service required for running a load-balanced GridFTP system.
Manage the service with the following commands:

| **To ...**                                    | **Run the command ...**       |
|-----------------------------------------------|-------------------------------|
| Start the service                             | `systemctl start arptables`   |
| Stop the service                              | `systemctl stop arptables`    |
| Enable the service to start during boot       | `systemctl enable arptables`  |
| Disable the service from starting during boot | `systemctl disable arptables` |

For information on how to use your individual GridFTP servers, please refer to the [Managing GridFTP section](gridftp.md#managing-gridftp) of the GridFTP installation guide.

### Using Keepalived

On the load balancer nodes, `keepalived` is the only additional service required for running a load-balanced GridFTP system. As a reminder, here are common service commands (all run as `root`):
| To...                                        | Run the command...             |
|:---------------------------------------------|:-------------------------------|
| Start a service                              | `systemctl start keepalived`   |
| Stop a service                               | `systemctl stop keepalived`    |
| Enable a service to start during boot        | `systemctl enable keepalived`  |
| Disable a service from starting during boot  | `systemctl disable keepalived` |

Getting Help
------------

To get assistance with `keepalived` in front of OSG Software services, please use [this page](../common/help.md).

---------

- [Linux Virtual Server homepage](http://www.linuxvirtualserver.org/whatis.html)
- [Keepalived homepage](http://www.keepalived.org/index.html)
- [RHEL 7 Load Balancer Administration Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/load_balancer_administration/index)
- [T2 Nebraska LVS installation notes](https://github.com/gattebury/gridftp-with-lvs)

diff --git a/docs/monitoring/advanced-rsv-configuration.md b/docs/monitoring/advanced-rsv-configuration.md
deleted file mode 100644
index 7e965e566..000000000
--- a/docs/monitoring/advanced-rsv-configuration.md
+++ /dev/null
@@ -1,190 +0,0 @@
Advanced RSV Configuration
==========================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

About This Document
-------------------

Most site administrators will be able to configure RSV by editing `/etc/osg/config.d/30-rsv.ini` and running osg-configure as described in [the RSV installation document](install-rsv.md#configuring-rsv). This document provides instructions for configuration beyond what osg-configure is able to do.

Configuring metrics
-------------------

If you need to change the behavior of a metric, you can edit the metric configuration files. These replace the spec files from previous versions of RSV.

- `/etc/rsv/metrics` - changes made to a conf file in this directory named after a metric will affect the metric when run against all hosts
- `/etc/rsv/metrics/<HOSTNAME>` - changes made to conf files in this directory (named as the host FQDN) will affect the metric when run against the specific host

The configuration files are in INI format and have two sections:

- the first, named after the metric, with execution options
- the second, whose name includes the "args" keyword, with parameters sent to the probe at invocation

### Changing the times a metric runs

To change the time a metric runs, set the `cron-interval` setting in the metric's conf file. Use `man 5 crontab` for a description of the format. For example, to change `org.osg.general.ping-host` to run at a different time:

``` dosini
[org.osg.general.ping-host]
cron-interval = 45 * * * *

[org.osg.general.ping-host args]
#ping-count =
#ping-timeout =
```

!!! note
    Be sure to put the `cron-interval` setting in the `[org.osg.general.ping-host]` section, and not the `[org.osg.general.ping-host args]` section! The purpose of the "args" section is described in the "passing extra parameters to a metric" section below.

After modifying the cron time of a metric you must restart RSV for the change to take effect.
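
For example, using the SysV-style service commands that the RSV installation guide uses elsewhere (adjust for your init system if it differs):

``` console
root@host # service rsv restart
```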
To see what times each of the metrics runs, you can use `rsv-control` as follows:

``` console
root@host# rsv-control -l --cron-times

Metrics enabled for host: osg-edu.cs.wisc.edu:10443 | Cron times
----------------------------------------------------+--------------------
org.osg.srm.srmcp-readwrite                         | 28 * * * *
org.osg.srm.srmping                                 | 13,33,53 * * * *
...
```

### Passing extra parameters to a metric

Any `key=value` pairs in the "args" section of the metric's *conf* file will be turned into command line parameters to the probe. For example, for this file:

``` dosini
[org.osg.certificates.cacert-expiry args]
warning-hours = 6
error-hours = 12
```

This would lead to the probe getting called with the command-line parameters `--warning-hours 6 --error-hours 12`.

Configure consumers
-------------------

There is a configuration file common to all consumers: `/etc/rsv/consumers.conf`. It is a file in INI format and the possible entries are:

| Setting   | Values        | Details                                                                                                                                                                             |
|:----------|:--------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| enabled   | <consumers>   | Comma-separated list of consumers to be enabled                                                                                                                                     |
| timestamp | local         | If this is set to local, a record with a local timestamp will be supplied to the consumer. If this is set to any other value, or is not set, a record with the GMT will be created. |

Each consumer has a configuration file in `/etc/rsv/consumers` named after it. This allows you to specify command lines and environment for the consumers. Some consumers may also have their own configuration file, usually in `/etc/rsv/`. Below is an example for the Nagios consumer.

### Sending RSV records to Nagios

1. Edit your `/etc/rsv/rsv-nagios.conf` file and fill in the appropriate information. The path of the configuration file is specified in `/etc/rsv/consumers/nagios-consumer.conf`.
2. If your Nagios config file contains password information, you will want to lock down the permissions. Here is a suggested way to do this (replace `<GROUP>` with the group of your RSV user (`rsvuser` by default)):

        :::console
        root@host# chown root:<GROUP> /etc/rsv/rsv-nagios.conf
        root@host# chmod 0440 /etc/rsv/rsv-nagios.conf

3. In the configuration file at `/etc/rsv/consumers/nagios-consumer.conf`, check the following two settings:
    - Make sure that the path to your config file is correct. It may be referencing a directory `config` instead of `etc`
    - If you want to use `rsv2nsca`, add the string "--send-nsca" to the `args` line.
4. Enable and start the Nagios consumer by editing `consumers.conf` or by using `rsv-control` as follows:

        :::console
        root@host# rsv-control --enable nagios-consumer

    The Nagios consumer will be started the next time that you start RSV. If you are already running RSV you can turn on the Nagios consumer immediately by running:

        :::console
        root@host# rsv-control --on nagios-consumer

5. To verify that the Nagios consumer is running you can run `rsv-control -j`.
6. The log information for the Nagios consumer can be found in these files:
    - `/var/log/rsv/consumers/nagios-consumer.log`
    - `/var/log/rsv/consumers/nagios-consumer.out`
    - `/var/log/rsv/consumers/nagios-consumer.err`

General RSV configuration options
---------------------------------

You can configure the RSV framework using `/etc/rsv/rsv.conf`.
It is a file in INI format and the possible entries are:

| Setting                  | Values      | Details                                                                                                                 |
|:-------------------------|:------------|:--------------------------------------------------------------------------------------------------------------------------|
| user                     | <username>  | The UNIX username that owns RSV. This is mandatory                                                                      |
| service-cert             | <path>      | Absolute path to the service certificate file. If this is set, service-key and service-proxy must also be set.          |
| service-key              | <path>      | Absolute path to the service key file. This must be used with service-cert.                                             |
| service-proxy            | <path>      | Absolute path where the service proxy will be generated. This must be used with service-cert.                           |
| proxy-file               | <path>      | Alternative to service-cert. The absolute path where the user proxy file is located. This will not be auto-regenerated. |
| details-data-trim-length | <integer>   | The number of bytes to trim the detailsData section to. If set to 0, no trimming will occur.                            |
| job-timeout              | <integer>   | Time in seconds before a metric is killed. A metric that times out will return a CRITICAL status.                       |
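
For reference, a minimal `/etc/rsv/rsv.conf` might look like the sketch below. The `[rsv]` section header and the specific values are illustrative assumptions; the certificate paths match the defaults named in the RSV installation document, and the proxy path and timeout match the example output shown later in these documents:

``` dosini
[rsv]
user = rsv
service-cert = /etc/grid-security/rsv/rsvcert.pem
service-key = /etc/grid-security/rsv/rsvkey.pem
service-proxy = /tmp/rsvproxy
job-timeout = 1200
```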
Troubleshooting
---------------

### Important file locations

Configuration files:

| File Description                    | Location                                    | Comment                                                       |
|:------------------------------------|:----------------------------------------------|:----------------------------------------------------------------|
| RSV configuration directory         | `/etc/rsv`                                  |                                                               |
| RSV configuration                   | `/etc/rsv/rsv.conf`                         | RSV framework configuration                                   |
| Consumers configuration in RSV      | `/etc/rsv/consumers.conf`                   | Select the consumers and change generic options               |
| Consumers configuration             | `/etc/rsv/consumers/`                       | To change arguments and environment                           |
| Generic metrics configuration       | `/etc/rsv/metrics/<METRIC>.conf`            | To change arguments and environment                           |
| Host specific metrics configuration | `/etc/rsv/metrics/<HOSTNAME>/<METRIC>.conf` | To change arguments and environment when running on HOSTNAME |

Other files:

| File Description      | Location                       | Comment                 |
|:----------------------|:-------------------------------|:------------------------|
| Metric log files      | `/var/log/rsv/metrics`         |                         |
| Consumer log files    | `/var/log/rsv/consumers`       |                         |
| Initial configuration | `/etc/osg/config.d/30-rsv.ini` | Read by `osg-configure` |
| Web files output      | `/usr/share/rsv/www/`          |                         |

To find the metrics and the other files in RSV, you can also use the RPM commands `rpm -ql rsv-metrics` and `rpm -ql rsv`.

diff --git a/docs/monitoring/install-rsv-gwms-tester.md b/docs/monitoring/install-rsv-gwms-tester.md
deleted file mode 100644
index 2bd8f6047..000000000
--- a/docs/monitoring/install-rsv-gwms-tester.md
+++ /dev/null
@@ -1,220 +0,0 @@
Installing and Using the RSV GlideinWMS Tester
==============================================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

About This Guide
----------------

The RSV GlideinWMS Tester (or *Tester*, in this document) is a tool that a VO front-end administrator can use to test remote sites for the ability to run the VO’s jobs. It is particularly useful when setting up a VO for the first time or when changing the sites at which a VO’s jobs can run. For a site to pass the test, it must successfully run a simple test job via the normal GlideinWMS mechanisms, in much the same way as a real VO job.

Use this page to learn how to install, configure, and use the Tester for your VO front-end.
Before Starting
---------------

Before starting the installation process, consider the following points (consulting [the Reference section below](#reference) as needed):

- **Software:** You must have [a GlideinWMS Front-end](../other/install-gwms-frontend.md) installed
- **Configuration:** The GlideinWMS Front-end must be configured (a) [to have at least one group that matches pilots to sites using DESIRED\_SITES](../other/install-gwms-frontend.md#allowing-users-to-specify-where-their-jobs-run), and (b) [to support the is_itb user job attribute](../other/install-gwms-frontend.md#creating-a-group-for-testing-configuration-changes)
- **Host choice:** The Tester should be installed on its own host; a small Virtual Machine (VM) is ideal
- **Service certificate:** The Tester requires a host certificate at `/etc/grid-security/hostcert.pem` and an accompanying key at `/etc/grid-security/hostkey.pem`
- **Network ports:** Test jobs must be able to contact the tester using the HTCondor Shared Port on port 9615 (TCP), and you must be able to contact a web server on port 80 (TCP) to view test results.

Installing the Tester
---------------------

The Tester software takes advantage of several other OSG software components, so the installation will also include OSG’s site validation system (RSV), HTCondor, and the GlideinWMS pilot submission software.

```console
root@host # yum install rsv-gwms-tester
```

Configuring the Tester
----------------------

Before you use the Tester, there are some one-time configuration steps to complete, one set on your GlideinWMS Front-end Central Manager host and one set on the Tester host.

### Configuring the GlideinWMS Front-end Central Manager

Complete these steps **on your GlideinWMS Front-end Central Manager host**:

1. Authorize the Tester host to connect to your Central Manager:

        :::console hl_lines="1"
        root@host # glidecondor_addDN -allow-others -daemon '<COMMENT>' '<TESTER_DN>' condor

    Where `COMMENT` is a human-readable label for the Tester host (e.g., “RSV GWMS Tester at myhost”), and `TESTER_DN` is the Distinguished Name (DN) of the host certificate of your Tester host. Most likely, you will need to quote both of these values to protect them from the shell. For example:

        :::console
        root@host # glidecondor_addDN -allow-others -daemon 'RSV GWMS Tester on Fermicloud' '/DC=com/DC=DigiCert-Grid/O=Open Science Grid/OU=Services/CN=fermicloud357.fnal.gov' condor

2. Restart HTCondor to apply the changes

    On **EL 6** systems:

        :::console
        root@host # service condor restart

    On **EL 7** systems:

        :::console
        root@host # systemctl restart condor

3. Add the new Tester to your GlideinWMS front-end configuration.
    Edit the file `/etc/gwms-frontend/frontend.xml` and add a line as follows within the `` element

        :::file hl_lines="1"
        

    Where `TESTER_DN` is the Distinguished Name (DN) of the host certificate of your Tester host (as above), and `TESTER_HOSTNAME` is the fully qualified hostname of the Tester host. For example:

        :::file
        

    Reconfigure your GlideinWMS front-end to apply the changes:

        :::console
        root@host # service gwms-frontend reconfig

### Configuring the Tester host

Complete the following steps **on your Tester host**:

1. Configure the Tester for the VOs that your Front-end supports.

    Edit the file `/etc/rsv/metrics/org.osg.local-gfactory-site-querying-local.conf`.
    The `constraint` line is an HTCondor ClassAd expression containing one `stringListMember` function per VO that your Front-end supports. If there is more than one VO, the function invocations are joined by the “logical or” operator, `||`. Edit the `constraint` line for your Front-end.

    For example, for a single VO named `Foo`, the line would be:

        :::file
        constraint = stringListMember("Foo", GLIDEIN_Supported_VOs)

    For two VOs named `Foo` and `Bar`, the line would be:

        :::file
        constraint = stringListMember("Foo", GLIDEIN_Supported_VOs) || stringListMember("Bar", GLIDEIN_Supported_VOs)

    Do not change the other settings in this file, unless you have clear and specific reasons to do so.

2. Authorize the central manager of your Front-end to connect to the tester host:

        :::console hl_lines="1"
        root@host # glidecondor_addDN -allow-others -daemon '<COMMENT>' '<CENTRAL_MGR>' condor

    Where `COMMENT` is a human-readable identifier for the Central Manager, and `CENTRAL_MGR` is the Distinguished Name (DN) of the host certificate of your GlideinWMS Front-end’s Central Manager host. Most likely, you will need to quote both of these values to protect them from the shell. For example:

        :::console
        root@host # glidecondor_addDN -allow-others -daemon 'UCSD central manager DN' '/DC=org/DC=opensciencegrid/O=Open Science Grid/OU=Services/CN=osg-ligo-1.t2.ucsd.edu' condor

3. Configure the special HTCondor-RSV instance with your host IP address.

    Create the file `/etc/condor/config.d/98_public_interface.config` with this content:

        :::file hl_lines="1 2"
        NETWORK_INTERFACE = <ADDRESS>
        CONDOR_HOST = <CENTRAL_MGR>

    Where `ADDRESS` is the IP address of your Tester host, and `CENTRAL_MGR` is the hostname of your GlideinWMS Front-end Central Manager.

4. Enable the Tester’s RSV probe:

        :::console
        root@host # rsv-control --enable org.osg.local-gfactory-site-querying-local --host localhost

Using the Tester
----------------

There are at least two aspects of using the Tester:

- Managing the services that are associated with the Tester software
- Viewing results from the Tester

### Managing Tester services

Because the Tester is built on other OSG software, there are a number of services in your installation. The specific services are:

| Software           | Service name  | Notes                      |
|:-------------------|:--------------|:---------------------------|
| Apache HTTP Server | `httpd`       | Web server for results     |
| HTCondor-Cron      | `condor-cron` | cron-like jobs in HTCondor |
| RSV                | `rsv`         | OSG site validator         |

### Viewing Tester results

Once the Tester RSV probe is enabled and active, and the services listed above have been started, there are two kinds of RSV probes that run periodically:

- One probe asks the GlideinWMS factory for the up-to-date list of sites supported by your VO(s) — runs every 30 minutes
- One probe submits and monitors one test job to each site supported by your VO(s) — runs every 60 minutes

You can view the latest results of both probe types on an RSV results web page, or you can manually run the first probe to see the full list of sites.

#### Viewing RSV results online

To see the latest results, access `https://<HOSTNAME>` (where `HOSTNAME` is the name of your Tester host).

- There should be one result row per site supported by your VO(s), using the “org.osg.general.dummy-vanilla-probe” probe (aka *metric*)
- There should be exactly one result row for the probe that fetches the list of sites, which is the “org.osg.local-gfactory-site-querying-local” probe (aka *metric*)
- There is a legend for the background colors at the end of the page

Ideally, each site supported by your VO(s) should be shown with a green background, which indicates that a Tester job ran at that site recently and successfully. There may be transient failures, but if you notice a site in the failed state over multiple days, contact OSG Factory Operations about the failing site, including a link to your Tester RSV results page.

To see detailed information from each probe, click on the probe name in the Metric column.

To see the list of sites that are supported by your VO(s) and are being tested, click the “org.osg.local-gfactory-site-querying-local” link at the bottom of the list of probes. You can also run the probe manually, as described next.

### Listing supported sites manually

To manually run the probe that fetches the list of sites supported by your VO(s), run the following command on your Tester host:

```console
root@host # rsv-control --run org.osg.local-gfactory-site-querying-local --host localhost
```

The probe produces many lines of output, some of which are just about the probe execution itself. But look for lines like this:

```console
MSG: Updating configuration for host <SITE>
```

Where `<SITE>` is the name of the site, and there should be one such line per site supported by your VO(s).
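
If you only want the site list, you can filter the probe’s output for those lines. This sketch assumes standard `grep` and the message format shown above:

```console
root@host # rsv-control --run org.osg.local-gfactory-site-querying-local --host localhost | grep '^MSG: Updating configuration for host'
```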
Troubleshooting RSV-GWMS-Tester
-------------------------------

You can find more information on troubleshooting in the [RSV troubleshooting section](../monitoring/install-rsv.md#troubleshooting-rsv)

Logs and configuration:

| File Description      | Location                                                           | Comment                             |
|:----------------------|:--------------------------------------------------------------------|:--------------------------------------|
| Condor Cron log files | `/var/log/condor-cron`                                             |                                     |
| Metric configuration  | `/etc/rsv/metrics/org.osg.local-gfactory-site-querying-local.conf` | To change arguments and environment |

Getting Help
------------

To get assistance, please use [this page](../common/help.md).

Reference
---------

### Certificates

| Certificate      | User that owns certificate | Path to certificate               |
|:-----------------|:---------------------------|:----------------------------------|
| Host certificate | `root`                     | `/etc/grid-security/hostcert.pem` |
| Host key         | `root`                     | `/etc/grid-security/hostkey.pem`  |

Find instructions to request a host certificate [here](../security/host-certs.md).
diff --git a/docs/monitoring/install-rsv.md b/docs/monitoring/install-rsv.md
deleted file mode 100644
index 8297d7cd9..000000000
--- a/docs/monitoring/install-rsv.md
+++ /dev/null
@@ -1,358 +0,0 @@
Installing, Configuring, Using, and Troubleshooting RSV
=======================================================

!!! warning
    This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022).
    See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details.

About This Guide
----------------

The Resource and Service Validation (RSV) software helps a site administrator verify that certain site resources and services are working as expected. OSG recommends that sites install and run RSV, but it is optional; further, each site selects which specific tests (called *probes*) to run.

Use this page to learn more about RSV in general, and how to install, configure, run, test, and troubleshoot RSV from the OSG software repositories. For documentation on specific probes or on how to write your own probes, please check the [Reference section](#reference).

Introduction to RSV
-------------------

The Resource and Service Validation (RSV) software provides OSG site administrators a scalable and easy-to-maintain
resource and service monitoring infrastructure.
It provides a set of tests (which can run on the same or other hosts within a site), HTCondor-Cron for scheduling,
and tools for collecting and storing the results (using Gratia).
The client package is not installed by default and may be installed on a CE or other host.
Generally, you configure the RSV client to run tests at scheduled intervals; it then makes the results available on
a local website and can also upload them to a central collector.
Before Starting
---------------

Before starting the installation process, consider the following points (consulting [the Reference section below](#reference) as needed):

- **User IDs:** If they do not exist already, the installation will create the Linux user IDs `rsv` and `cndrcron`
- **Service certificate:** The RSV service requires a service certificate (`/etc/grid-security/rsv/rsvcert.pem`) and matching key (`/etc/grid-security/rsv/rsvkey.pem`)
- **Network ports:** To view results, port 80 must accept incoming requests; outbound connectivity to tested services must work, too
- **Host choice:** Install RSV on your site CE unless you have specific reasons (e.g., performance) for installing on a separate host

As with all OSG software installations, there are some one-time (per host) steps to prepare in advance:

- Ensure the RSV host has [a supported operating system](../release/supported_platforms.md)
- Obtain root access to the host
- Prepare [the required Yum repositories](../common/yum.md)
- Install [CA certificates](../common/ca.md)

Installing RSV
--------------

An installation of RSV at a site consists of the RSV client software, the Apache web server, parts of HTCondor (for its cron-like scheduling capabilities), and various other small tools. To simplify installation, OSG provides a convenience RPM that installs all required software with a single command.

1. Consider updating your local cache of Yum repository data and your existing RPM packages:

        :::console
        root@host # yum clean all --enablerepo=\*
        root@host # yum update

    !!! note
        The `update` command will update **all** packages on your system.

2. If you have installed HTCondor already but not by RPM, install a special empty RPM to make RSV happy:

        :::console
        root@host # yum install empty-condor --enablerepo=osg-empty

3. Install RSV and related software:

        :::console
        root@host # yum install rsv

Configuring RSV
---------------

After installation, there are some one-time configuration steps to tell RSV how to operate at your site.

1. Edit `/etc/osg/config.d/30-rsv.ini` and follow the instructions in the file. There are detailed comments for each setting. In the simplest case — to monitor only your CE — set the `htcondor_ce_hosts` variable to the fully qualified hostname of your CE.

2. If you have installed HTCondor already but not by RPM, specify the location of the Condor installation in `30-rsv.ini` in the `condor_location` setting. If an HTCondor RPM is installed, you do not need to set `condor_location`.

3. Complete the configuration using the `osg-configure` tool:

        :::console
        root@host # osg-configure -v
        root@host # osg-configure -c

### Optional configuration

The following configuration steps are optional and will likely not be required for setting up a small or typical site. If you do not need any of the following special configurations, skip to [the section on using RSV](#using-rsv).

Generally speaking, read the [ConfigureRsv](advanced-rsv-configuration.md) page for more advanced configuration options.

#### Configuring RSV to run probes using a remote server

RSV monitors systems by running probes, which can run on the RSV host itself (the default case), via a separate batch system like HTCondor, or via a remote batch system using a Globus gatekeeper and its job manager. The last two options can both count those jobs and report them to, for example, Gratia.
In this case, remember to:

- Add the RSV user `rsv` on all the systems where the probes may run, and
- Map the RSV service certificate to the user you intend to use for RSV. This should be a local user used exclusively for RSV and not belonging to an institutional VO, so that the RSV probes are not accounted as regular VO jobs in Gratia.
This can be done in the configuration of the [LCMAPS VOMS plugin](../security/lcmaps-voms-authentication.md) on your CE.

#### Configuring the RSV web server to use HTTPS instead of HTTP

If you would like your local RSV web server to use HTTPS instead of the default HTTP (for compatibility or security reasons), complete the steps below. This procedure assumes that you already have an HTTP service certificate (or a copy of the host certificate) in `/etc/grid-security/http/`. If not, omit the `SSLCertificate*` modifications below, and your web server will start with its own, self-signed certificate.

1. Install `mod_ssl`:

        :::console
        root@host # yum install mod_ssl

2. Make an alternate set of HTTP service certificate files:

        :::console
        root@host # cp -p /etc/grid-security/http/httpcert.pem /etc/grid-security/http/httpcert2.pem
        root@host # cp -p /etc/grid-security/http/httpkey.pem /etc/grid-security/http/httpkey2.pem
        root@host # chown apache:apache /etc/grid-security/http/http*2.pem

3. Back up existing Apache configuration files:

        :::console
        root@host # cp -p /etc/httpd/conf/httpd.conf /etc/httpd/conf/httpd.conf.orig
        root@host # cp -p /etc/httpd/conf.d/ssl.conf /etc/httpd/conf.d/ssl.conf.orig

4. Change the default port for HTTP connections to 8000 by editing `/etc/httpd/conf/httpd.conf`

        :::file
        Listen 8000

5. Set up HTTPS access by editing `/etc/httpd/conf.d/ssl.conf`:

        :::file
        Listen 8443

        SSLCertificateFile /etc/grid-security/http/httpcert2.pem
        SSLCertificateKeyFile /etc/grid-security/http/httpkey2.pem

    After these changes, when you start the Apache service, it will listen on ports `8000` (for HTTP) and `8443` (for HTTPS), rather than the default port `80` (for HTTP only).

    !!! warning
        If you make the changes above, you must restart the Apache server after each CA certificate update to pick up the changes.

Using RSV
---------

### Managing RSV and associated services

In addition to the RSV service itself, there are a number of supporting services in your installation. The specific services are:

| Software      | Service name                          | Notes                                                                         |
|:--------------|:--------------------------------------|:--------------------------------------------------------------------------------|
| Fetch CRL     | `fetch-crl-boot` and `fetch-crl-cron` | See [CA documentation](../common/ca.md#managing-certificate-revocation-lists) |
| Apache        | httpd                                 |                                                                               |
| HTCondor-Cron | condor-cron                           |                                                                               |
| RSV           | rsv                                   |                                                                               |

Start the services in the order listed and stop them in reverse order. As a reminder, here are common service commands (all run as `root`):

| To …                                         | Run the command …                   |
|:----------------------------------------------|:--------------------------------------|
| Start a service                              | `service <SERVICE-NAME> start`      |
| Stop a service                               | `service <SERVICE-NAME> stop`       |
| Enable a service to start during boot        | `chkconfig <SERVICE-NAME> on`       |
| Disable a service from starting during boot  | `chkconfig <SERVICE-NAME> off`      |

Where `<SERVICE-NAME>` is the name of the service in the table above.

### Running RSV manually

Normally, the HTCondor-Cron scheduler runs RSV periodically.
However, you can run RSV probes manually at any time:

``` console
root@host # rsv-control --run --all-enabled
```

If successful, results will be available from your local RSV web server (e.g., `http://localhost/rsv`) and, if enabled (which is the default), on [Topology](https://topology.opensciencegrid.org/).

You can also run the metrics individually or pass special parameters, as explained in the [rsv-control document](rsv-control.md).

Troubleshooting RSV
-------------------

To get assistance, use the [help procedure](../common/help.md).

RSV has a tool to collect information useful for troubleshooting into a tarball that can be shared with the developers and support staff.
To use it:

``` console
root@host# rsv-control --profile
Running the rsv-profiler...
OSG-RSV Profiler
Analyzing...
Making tarball (rsv-profiler.tar.gz)
```

You can find more information on troubleshooting RSV in the [rsv-control documentation](rsv-control.md).

!!! note
    If you are getting assistance via the trouble ticket system, you must add a `.txt` extension to the tarball so it can be uploaded.

### Failed to send via Gratia

If you see `Failed to send record Failed to send via Gratia: Server unable to receive data:` in `/var/log/rsv/consumers/gratia-consumer.output`, you should proceed to disable the gratia consumer using the following commands:

```console
root@host# rsv-control --disable --host <HOSTNAME> gratia-consumer
root@host# rsv-control --off --host <HOSTNAME> gratia-consumer
```

Replace `<HOSTNAME>` with the hostname of the server where you are installing RSV.

### Important file locations

Logs and configuration:

| File Description      | Location                                    | Comment                                                                                        |
|:----------------------|:----------------------------------------------|:---------------------------------------------------------------------------------------------------|
| Metric log files      | `/var/log/rsv/metrics`                      |                                                                                                |
| Consumer log files    | `/var/log/rsv/consumers`                    |                                                                                                |
| HTML files            | `/usr/share/rsv/www/`                       | Available at `http://your.host.example.com/rsv`                                               |
| Initial configuration | `/etc/osg/config.d/30-rsv.ini`              | Read by `osg-configure`                                                                       |
| RSV configuration     | `/etc/rsv`                                  | Generally files in this directory should not be edited directly. Use `osg-configure` instead. |
| Metric configuration  | `/etc/rsv/metrics/HOSTNAME/METRICNAME.conf` | To change arguments and environment                                                           |

To find the metrics and the other files in RSV, you can also use the RPM commands `rpm -ql rsv-metrics` and `rpm -ql rsv`.

### Getting more information from rsv-control

The first step to getting more information is to run rsv-control with more verbosity. Use the `--verbose` (`-v`) flag. This flag can be used with any of rsv-control's abilities (run, enable, list, etc). The verbosity levels are:

- 0 = print nothing
- 1 = print warnings and errors along with the usual output of the command being run (1 is the default level)
- 2 = adds informational messages
- 3 = full debugging output

For example, here is the output when running a metric with `-v 2`.
```console
[root@fermicloud016 condor]# rsv-control -r org.osg.general.osg-version -v 2 -u osg-edu.cs.wisc.edu
INFO: Reading configuration file /etc/rsv/rsv.conf
INFO: Reading configuration file /etc/rsv/consumers.conf
INFO: Validating configuration:
INFO: Validating user:
INFO: Invoked as root. Switching to 'rsv' user (uid: 100 - gid: 102)
INFO: Registered consumers: html-consumer, gratia-consumer
INFO: Loading config file '/etc/rsv/meta/metrics/org.osg.general.osg-version.meta'
INFO: Loading config file '/etc/rsv/metrics/org.osg.general.osg-version.conf'
INFO: Optional config file '/etc/rsv/metrics/osg-edu.cs.wisc.edu/org.osg.general.osg-version.conf' does not exist
INFO: Checking proxy:
INFO: Using service certificate proxy
INFO: Running command with timeout (1200 seconds):
    /usr/bin/openssl x509 -in /tmp/rsvproxy -noout -enddate -checkend 21600
INFO: Exit code of job: 0
INFO: Service certificate valid for at least 6 hours.
INFO: Pinging host osg-edu.cs.wisc.edu:
INFO: Running command with timeout (1200 seconds):
    /bin/ping -W 3 -c 1 osg-edu.cs.wisc.edu
INFO: Exit code of job: 0
INFO: Ping successful

Running metric org.osg.general.osg-version:

INFO: Executing job remotely using Condor-G
INFO: Setting up job environment:
INFO: No environment setup declared
INFO: Condor-G working directory: /var/tmp/rsv/condor_g-JiQthF
INFO: Forming arguments:
INFO: Arguments: ''
INFO: List of files to transfer: /usr/libexec/rsv/probes/RSVMetric.pm
INFO: Condor submission: Submitting job(s).
1 job(s) submitted to cluster 2.
INFO: Trimming data to 10000 bytes because details-data-trim-length is set
INFO: Creating record for html-consumer consumer at '/var/spool/rsv/html-consumer/org.osg.general.osg-version.7rgLfn'
INFO: Creating record for gratia-consumer consumer at '/var/spool/rsv/gratia-consumer/org.osg.general.osg-version.-qelnL'
INFO: Result:

metricName: org.osg.general.osg-version
metricType: status
timestamp: 2012-01-25 16:12:40 CST
metricStatus: OK
serviceType: OSG-CE
serviceURI: osg-edu.cs.wisc.edu
gatheredAt: fermicloud016.fnal.gov
summaryData: OK
detailsData: OSG 1.2.26

EOT
```
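
If you need to capture one of these verbose runs for a support ticket, one simple approach (a sketch only; the metric name and hostname below are placeholders to replace with your own) is to redirect a level-3 run to a file:

```console
root@host # rsv-control -r org.osg.general.ping-host -v 3 -u ce.example.edu > /tmp/rsv-verbose.log 2>&1
```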
- -Getting Help ------------- - -To get assistance, please use [this page](../common/help.md) and attach the output of `rsv-control --profile`: - -```console -root@host # rsv-control --profile -Running the rsv-profiler... -OSG-RSV Profiler -Analyzing... -Making tarball (rsv-profiler.tar.gz) -``` - -Reference ---------- - -### Users - -The RSV installation will create two users unless they are already created. The users are created when the `rsv` and `condor-cron` packages are installed. - -| User | Comment | -|:-----------|:-----------------------------------------------------------------------------------| -| `rsv` | Runs the RSV tests; the RSV certificate (below) will need to be owned by this user | -| `cndrcron` | Runs the Condor Cron processes to schedule the running of the tests | - - -!!! note - if you pre-create the RSV user, it should have a working shell. That is, it shouldn't have a default shell of `/sbin/nologin`. - -!!! warning - - If you manage your `/etc/passwd` file with configuration management software such as Puppet, CFEngine or 411, make sure the UID and GID in `/etc/condor-cron/config.d/condor_ids` matches the UID and GID of the `cndrcron` user and group in `/etc/passwd`. If it does not, create a file named `/etc/condor-cron/config.d/condor_ids_override` with the contents: - - -```file -CONDOR_IDS=UID.GID -``` - -where `UID` and `GID` are the UID and GID of the `cndrcron` user and group. - -### Certificates - -| Certificate | User that owns certificate |Path to certificate | -|:------------------------|:----------------------------|:----------------------------------- | -| RSV service certificate | `rsv` |`/etc/grid-security/rsv/rsvcert.pem `| -| | |`/etc/grid-security/rsv/rsvkey.pem `| - -Ensure an RSV service certificate is installed in `/etc/grid-security/rsv/` and the certificate files are owned by the `rsv` user. Adjust the permissions if necessary (cert needs to be readable by all, key needs to be readable by nobody but owner). - -You may need another certificate owned by `apache` if you'd like an authenticated web server; see [Configuring the RSV web server to use HTTPS instead of HTTP](#configuring-the-rsv-web-server-to-use-https-instead-of-http) above. - -See [instructions](../security/host-certs.md) to request a service certificate. - -### Networking - -| Service Name | Protocol |Port Number | Inbound | Outbound | Comment | -|:--------------|:------------|:-----------|:--------|:---------|:--------| -| HTTP | tcp | 80 | YES | | RSV runs an HTTP server (Apache) that publishes a page with the RSV testing results | -| HTTP | tcp | 80 | | YES | RSV pushes testing results to the OSG Gratia Collectors at opensciencegrid.org | -| various | various | various | | YES | Allow outbound network connection to all services that you want to test | - - -Or, if you'd rather have your RSV web page appear as `https://...:8443/rsv/` like it used to in OSG 1.2, the first column above would be **HTTPS** / **tcp** / **8443**. See [above](#configuring-the-rsv-web-server-to-use-https-instead-of-http) for how to configure this. diff --git a/docs/monitoring/rsv-control.md b/docs/monitoring/rsv-control.md deleted file mode 100644 index 84fa08abb..000000000 --- a/docs/monitoring/rsv-control.md +++ /dev/null @@ -1,319 +0,0 @@ -Using rsv-control -================= - -!!! warning - This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022). 
- See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details. - -Overview --------- - -This document is for System Administrators. It details the usage of the `rsv-control` command for enabling, disabling, testing and running RSV probes. - -`rsv-control` provides an interface to many RSV tasks. `rsv-control` can view RSV jobs, run metrics, enable or disable metrics and consumers, and allow advanced configuration. - -!!! warning - `rsv-control` can be used to configure RSV as described here and in [the advanced configuration document](advanced-rsv-configuration.md). Most site admins will be able to configure RSV by editing `/etc/osg/config.d/30-rsv.ini` and running `osg-configure` as described in the [installation doc](install-rsv.md#configuring-rsv). - -Using `rsv-control` to configure is for advanced RSV use including enabling non-default metrics. Admins who don't use `rsv-control` for configuration can still use it to view their RSV jobs, run RSV tests, and help debug RSV problems. Anyone can view the jobs, but you must be root or the RSV user (`rsv` by default) to execute other commands, e.g. run, enable and disable probes, or to turn RSV on and off. - -Viewing RSV jobs ----------------- - -rsv-control provides two different views: viewing the *desired* state and viewing the current *actual* state. - -- Desired = what metrics and consumers will start the next time RSV is started -- Actual = what metrics and consumers are currently running - -### Desired state -To view the desired state, use the `--list` (`-l` for short) flag. This will create one table for each host showing the metrics that are enabled to run against that host. - -``` console -root@host# rsv-control --list - -Metrics enabled for host: osgitb1.nhn.ou.edu | Service -----------------------------------------------------------+-------------------- -org.osg.batch.jobmanager-default-status | OSG-CE -org.osg.batch.jobmanagers-available | OSG-CE -org.osg.certificates.cacert-expiry | OSG-CE -org.osg.certificates.crl-expiry | OSG-CE -org.osg.general.osg-directories-CE-permissions | OSG-CE -org.osg.general.osg-version | OSG-CE -org.osg.general.ping-host | OSG-CE -org.osg.general.vdt-version | OSG-CE -org.osg.general.vo-supported | OSG-CE -org.osg.globus.gram-authentication | OSG-CE -org.osg.globus.gridftp-simple | OSG-GridFTP -org.osg.gratia.condor | OSG-CE -org.osg.gratia.metric | OSG-CE - - -Metrics enabled for host: osg-edu.cs.wisc.edu:10443 | Service -----------------------------------------------------------+-------------------- -org.osg.srm.srmcp-readwrite | OSG-SRM -org.osg.srm.srmping | OSG-SRM -``` - - -Other options: - -- To view all installed metrics use the `--all` (`-a`) flag along with `--list`. This will print an extra table showing metrics that are disabled on all hosts. -- If you are having problems with the output being truncated, try the `--wide` (`-w`) flag. - -### Actual state -To view the current, running state of RSV jobs, use the `--job-list` flag (`-j` for short). This will show all metrics and consumers running in RSV. (It queries the underlying Condor Cron system that we use to run the metrics). 

``` console
root@host# rsv-control --job-list

Hostname: osg-edu.cs.wisc.edu
   ID    OWNER    ST  NEXT RUN TIME  METRIC
  154.0  rsvuser  I   11-19 12:15    org.osg.certificates.cacert-expiry
  155.0  rsvuser  R   11-19 11:23    org.osg.gratia.metric
  156.0  rsvuser  I   11-19 18:47    org.osg.general.vdt-version
  157.0  rsvuser  I   11-19 12:30    org.osg.certificates.crl-expiry
  158.0  rsvuser  I   11-19 11:31    org.osg.globus.gram-authentication
  159.0  rsvuser  I   11-19 11:41    org.osg.general.osg-version
  160.0  rsvuser  R   11-19 11:25    org.osg.batch.jobmanager-default-status
  161.0  rsvuser  I   11-20 04:59    org.osg.batch.jobmanagers-available
  162.0  rsvuser  I   11-19 11:37    org.osg.general.osg-directories-CE-permissions
  163.0  rsvuser  I   11-19 12:08    org.osg.globus.gridftp-simple
  164.0  rsvuser  I   11-19 12:09    org.osg.gratia.condor
  165.0  rsvuser  R   11-19 11:27    org.osg.general.ping-host
  166.0  rsvuser  I   11-19 18:47    org.osg.general.vo-supported

Hostname: osg-edu.cs.wisc.edu:10443
   ID    OWNER    ST  NEXT RUN TIME  METRIC
  113.0  rsvuser  I   11-19 11:33    org.osg.srm.srmping
  114.0  rsvuser  R   11-19 11:28    org.osg.srm.srmcp-readwrite

   ID    OWNER    ST  CONSUMER
  198.0  rsvuser  R   html-consumer
  199.0  rsvuser  R   gratia-consumer
```

The ST field indicates the current job status:

- R = the metric is currently running
- I = the metric is idle and will be run at the next scheduled interval
- Any other letter may indicate a problem
- Consumers will always appear to be running even though they will only run once every five minutes.

Running a metric
----------------

`rsv-control` can be used to run metrics one time against a host. This can be useful for:

- updating the status of a metric that had a problem instead of waiting until the next scheduled run time
- testing a metric against a host before deciding whether to enable it

Note that **the record for each run will be published to all active consumers**. That is, it will be published to Gratia or will show up on your local web page, if you have those enabled.

### Simplest test

Use the `--run` (`-r`) flag. You must also provide the `--host` flag. The syntax is:

`rsv-control --run --host <hostname> <metric> [<metric> ...]`

where `<metric>` is the full metric name (e.g. `org.osg.general.osg-version`). You can get the metric names from the `--list` output.

``` console
root@host# rsv-control --run \
    --host osg-edu.cs.wisc.edu org.osg.general.osg-version

Running metric org.osg.general.osg-version:

metricName: org.osg.general.osg-version
metricType: status
timestamp: 2010-11-19 11:40:19 CST
metricStatus: OK
serviceType: OSG-CE
serviceURI: osg-edu.cs.wisc.edu
gatheredAt: vdt-itb.cs.wisc.edu
summaryData: OK
detailsData: OSG 1.2.15
EOT
```

Note the *metricStatus* in the example above: that's where you can see whether the run was successful. In this case, it was successful, because it printed OK.

You may run multiple metrics against a single host by specifying multiple metrics to `rsv-control`.

In order to run metrics against multiple hosts, you must run `rsv-control` multiple times, once for each host.

### Running all enabled metrics

When RSV is first installed, it can take up to a day for each enabled metric to run once. A new option is provided to force each metric to run immediately, for all hosts. Use the `--all-enabled` flag along with `--run`. With this option it is not necessary to specify a host - all enabled metrics for all configured hosts will be run (in fact, if you do specify a host, it will be ignored).

``` console
root@host# rsv-control -r --all-enabled

Running metric org.osg.certificates.cacert-expiry (1 of 15)

metricName: org.osg.certificates.cacert-expiry
metricType: status
timestamp: 2010-11-19 13:44:08 CST
metricStatus: OK
serviceType: OSG-CE
serviceURI: osg-edu.cs.wisc.edu
gatheredAt: vdt-itb.cs.wisc.edu
summaryData: OK
detailsData: Security Probe Version: 1.1
OK: CAs are in sync with OSG distribution
EOT


...
```

### Passing extra configuration

If you want to pass extra configuration when running a metric without editing its configuration file, you can make an INI-formatted file and pass it on the command line. For example, you can make a file like this for the `org.osg.srm.srmclient-ping` metric (tmp-srm.ini):

``` dosini
[org.osg.srm.srmclient-ping args]
srm-destination-dir=/srmcache/~
srm-webservice-path=srm/v2/server
```

Then use the `--extra-config-file` parameter and pass the path to the INI file:

``` console
root@host# rsv-control -r --extra-config-file tmp-srm.ini \
    --host osg-edu.cs.wisc.edu:10443 org.osg.srm.srmclient-ping

Running metric org.osg.srm.srmclient-ping:

metricName: org.osg.srm.srmclient-ping
metricType: status
timestamp: 2010-11-19 14:12:35 CST
metricStatus: OK
serviceType: OSG-SRM
serviceURI: osg-edu.cs.wisc.edu:10443
gatheredAt: vdt-itb.cs.wisc.edu
summaryData: OK
detailsData: SRM server running on osg-edu.cs.wisc.edu is alive and responding to the srmping command.
. Details: Storage Resource Manager (SRM) Client version 2.1.5-16
Copyright (c) 2002-2009 Fermi National Accelerator Laboratory

...
```

Enabling and disabling metrics and consumers
--------------------------------------------

Metrics and consumers can be enabled or disabled by `rsv-control` using the `--enable` and `--disable` flags. Note that "enable" and "disable" are desired states (this is similar to `osg-control`). After enabling a metric, you should turn it on if you want it to be running immediately. After disabling a metric that is running, you should still turn it off (a message will print after each of these actions to remind you of this behavior).

### Enabling

The syntax for enabling metrics looks similar to the syntax for running metrics:

`rsv-control --enable --host <hostname> <metric> [<metric> ...]`

You must provide a host to enable the metric against (in order to enable a metric on multiple hosts, you must run `rsv-control` once per host).

``` console
root@host# rsv-control --enable \
    --host osg-edu.cs.wisc.edu org.osg.gip.consistency
Enabling metric 'org.osg.gip.consistency' for host 'osg-edu.cs.wisc.edu'

One or more metrics have been enabled and will be started the next time RSV is started. To turn them on immediately run 'rsv-control --on'.
```

Consumers do not run against a specific host; they process records for all hosts. When enabling consumers, a host is not required (if a host is passed, it will be ignored).

``` console
root@host# rsv-control --enable nagios-consumer
Enabling consumer nagios-consumer
```

### Disabling

The syntax for disabling metrics looks similar to the syntax for running metrics:

`rsv-control --disable --host <hostname> <metric> [<metric> ...]`

You must provide a host to disable the metric against (in order to disable a metric on multiple hosts, you must run `rsv-control` once per host).
- - -``` console -root@host# rsv-control --disable \ - --host vdt-itb.cs.wisc.edu org.osg.local.containercert-expiry -Disabling metric 'org.osg.local.containercert-expiry' for host 'vdt-itb.cs.wisc.edu' - -One or more metrics have been disabled and will not start the next time RSV is started. You may still need to turn them off if they are currently running. -``` - -Consumers do not run against a specific host, they process records for all hosts. When disabling consumers a host is not required (if a host is passed it will be ignored). - -``` console -root@host# rsv-control --disable html-consumer gratia-consumer -Disabling consumer html-consumer -Disabling consumer gratia-consumer - Consumer already disabled -``` - - -Metrics and consumers can both be listed in the same disable command. - -Troubleshooting ---------------- - -### Getting more information from rsv-control -The first step to getting more information is to run `rsv-control` with more verbosity. Use the `--verbose` (`-v`) flag. This flag can be used with any of rsv-control's abilities (run, enable, list, etc). The verbosity levels are: - -- 0 = print nothing -- 1 = print warnings and errors along with usual output of command being run (1 is the default level) -- 2 = adds informational messages -- 3 = full debugging output - - -### Using the RSV verify tool -The `--verify` flag will run some basic checks for your RSV installation: - -``` console -root@host# rsv-control --verify -Testing if Condor-Cron is running... -OK - -Testing if metrics are running... -OK (98 running metrics) - -Testing if consumers are running... -OK (1 running consumers) - -Checking which consumers are configured... -The following consumers are enabled: html-consumer -WARNING: The gratia-consumer is not enabled. This indicates that your - resource is not reporting to OSG. -``` - - -This tool is still under development and it does only basic checks, but it is a good first step when debugging issues. - - -### Running the RSV profiler - -RSV has a tool to collect information useful for troubleshooting into a tarball that can be shared with the developers and support staff. -To use it: - -``` console -root@host# rsv-control --profile -Running the rsv-profiler... -OSG-RSV Profiler -Analyzing... -Making tarball (rsv-profiler.tar.gz) -``` - -!!! note - If you are getting assistance via the trouble ticket system, you must add a `.txt` extension to the tarball so it can be uploaded. - - :::console - root@host# mv rsv-profiler.tar.gz rsv-profiler.tar.gz.txt - - diff --git a/docs/other/gsissh.md b/docs/other/gsissh.md deleted file mode 100644 index f90d5e6ea..000000000 --- a/docs/other/gsissh.md +++ /dev/null @@ -1,156 +0,0 @@ -title: Installing and Maintaining GSI OpenSSH - -Installing and Maintaining GSI OpenSSH -======================================= - -!!! warning - This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022). - See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details. - -This document contains instructions to install and configure the GSI OpenSSH server available in the OSG repository for -use on your cluster. 
- -Before Starting ---------------- - - -Before starting the installation process, consider the following points (consulting [the Reference section -below](#reference) as needed): - -- **User IDs:** If they do not exist already, the installation will create the Linux users `gsisshd` and `gsisshd` - -As with all OSG software installations, there are some one-time (per host) steps to prepare in advance: - -- Ensure the host has [a supported operating system](../release/supported_platforms.md) -- Obtain root access to the host -- Prepare the [required Yum repositories](../common/yum.md) -- Install [CA certificates](../common/ca.md) - -Installing GSI OpenSSH ----------------------- - -Install the GSI OpenSSH rpms: - -``` -root@server # yum install gsi-openssh-server gsi-openssh-clients -``` - -Configuring GSI OpenSSH ------------------------ - -In order to get a running instance of the GSI OpenSSH server, you'll need to change the default configuration. -However, before you go any further, you'll need to decide whether you want GSI OpenSSH to be your primary ssh service or -not (e.g. whether the GSI OpenSSH service will replace your existing SSH service). -Regardless of your choice, you should probably have both services use the same host keys. -This can be done by running the following commands : - -```console -root@host # cd /etc/gsissh -root@host # ln -s /etc/ssh/ssh_host_rsa_key ssh_host_rsa_key -root@host # ln -s /etc/ssh/ssh_host_rsa_key.pub ssh_host_rsa_key.pub -root@host # ln -s /etc/ssh/ssh_host_dsa_key ssh_host_dsa_key -root@host # ln -s /etc/ssh/ssh_host_dsa_key.pub ssh_host_dsa_key.pub -root@host # ln -s /etc/ssh/ssh_host_ecdsa_key ssh_host_ecdsa_key -root@host # ln -s /etc/ssh/ssh_host_ecdsa_key.pub ssh_host_ecdsa_key.pub -root@host # ln -s /etc/ssh/ssh_host_ed25519_key ssh_host_ed25519_key -root@host # ln -s /etc/ssh/ssh_host_ed25519_key.pub ssh_host_ed25519_key.pub -``` - -!!! note - Your system may not have all of these host keys - -If you choose not to replace your existing SSH service, you'll need to change the port setting in the GSI OpenSSH -configuration to another port (e.g. 2222) so that you can run both SSH services at the same time. -This can be done by editing `/etc/gsissh/sshd` and setting `Port 2222`. - - - -!!! note - Regardless of the authorization method used for the user, any - account that will be used with GSI OpenSSH must have a shell - assigned to it and not be locked (e.g., have `!` in the password field of `/etc/shadow`). - -### Configuring authentication - -To configure authentication for GSI OpenSSH, follow the instructions in [the LCMAPS VOMS plugin document](../security/lcmaps-voms-authentication.md#configuring-the-lcmaps-voms-plugin) -to prepare the LCMAPS VOMS plugin. - -Using GSI OpenSSH ------------------- - -The following table gives the commands needed to start, stop, enable, and disable GSI OpenSSH. - -| To... | Run the command... 
| -| :-------------------------------------- | :---------------------------- | -| Start the service | `systemctl start gsisshd` | -| Stop the service | `systemctl stop gsisshd` | -| Enable the service to start on boot | `systemctl enable gsisshd` | -| Disable the service from starting on boot | `systemctl disable gsisshd` | - - -Validating GSI OpenSSH ----------------------- - -After starting the `gsisshd` service you can check if it is running correctly - -``` console -user@client $ grid-proxy-init -Your identity: /DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=User Name -Enter GRID pass phrase for this identity: -Creating proxy ............................................................................................... Done -Your proxy is valid until: Sat Apr 23 08:18:27 2016 -$ gsissh localhost -p 2222 -Last login: Tue Sep 18 16:08:03 2012 from itb4.uchicago.edu -$ -``` - -Troubleshooting ---------------- - -You can get information on troubleshooting errors on the [NCSA page](http://grid.ncsa.illinois.edu/ssh/ts_server.html). - -To troubleshoot LCMAPS authorization, you can add the following to `/etc/sysconfig/gsisshd` and choose a higher debug -level: - -``` bash -# level 0: no messages, 1: errors, 2: also warnings, 3: also notices, -# 4: also info, 5: maximum debug -LCMAPS_DEBUG_LEVEL=2 -``` - -Output goes to `/var/log/messages` or `journalctl` by default. - - -Help ----- - -To get assistance please use this [Help Procedure](../common/help.md). - - -Reference ----------- - -### Useful configuration and log files - -Configuration Files - -| Service or Process | Configuration File | Description | -|:-------------------|:--------------------------|:----------------------------------| -| gsisshd | `/etc/gsissh/sshd_config` | Configuration file | -| gsisshd | `/etc/sysconfig/gsisshd` | Environment variables for gsisshd | -| gsisshd | `/etc/lcmaps.db` | LCMAPS configuration | - -Log Files - -| Service or Process | Log File | Description | -|:-------------------|:--------------------|:-----------------| -| gsisshd | `/var/log/messages` | All log messages | - -Other Files - -| Service or Process | File | Description | -|:-------------------|:----------------------------------|:-----------------| -| gsisshd | `/etc/grid-security/hostcert.pem` | Host certificate | -| gsisshd | `/etc/grid-security/hostkey.pem` | X.509 host key | -| gsisshd | `/etc/gsissh/ssh_host_rsa_key` | RSA Host key | - diff --git a/docs/security/lcmaps-voms-authentication.md b/docs/security/lcmaps-voms-authentication.md deleted file mode 100644 index 2cc6417e3..000000000 --- a/docs/security/lcmaps-voms-authentication.md +++ /dev/null @@ -1,486 +0,0 @@ -title: Installing and Maintaining the LCMAPS VOMS Plugin - -Installing and Maintaining the LCMAPS VOMS Plugin -================================================= - -!!! warning - This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022). - See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details. - -LCMAPS is a software library used on [HTCondor-CE](../compute-element/install-htcondor-ce.md), [GridFTP](../data/gridftp.md), and -[XRootD](../data/xrootd/install-storage-element.md) hosts for mapping grid certificates of incoming connections to specific -Unix accounts. -The LCMAPS VOMS plugin enables LCMAPS to make mapping decisions based on the VOMS attributes of grid certificates, e.g. -`/cms/Role=production/Capability=NULL`. 
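
For instance, you can print the exact FQAN strings that LCMAPS will evaluate for your own credentials (a quick check, assuming you already have a VOMS proxy and the `voms-clients` tools installed; the FQAN shown is the example attribute from above):

```console
user@host $ voms-proxy-info -fqan
/cms/Role=production/Capability=NULL
```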

The OSG provides a default set of mappings from VOMS attributes to Unix accounts.
By configuring LCMAPS, you can override these mappings, including changing the Unix account that a VO is mapped to; adding custom mappings for specific users and VOMS attributes; and/or banning specific users and VOMS attributes.

Use this page to learn how to install and configure the LCMAPS VOMS plugin to authenticate users to access your resources on a per-VO basis.


Installing the LCMAPS VOMS Plugin
---------------------------------

To install the LCMAPS VOMS plugin, make sure that your host is up to date before installing the required packages:

1. Clean yum cache:

        :::console
        root@host # yum clean all --enablerepo=*

2. Update software:

        :::console
        root@host # yum update

    This command will update **all** packages.

3. Install `lcmaps`, the default mapfile, and the configuration tools:

        :::console
        root@host # yum install lcmaps vo-client-lcmaps-voms osg-configure-misc


Configuring the LCMAPS VOMS Plugin
----------------------------------

The following section describes the steps required to configure the LCMAPS VOMS plugin for authentication.
Additionally, there are [optional configuration](#optional-configuration) instructions if you need to make changes to the default mappings.

### Supporting mapped VOs and users

Ensure Unix accounts exist for each VO, VO role, VO group, or user you choose to support in the [mapfiles](#configuration-files):

1. Consult the default VO mappings in `/usr/share/osg/voms-mapfile-default` to determine the mapped Unix account names.
   Each of the mapfiles has the following format:

        "<VOMS attribute pattern>" <Unix account>

2. Create Unix accounts for each VO, VO role, VO group, and user that you wish to support.
   The full list of VOs is located in the [OSG topology](https://github.com/opensciencegrid/topology/tree/master/virtual-organizations).
   You are not expected to support all the VOs.
   If you would like to support opportunistic usage, we recommend creating the following Unix accounts:

    | **VO name**                                                                                               | **Unix account(s)** |
    |-----------------------------------------------------------------------------------------------------------|---------------------|
    | [GLOW](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/GLOW.yaml)           | `glow`              |
    | [OSG](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/OSG.yaml)             | `osg`               |
    | [ATLAS](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/ATLAS.yaml)         | `usatlas3`          |
    | [CMS](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/CMS.yaml)             | `cmsuser`           |
    | [Fermilab](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/Fermilab.yaml)   | `fnalgrid`          |
    | [HCC](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/HCC.yaml)             | `hcc`               |
    | [Gluex](https://github.com/opensciencegrid/topology/blob/master/virtual-organizations/Gluex.yaml)         | `gluex`             |

3. Edit `/etc/osg/config.d/30-gip.ini` and specify the supported VOs per [Subcluster or ResourceEntry section](../other/configuration-with-osg-configure.md#subcluster-resource-entry-for-agis-glideinwms-entry):

        :::ini
        allowed_vos="VO1,VO2..."

### Applying configuration settings

Making changes to the OSG configuration files in the `/etc/osg/config.d` directory does not apply those settings to software automatically.
-For the OSG settings, use the [osg-configure](../other/configuration-with-osg-configure.md) tool to validate (to a limited -extent) and apply the settings to the relevant software components. -If instead you wish to manage the LCMAPS VOMS plugin configuration yourself, skip to the -[manual configuration section](#manual-configuration). - -1. Make all changes to `.ini` files in the `/etc/osg/config.d` directory. - - !!!note - This document only describes the critical settings for the LCMAPS VOMS plugin and related software. - You may need to configure other software that is installed on your host, too. - -1. Validate the configuration settings: - - :::console - root@host # osg-configure -v - -1. Once the validation command succeeds without errors, apply the configuration settings: - - :::console - root@host # osg-configure -c - - - -### Optional configuration - -The following subsections contain information on mapping or banning users by their -certificates' Distinguished Names (DNs) or by their proxies' VOMS attributes. -Any optional configuration is to be performed after the installation and configuration sections above. - -For a table of the configuration files and their order of evaluation, consult the [reference section](#configuration-files). - -- [Mapping VOs](#mapping-vos) -- [Mapping users](#mapping-users) -- [Banning VOs](#banning-vos) -- [Banning users](#banning-users) -- [Mapping using all FQANs](#mapping-using-all-fqans) - -#### Mapping VOs - -To map VOs, VO roles, or VO groups to Unix accounts based on their VOMS attributes, create `/etc/grid-security/voms-mapfile`. -An example of the format of a `voms-mapfile` follows: - -``` -# map GLOW jobs in the chtc group to the 'glow1' Unix account. -"/GLOW/chtc/*" glow1 -# map GLOW jobs with the htpc role to the 'glow2' Unix account. -"/GLOW/Role=htpc/*" glow2 -# map other GLOW jobs to the 'glow' Unix account. -"/GLOW/*" glow -``` - -Each non-commented line is a shell-style pattern which is compared against the user's VOMS attributes, and a Unix -account that the user will be mapped to if the pattern matches. -The patterns are compared in the order they are listed in. Therefore, more general patterns should be placed later in -the file. - -!!!note - The Unix account must exist for the user to be mapped. - If a VO's Unix account is missing, that VO will not be able to access your resources. - - Additionally, if you map VOMS attributes to a non-existent user in `/etc/grid-security/voms-mapfile`, - `/usr/share/osg/voms-mapfile-default` will be considered next to find a mapping. - The best way to ban a VO is edit `/etc/grid-security/ban-voms-mapfile` as described in [Banning VOs](#banning-vos) - below. - Do not edit `voms-mapfile-default` as your changes will be overwritten upon updates. - - -#### Mapping users - -To map specific users to Unix accounts based on their certificates' DNs, create `/etc/grid-security/grid-mapfile`. - -!!!note - The openssl version 1.1.x command prints the subject DN in a slightly different format. - OpenSSL version 1.1 is present on Enterprise Linux 8 systems. - The new format is a comma separated list of attributes. - You must convert that back to the older format for our map files. 
Each attribute must start with a `/`, there must be no spaces around the `=`, and the commas between attributes must be removed:

        DC = org, DC = opensciencegrid, O = Open Science Grid, OU = People, CN = Matyas Selmeci

    should be written as:

        /DC=org/DC=opensciencegrid/O=Open Science Grid/OU=People/CN=Matyas Selmeci


An example of the format of a `grid-mapfile` follows:

```
# map Matyas's FNAL DN to the 'matyas' Unix account
"/DC=gov/DC=fnal/O=Fermilab/OU=People/CN=Matyas Selmeci/CN=UID:matyas" matyas
```


!!! note
    The Unix account must exist for the user to be mapped. If a user's Unix account is missing, that user will not be able to access your resources.


#### Banning VOs

`/etc/grid-security/ban-voms-mapfile` is used to ban an entire VO or a role within a VO from accessing resources on your machine.
An example of the format of a `ban-voms-mapfile` follows:

```
# ban CMS production jobs
"/cms/Role=production/*"
```

Each non-commented line is a shell-style pattern which is compared against a user's VOMS attributes.
If the pattern matches, that user will be unable to access your resources.

!!!danger
    When banning VOs, you must restart the services using LCMAPS VOMS authentication (e.g. `condor-ce`, `globus-gridftp-server`, `xrootd`, etc.) to clear any authentication caches. In the case of XRootD, if the service is not restarted, the change could take up to 12 hours to take effect. This can be modified by defining the `authzto` option in the `sec.protocol` configuration attribute, e.g.:

        sec.protocol /usr/lib64 gsi \
            -certdir:/etc/grid-security/certificates \
            -cert:/etc/grid-security/xrd/xrdcert.pem \
            ...
            -authzto:3600

    The units of `-authzto` are seconds, which means that the above sets the LCMAPS cache lifetime to 1 hour.

!!!warning
    `/etc/grid-security/ban-voms-mapfile` *must* exist, even if you are not banning any VOs.
    In that case, the file should not contain any entries. If the file does not exist, LCMAPS will ban every user.


#### Banning users

`/etc/grid-security/ban-mapfile` is used to ban specific users from accessing your resources based on their certificates' DNs. An example of the format of a `ban-mapfile` follows:

```
# ban Matyas's FNAL DN
"/DC=gov/DC=fnal/O=Fermilab/OU=People/CN=Matyas Selmeci/CN=UID:matyas"
```

!!!danger
    When banning users, you must restart the services using LCMAPS VOMS authentication (e.g. `condor-ce`, `globus-gridftp-server`, `xrootd`, etc.) to clear any authentication caches. In the case of XRootD, if the service is not restarted, the change could take up to 12 hours to take effect. This can be modified by defining the `authzto` option in the `sec.protocol` configuration attribute, e.g.:

        sec.protocol /usr/lib64 gsi \
            -certdir:/etc/grid-security/certificates \
            -cert:/etc/grid-security/xrd/xrdcert.pem \
            ...
            -authzto:3600

    The units of `-authzto` are seconds, which means that the above sets the LCMAPS cache lifetime to 1 hour.

!!!warning
    `/etc/grid-security/ban-mapfile` *must* exist, even if you are not banning any users.
    In that case, the file should be blank. If the file does not exist, LCMAPS will ban every user.


### Mapping using all FQANs

By default, the LCMAPS VOMS plugin only considers the first FQAN of a VOMS proxy for mapping.
If you want to consider all FQANs, you must set the appropriate option.
- -- If you are using osg-configure, set `all_fqans = True` in `10-misc.ini`, then run `osg-configure -c` - -- If you are configuring `lcmaps.db` manually (see [manual configuration](#manual-configuration) below), - add `"-all-fqans"` to the module definitions for `vomsmapfile` and `defaultmapfile` - -Using the LCMAPS VOMS Plugin ----------------------------- - -LCMAPS is a software library that is called for authentication; -therefore, there are no running services and it does not have to be invoked manually. - -Validating the LCMAPS VOMS Plugin VO Mappings ---------------------------------------------- - -To validate the LCMAPS VOMS plugin by itself, use the following procedure to test mapping your own cert to a user: - -1. Verify your DN is *not* in `/etc/grid-security/grid-mapfile`, or else it will generate a false positive -1. Verify your DN is *not* in `/etc/grid-security/ban-mapfile`, or else it will generate a false negative -1. Install the `llrun` and `voms-clients` packages: - - :::console - root@host # yum install llrun voms-clients - -1. As an unprivileged user, create a VOMS proxy (filling in `` with a VO you are a member of): - - :::console - user@host $ voms-proxy-init -voms - -1. Verify that your credentials are mapped as expected: - - :::console - user@host $ llrun -s -l mode=pem,policy=authorize_only,db=/etc/lcmaps.db \ - -p/tmp/x509up_u`id -u` - -If you did not get correctly mapped, check your proxy's FQAN by running: -``` console -user@host $ voms-proxy-info -fqan -``` -and make sure it matches one of the patterns in `/etc/grid-security/voms-mapfile` or -`/usr/share/osg/voms-mapfile-default`, and does not match any patterns in `/etc/grid-security/ban-voms-mapfile`. - -Troubleshooting the LCMAPS VOMS Plugin --------------------------------------- - -LCMAPS logs to `journalctl` and the verbosity of the logging can be increased by modifying the appropriate -configuration and restarting the relevant service. -This section outlines the configuration necessary to raise the debug level for the different hosts that can use LCMAPS -VOMS authentication as well as common LCMAPS VOMS authentication issues. - -### HTCondor-CE hosts ### - -If you are troubleshooting an HTCondor-CE host, follow these instructions to raise the LCMAPS debug level: - -1. Add the following text to `/etc/sysconfig/condor-ce`: - - :::bash - export LCMAPS_DEBUG_LEVEL=5 - # optional (uncomment the following line to output log messages to a file): - # export LCMAPS_LOG_FILE=/tmp/lcmaps.log - -1. Disable HTCondor-CE authentication caches by creating `/etc/condor-ce/config.d/99-disablegsicache.conf` with the - following contents: - - GSS_ASSIST_GRIDMAP_CACHE_EXPIRATION = 0 - -1. Restart the [condor-ce](https://htcondor.github.io/htcondor-ce/v5/verification/#managing-htcondor-ce-services) service - -!!! tip - After you've completed troubleshooting, remember to revert the changes above and restart services! - -### XRootD hosts ### - -If you are troubleshooting an XRootD host, follow these instructions to raise the LCMAPS debug level: - -1. Choose the configuration file to edit based on the following table: - - | If you are running XRootD in... | Then modify the following file... | - |:--------------------------------|:------------------------------------| - | Standalone mode | `/etc/xrootd/xrootd-standalone.cfg` | - | Clustered mode | `/etc/xrootd/xrootd-clustered.cfg` | - -1. Set `loglevel=5` under the `-authzfunparms` of the `sec.protocol /usr/lib64 gsi` line. 
For example:

        :::file hl_lines="6"
        sec.protocol /usr/lib64 gsi -certdir:/etc/grid-security/certificates \
            -cert:/etc/grid-security/xrootd/xrootdcert.pem \
            -key:/etc/grid-security/xrootd/xrootdkey.pem \
            -crl:1 \
            -authzfun:libXrdLcmaps.so \
            -authzfunparms:lcmapscfg=/etc/xrootd/lcmaps.cfg,loglevel=5,policy=authorize_only \
            -gmapopt:10 -gmapto:0

1. Restart the [xrootd](../data/xrootd/install-storage-element.md#managing-xrootd-services) service

!!! tip
    After you've completed troubleshooting, remember to revert the changes above and restart services!

### GridFTP hosts ###

If you are troubleshooting a GridFTP host, follow these instructions to raise the LCMAPS debug level:

1. Add the following text to `/etc/sysconfig/globus-gridftp-server`:

        :::bash
        export LCMAPS_DEBUG_LEVEL=5
        # optional (uncomment the following line to output log messages to a file):
        # export LCMAPS_LOG_FILE=/tmp/lcmaps.log

1. Restart the [globus-gridftp-server](../data/gridftp.md#managing-gridftp) service.

!!! tip
    After you've completed troubleshooting, remember to revert the changes above and restart services!

### Common issues

#### A user/VO still has access to my XRootD server after adding them to the ban files

The best way to ensure that a user/VO is immediately banned is to restart the XRootD server after adding the DN or VOMS attributes to the corresponding ban file.
If that is not possible, the lifetime of the LCMAPS cache for XRootD can be controlled by setting the parameter `authzto` within the `sec.protocol` configuration attribute, e.g.:

    sec.protocol /usr/lib64 gsi \
        -certdir:/etc/grid-security/certificates \
        -cert:/etc/grid-security/xrd/xrdcert.pem \
        ...
        -authzto:3600

The units of `-authzto` are seconds, which means that the above sets the LCMAPS cache lifetime to 1 hour.
The default value for this parameter is 12 hours.


#### Wrong version of GridFTP

If you have the EPEL version of the GridFTP server, you may see error messages in `journalctl` or the location specified by `LCMAPS_LOG_FILE`.

**Symptoms**

```
Apr 11 13:51:41 atlas-hub globus-gridftp-server: You are still root after the LCMAPS execution. The implicit root-mapping safety is enabled. See documentation for details
```

**Next actions**

1. If the versions of the `globus-gridftp-server-*` packages do not end in `osgXX.elY`, continue with these instructions.
   To check the version of your `globus-gridftp-server-*` packages, run the following command:

        :::console
        user@host $ rpm -qa 'globus-gridftp*'

1. Verify that the [priority](../common/yum.md#install-the-yum-priorities-plugin-el7) of the OSG repositories is set properly

1. Clean your yum cache:

        :::console
        root@host # yum clean all --enablerepo=*

1. Reinstall `globus-gridftp-server`:

        :::console
        root@host # yum update globus-gridftp-server

Getting Help
------------

To get assistance, please use [this page](../common/help.md).
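
When reporting an authentication problem, it can also help to attach the LCMAPS messages themselves. As a minimal sketch (assuming a systemd host and the GridFTP service from the section above; for HTCondor-CE or XRootD, substitute the appropriate unit name), you could collect the recent log lines into a text file:

```console
root@host # journalctl -u globus-gridftp-server --since "1 hour ago" | grep -i lcmaps > /tmp/lcmaps-debug.txt
```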
- -Reference ---------- - -### Configuration Files - -The files are evaluated in the following order, with earlier files taking precedence over later ones: - -| File | Provider | Purpose | -|:--------------------------------------|:---------|:------------------| -| `/etc/grid-security/ban-mapfile` | Admin | Ban DNs | -| `/etc/grid-security/ban-voms-mapfile` | Admin | Ban VOs | -| `/etc/grid-security/grid-mapfile` | Admin | Map DNs | -| `/etc/grid-security/voms-mapfile` | Admin | Map VOs | -| `/usr/share/osg/voms-mapfile-default` | OSG | Map VOs (default) | - -!!! warning - `/usr/share/osg/voms-mapfile-default` is not meant to be edited and will be overwritten on upgrades. - All VO mappings can be overridden by editing the above files in `/etc/grid-security`. - -### Manual Configuration - -This section is intended for use as reference if you choose to forego configuring the LCMAPS VOMS plugin via -osg-configure (i.e., if you prefer a configuration management system like [Ansible](https://www.ansible.com/) or -[Puppet](https://puppet.com/)). -Therefore, the following instructions serve as a replacement for [this section](#applying-configuration-settings) above. - -LCMAPS is configured in `/etc/lcmaps.db` and since the VOMS plugin is a newer component, configuration for it may not -be present in your existing `/etc/lcmaps.db` file. - -1. Ensure the following lines are present in the "Module definitions" section (the top section, before - `authorize_only`) of `/etc/lcmaps.db`: - - gridmapfile = "lcmaps_localaccount.mod" - "-gridmap /etc/grid-security/grid-mapfile" - banfile = "lcmaps_ban_dn.mod" - "-banmapfile /etc/grid-security/ban-mapfile" - banvomsfile = "lcmaps_ban_fqan.mod" - "-banmapfile /etc/grid-security/ban-voms-mapfile" - vomsmapfile = "lcmaps_voms_localaccount.mod" - "-gridmap /etc/grid-security/voms-mapfile" - defaultmapfile = "lcmaps_voms_localaccount2.mod" - "-gridmap /usr/share/osg/voms-mapfile-default" - - verifyproxynokey = "lcmaps_verify_proxy2.mod" - "--allow-limited-proxy" - "--discard_private_key_absence" - " -certdir /etc/grid-security/certificates" - -1. Edit the `authorize_only` section so that it contains only the following uncommented lines: - - verifyproxynokey -> banfile - banfile -> banvomsfile | bad - banvomsfile -> gridmapfile | bad - gridmapfile -> good | vomsmapfile - vomsmapfile -> good | defaultmapfile - defaultmapfile -> good | bad - -1. Edit `/etc/grid-security/gsi-authz.conf` and ensure that it contains the following line with a newline at the end: - - globus_mapping liblcas_lcmaps_gt4_mapping.so lcmaps_callout diff --git a/docs/security/user-certs.md b/docs/security/user-certs.md deleted file mode 100644 index 5649a1292..000000000 --- a/docs/security/user-certs.md +++ /dev/null @@ -1,207 +0,0 @@ -title: User Certificates - -User Certificates -================= - -!!! warning - This document is for software that will no longer be supported after the OSG 3.5 retirement (beginning of May 2022). - See the [Release Series Support Policy](https://opensciencegrid.org/technology/policy/release-series/) for details. - -!!! note - This document describes how to get and set up a **personal** certificate (also called a grid user certificate). - For instructions on how to get **host** certificates, see the [Host Certificates document](host-certs.md). - -Getting a User Certificate --------------------------- - -This section describes how to get and set up a personal certificate to use on OSG. 
You need a user certificate if you are going to interact directly with OSG resources or infrastructure, including activities such as:

- Managing OASIS
- Directly running jobs on OSG resources
- Directly interacting with OSG storage elements
- Obtaining private contact information from OSG systems

Currently, you can get a user certificate from CILogon.
You may also be able to use other CAs to get a certificate; if your virtual organization (VO) requires that you get a certificate from a different CA, [contact your VO Support Center](https://github.com/opensciencegrid/topology/tree/master/virtual-organizations) for instructions.

### Know your responsibilities

If your account or user certificate is compromised, you **must** notify the issuer of your certificate.
In addition, you should update your certificate and revoke the old certificate if any of the information in the certificate (such as name or email address) changes.
For the CILogon RA, send email to [ca@cilogon.org](mailto:ca@cilogon.org).
Additional responsibilities required by the CILogon CA are given on [their page](http://ca.cilogon.org/responsibilities).


### Getting a certificate from CILogon

You will have to obtain your user certificate using the [CILogon web UI](https://cilogon.org/).
Follow the steps below to get a user certificate:

1. Open the CILogon page, <https://cilogon.org/>, in your browser of choice.
1. First, either search for your institution and select it, or scroll through the list and do the same.

    ![Institution Selection](../img/cilogon_select_idp.png)

    !!! warning
        Do not use Google, GitHub, or ORCID as providers since they are not widely supported in the OSG.
        If your institution is not on the list, please contact your institution's IT support to see if they can support CILogon.

1. Click the `Log On` button and enter your institutional credentials if prompted.
1. After successfully entering your credentials, click on the "Create Password-Protected Certificate" link.
1. Enter a password that is at least 12 characters long and then click on the `Get New Certificate` button.
1. Click the `Download Your Certificate` button to download your certificate in `.p12` format.
   The certificate will be protected using the password you entered in the previous step.


### Certificate formats

Your user certificate can be stored in a few different formats.
The two most common formats used in OSG are the [PKCS12](https://en.wikipedia.org/wiki/PKCS_12) and [PEM](https://en.wikipedia.org/wiki/Privacy-Enhanced_Mail) formats.
In the PEM format, your user certificate is stored in two separate files: one for the certificate and another for the private key.
The PKCS12 format stores the certificate and private key in a single file along with an optional certificate chain.
Most OSG user tools will work with both but will try to use PEM files first.

To convert a PKCS12 file to PEM files, do the following.

1. First, extract your user certificate from your PKCS12 file by running the following command. You'll be prompted for the password you used to create the certificate. The invocation assumes that the PKCS12 file is called `usercred.p12`. After running, the PEM certificate will be written to `usercert.pem`.

        :::console
        user@host $ openssl pkcs12 -in usercred.p12 -out usercert.pem -nodes -clcerts -nokeys
        Enter Import Password:
        MAC verified OK

1. Second, extract the private key by running the following command. You'll be prompted for two different passwords.
The first prompt will be for the password that you used to create the certificate. The second prompt will be for the password that will encrypt the PEM key that will be created. As before, the invocation assumes that your PKCS12 certificate is located in `usercred.p12`. After running, the PEM file with your private key will be written to `userkey.pem`.

        :::console
        user@host $ openssl pkcs12 -in usercred.p12 -out userkey.pem -nocerts
        Enter Import Password:
        MAC verified OK
        Enter PEM pass phrase:
        Verifying - Enter PEM pass phrase:

Using Your User Certificate
---------------------------

1. The standard location for user certificates is the `.globus` directory in the user's home directory:

        :::console
        user@host $ mkdir ~/.globus
        user@host $ cp userkey.pem ~/.globus/
        user@host $ cp usercert.pem ~/.globus/
        user@host $ cp usercred.p12 ~/.globus/

1. To generate a proxy, use the command `voms-proxy-init`:

        :::console
        user@host $ voms-proxy-init

1. (Optional) If the user certificates are not in `~/.globus`, then the path has to be passed to `voms-proxy-init`:

        :::console
        user@host $ voms-proxy-init --cert <path to certificates>/usercert.pem --key <path to certificates>/userkey.pem

1. In order to find the Distinguished Name (DN), issuer, and lifetime of a certificate:

        :::console
        user@host $ openssl x509 -in <path to certificates>/usercert.pem -noout -subject -issuer -enddate

!!! note
    For admins trying to validate a service, add your user DN to the [grid-map file](lcmaps-voms-authentication.md#mapping-users) of the service.


Revoking Your User Certificate
------------------------------

If the security of your certificate or private key has been compromised, you have a responsibility to revoke the certificate.
In addition, if your name or email address changes, you must revoke your certificate and get a new one with the correct information.

If you have a CILogon-issued certificate, contact [ca@cilogon.org](mailto:ca@cilogon.org) in order to revoke your certificate.
If you received a certificate from another CA, please contact the CA to initiate a certificate revocation.


Getting a Certificate from a Service Provider with cigetcert
------------------------------------------------------------

You may also get a user certificate from a SAML 2.0 Service Provider such as your home institution or XSEDE.
This kind of certificate is short-lived, typically valid only for a week.
Therefore it is not suitable for use in your browser.
However, it is useful for command-line access to site services such as compute or storage.

You will need to use the `cigetcert` tool to get a certificate this way.
Use yum to install the `cigetcert` package from the OSG repositories.

This is a new way of getting a certificate and does not work with all institutions.
To get a list of institutions supported by `cigetcert`, run:

```console
user@host $ cigetcert --listinstitutions
Clemson University
Fermi National Accelerator Laboratory
LIGO Scientific Collaboration
LTER Network
...
```

To get a certificate, run:

```console
user@host $ cigetcert -i "<institution name>"
Authorizing ...... authorized
Fetching certificate ..... fetched
Storing certificate in /tmp/x509up_u46142
Your certificate is valid until: Fri Apr 13 17:03:13 2018
```

Authentication is controlled by the institution; depending on the institution, you may need a valid Kerberos token, or you will be prompted for a password.

If all goes well, you should see output similar to what's above.
The certificate is created in `/tmp/x509up_u<uid>`, which is the same place proxies are created by `grid-proxy-init`.

You may specify default arguments in the `CIGETCERTOPTS` environment variable.
This can save you from having to type in the entire institution name every time you want a cert.
For example, to always use FNAL as the institution, put this in your `.bashrc`:

```bash
export CIGETCERTOPTS="-i 'Fermi National Accelerator Laboratory'"
```

Your VO may also provide specific instructions for how to best use this tool.
Contact your VO support center for details.

Finally, `cigetcert` has advanced features, such as the ability to load configuration from a server, or store the cert on a MyProxy server.
See the [manual page for cigetcert](http://htmlpreview.github.io/?https://github.com/fermitools/cigetcert/blob/master/cigetcert.html) for more information.


### Using cigetcert with XSEDE credentials

`cigetcert` also works with XSEDE as the service provider.
To use XSEDE credentials, you will first need an account at <https://portal.xsede.org/>.
In addition, you _need_ to set up two-factor authentication with XSEDE; see their [MFA documentation](https://portal.xsede.org/mfa) for details.
Push notifications using the Duo Mobile app are required.

Once you have set all those up, run `cigetcert` as follows:

```console
user@host $ cigetcert -u <portal username> -i XSEDE
```

`<portal username>` is your username at portal.xsede.org.
You will get prompted to "Enter XSEDE Kerberos Password."
Enter the password for your account at portal.xsede.org.
You should then get a 2FA authentication request with Duo Mobile; once you accept it, `cigetcert` will issue the certificate.

Getting Help
------------

To get assistance, please use [this page](../common/help.md).


References
----------

- [Useful OpenSSL commands (from NCSA)](http://security.ncsa.illinois.edu/research/grid-howtos/usefulopenssl.html) - e.g. how to convert the format of your certificate.
- [Manual page for cigetcert](http://htmlpreview.github.io/?https://github.com/fermitools/cigetcert/blob/master/cigetcert.html)

From c962638612b6858c63a3adb69b3249ee648f7873 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 14:04:10 -0500
Subject: [PATCH 3/6] grid out gridftp.md references (SOFTWARE-5162)

---
 docs/data/xrootd/install-storage-element.md | 53 ---------------------
 docs/detailed-overview.md                   |  4 --
 2 files changed, 57 deletions(-)

diff --git a/docs/data/xrootd/install-storage-element.md b/docs/data/xrootd/install-storage-element.md
index 75a8acb04..3c1a7b15f 100644
--- a/docs/data/xrootd/install-storage-element.md
+++ b/docs/data/xrootd/install-storage-element.md
@@ -276,7 +276,6 @@ Modify `/etc/fstab` by adding the following entries:

Replace `/mnt/xrootd` with the path that you would like to access with.
-This should also match the GridFTP settings for the `XROOTD_VMP` local path.
Create `/mnt/xrootd` directory. Make sure the xrootd user exists on the system. Once you are finished, you can mount it:

    :::file
@@ -342,58 +341,6 @@ root@host # service frm_xfrd start
root@host # service frm_purged start
```

-(Optional) Installing a GridFTP Server
---------------------------------------
-
-The Globus GridFTP server can be installed alongside an XRootD storage element to provide GridFTP-based access to the
-storage.
-
-!!! note "See Also"
-    OSG has extensive documentation on setting up a GridFTP server; this section is an
-    abbreviated version documenting the special steps needed for XRootD integration.
- You may also find the following useful: - - - [Basic GridFTP Install](../gridftp.md). Additionally covers service planning topics. - - [Load-balanced GridFTP Install](../load-balanced-gridftp.md). Covers the creation of - a load-balanced GridFTP service using multiple servers. - -Prior to following this installation guide, verify the host certificates and networking is configured correctly as in -the [basic GridFTP install](../gridftp.md). - -### Installation - -GridFTP support for XRootD-based storage is provided by the `osg-gridftp-xrootd` meta-package: - -``` console -root@host # yum install osg-gridftp-xrootd -``` - -### Configuration - -For information on how to configure authentication for your GridFTP installation, please refer to the -[configuring authentication section of the GridFTP guide](../gridftp.md#configuring-authentication). - -Edit `/etc/sysconfig/globus-gridftp-server` to set `XROOTD_VMP` to use your XRootD redirector. - - :::bash - export XROOTD_VMP="redirector:1094:/local_path=/remote_path" - -!!! warning - The syntax of `XROOTD_VMP` is tricky; make sure to use the following guidance: - - - **Redirector**: The hostname and domain of the local XRootD redirector server. - - **local_path**: The full local path exported by the GridFTP server. For example `/mystorage/export/data/store` - - **remote_path**: The XRootD path that will be mounted at **local_path**. - -When `xrootd-dsi` is enabled, GridFTP configuration changes should go into `/etc/xrootd-dsi/gridftp-xrootd.conf`, not -`/etc/gridftp.conf`. -Sites should review any customizations made in the latter and copy them as necessary. - -You can use the FUSE mount in order to test POSIX access to xrootd in the GridFTP server. -You should be able to run Unix commands such as `ls /mnt/xrootd` and see the contents of the XRootD server. - -For log / config file locations and system services to run, see the [basic GridFTP install](../gridftp.md). - Using XRootD ------------ diff --git a/docs/detailed-overview.md b/docs/detailed-overview.md index 1d835f69a..c63dd6f13 100644 --- a/docs/detailed-overview.md +++ b/docs/detailed-overview.md @@ -102,10 +102,6 @@ installed [CVMFS](worker-node/install-cvmfs.md) on your worker nodes. - [Install Frontier Squid](data/frontier-squid.md), an HTTP caching proxy service. - Storage element: - - Existing POSIX-based systems (such as NFS, Lustre, or GPFS): - - [Install standalone OSG GridFTP](data/gridftp.md): GridFTP server - - (optional) [Install load-balanced OSG GridFTP](data/load-balanced-gridftp.md): when a single GridFTP server - isn't enough - Hadoop Distributed File System (HDFS): - [Hadoop Overview](data/hadoop-overview.md): HDFS information, planning, and guides - XRootD: From 84be40c0695e02677a7c059f25ecf21470e41c99 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Fri, 6 May 2022 14:06:41 -0500 Subject: [PATCH 4/6] grid out hadoop (SOFTWARE-5162) --- docs/data/xrootd/install-standalone.md | 21 --------------------- docs/detailed-overview.md | 2 -- 2 files changed, 23 deletions(-) diff --git a/docs/data/xrootd/install-standalone.md b/docs/data/xrootd/install-standalone.md index 50c170881..a0c7c33ca 100644 --- a/docs/data/xrootd/install-standalone.md +++ b/docs/data/xrootd/install-standalone.md @@ -120,27 +120,6 @@ The following configuration steps are optional and will likely not be required f If you do not need any of the following special configurations, skip to [the section on using XRootD](#using-xrootd). -#### Enabling Hadoop support (deprecated, EL 7 Only) - -!!! 
From 84be40c0695e02677a7c059f25ecf21470e41c99 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 14:06:41 -0500
Subject: [PATCH 4/6] grid out hadoop (SOFTWARE-5162)

---
 docs/data/xrootd/install-standalone.md | 21 ---------------------
 docs/detailed-overview.md              |  2 --
 2 files changed, 23 deletions(-)

diff --git a/docs/data/xrootd/install-standalone.md b/docs/data/xrootd/install-standalone.md
index 50c170881..a0c7c33ca 100644
--- a/docs/data/xrootd/install-standalone.md
+++ b/docs/data/xrootd/install-standalone.md
@@ -120,27 +120,6 @@ The following configuration steps are optional and will likely not be required f
 If you do not need any of the following special configurations, skip to
 [the section on using XRootD](#using-xrootd).
 
-#### Enabling Hadoop support (deprecated, EL 7 Only)
-
-!!! info "OSG 3.5 end-of-life"
-    Hadoop is no longer supported in OSG 3.6 and OSG 3.5 will reach its end-of-life at the
-    [beginning of May 2022](../../release/release_series.md#series-overviews).
-
-Hadoop File System (HDFS) based sites should utilize the `xrootd-hdfs` plugin to allow XRootD to access their storage:
-
-1. Install the XRootD HDFS plugin package:
-
-        :::console
-        root@host # yum install xrootd-hdfs
-
-1. Add the following configuration to `/etc/xrootd/xrootd-clustered.cfg`:
-
-        :::file
-        ofs.osslib /usr/lib64/libXrdHdfs.so
-
-For more information, see [the HDFS installation documents](../install-hadoop.md).
-
 #### Enabling multi-user support
 
 !!! warning "Requirements for XRootD-Multiuser with VOMS FQANs"
diff --git a/docs/detailed-overview.md b/docs/detailed-overview.md
index c63dd6f13..71754ad00 100644
--- a/docs/detailed-overview.md
+++ b/docs/detailed-overview.md
@@ -102,8 +102,6 @@ installed [CVMFS](worker-node/install-cvmfs.md) on your worker nodes.
 - [Install Frontier Squid](data/frontier-squid.md), an HTTP caching proxy service.
 - Storage element:
-    - Hadoop Distributed File System (HDFS):
-        - [Hadoop Overview](data/hadoop-overview.md): HDFS information, planning, and guides
     - XRootD:
         - [XRootd Overview](./data/xrootd/overview.md): XRootD information, planning, and guides
         - [Install XRootD Server](./data/xrootd/install-storage-element.md): XRootD redirector installation
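Removals like these are easy to leave half-finished: a page disappears from the nav but links to it survive elsewhere in the tree. A quick sanity check for leftover references to the files this series deletes (a sketch, run from the repository root; the pattern list is illustrative rather than exhaustive):

```console
user@host $ grep -rn -e 'gridftp' -e 'hadoop' -e 'user-certs' -e 'lcmaps-voms' docs/ mkdocs.yml
```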
From 9301f290e6a2df1a0e5f27da533ad92ed544fd09 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 14:10:10 -0500
Subject: [PATCH 5/6] grind out user-certs (SOFTWARE-5162)

---
 docs/security/certificate-management.md | 2 +-
 docs/security/host-certs/digicert.md    | 1 -
 docs/security/host-certs/overview.md    | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/security/certificate-management.md b/docs/security/certificate-management.md
index 32d05bd40..181e1fdd4 100644
--- a/docs/security/certificate-management.md
+++ b/docs/security/certificate-management.md
@@ -13,7 +13,7 @@ reference guide for several of these tools:
 !!! note
     This is a reference document and not an introduction on how to install CA certificates or
     request host / user certificates. Most users will want the [CA overview](../common/ca.md),
-    [host certificate overview](host-certs.md), or [user certificate overview](user-certs.md) documents.
+    or [host certificate overview](host-certs.md) documents.
 
 
 OSG PKI Command Line Clients
diff --git a/docs/security/host-certs/digicert.md b/docs/security/host-certs/digicert.md
index 879fe19de..c9355f45d 100644
--- a/docs/security/host-certs/digicert.md
+++ b/docs/security/host-certs/digicert.md
@@ -8,7 +8,6 @@ DigiCert IGTF Host Certificates
 
 !!! note
     This document describes how to get **host** certificates.
-    For instructions on how to get **user** certificates, see the [User Certificates document](../user-certs.md).
 
 This document describes how to purchase individual IGTF-accredited host certificates from [DigiCert](https://www.digicert.com/).
 Before purchasing individual certificates, consider the following alternatives:
diff --git a/docs/security/host-certs/overview.md b/docs/security/host-certs/overview.md
index 4ce2b08e1..38f876393 100644
--- a/docs/security/host-certs/overview.md
+++ b/docs/security/host-certs/overview.md
@@ -5,7 +5,6 @@ Host Certificates
 
 !!! note
     This document describes how to get **host** certificates.
-    For instructions on how to get **user** certificates, see the [User Certificates document](../user-certs.md).
 
 Host certificates are [X.509 certificates](https://en.wikipedia.org/wiki/X.509) that are used to securely identify
 servers and to establish encrypted connections between services and clients.

From ca92ad5a93383ee9ac330a1a1b8dcc546bf2b95b Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Fri, 6 May 2022 14:16:41 -0500
Subject: [PATCH 6/6] replace one lcmaps-voms-authentication.md reference (SOFTWARE-5162)

---
 docs/compute-element/covid-19.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/compute-element/covid-19.md b/docs/compute-element/covid-19.md
index 9e150f7b0..3971522ea 100644
--- a/docs/compute-element/covid-19.md
+++ b/docs/compute-element/covid-19.md
@@ -21,7 +21,7 @@ To support COVID-19 work, the overall process includes the following:
     If neither solution is viable, or you'd like to discuss the options, please send email to
     and we'll work with you to arrive at the best solution.
     If you already provide resources through an OSG Hosted CE, skip to [this section](#requesting-covid-19-jobs).
-1. [Enable the OSG VO](../security/lcmaps-voms-authentication.md#configuring-the-lcmaps-voms-plugin) on your HTCondor-CE.
+1. [Enable the OSG VO](install-htcondor-ce.md#configuring-authentication) on your HTCondor-CE.
 1. Set up a job route specific to COVID-19 pilot jobs (documented below).
    The job route will allow you to prioritize these jobs using local policy in your site's cluster.
 1. (Optional) To attract more user jobs, install [CVMFS](../worker-node/install-cvmfs.md) and