# BMW Flask API installation:

update bashrc for qq user:

In [None]:
#run once:

# Append the shell settings below to the user's ~/.bashrc (the file that is
# re-loaded later with ". ~/.bashrc").
# A quoted here-document ('EOF') is used so that nothing inside it ($1, $PATH,
# ${USER}, the embedded double quotes) is expanded or re-parsed at install
# time: the text lands in the file exactly as written here.
cat >> "${HOME}/.bashrc" <<'EOF'

export HISTTIMEFORMAT="[%Y-%m-%d %H:%M:%S] "
# negative values mean "unlimited" (bash >= 4.3)
HISTSIZE=-1; HISTFILESIZE=-1

PS1='\e[37m\D{%H:%M}\e[91m[\e[90m\u@\h \e[33m\w\e[31m]\e[92m\n\$'

alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CmF'
alias lr='ls -ltrh'
# aliases cannot take arguments, so ufind is a function: ufind <name-pattern>
ufind() { find / -name "$1" 2>/dev/null; }

export PATH=$PATH:/home/${USER}/scripts
EOF

In [None]:
# Check OS version:
cat /etc/os-release

(below commands are for RHEL distro)

In [None]:
# install prereqs with elevated user:

# prod:
yum install -y libXcomposite libXcursor libXi libXtst libXrandr alsa-lib mesa-libEGL libXdamage mesa-libGL libXScrnSaver bc

# dev:
subscription-manager repos --enable=codeready-builder-for-rhel-8-x86_64-rpms # required for x11
yum install -y libXcomposite libXcursor libXi libXtst libXrandr alsa-lib mesa-libEGL libXdamage mesa-libGL libXScrnSaver bc xorg-x11-apps xorg-x11-xauth firefox

[Within VDI]
> download <mark>Anaconda3-2020.02-Linux-x86_64.sh</mark> from https://repo.anaconda.com/archive/ 

> download & install WinSCP from http://wuss.bmwgroup.net

> upload Anaconda3-2020.02-Linux-x86_64.sh to <mark>Telegraf host</mark> to /tmp or /home/qqky020/UI/install_files (but first create the folder for that - see below)


[Telegraf host]

In [None]:
# create folders:
mkdir -p "${HOME}/UI/Flask" "${HOME}/UI/install_files"
# give execute rights to the installer
# (run this from the folder the installer was uploaded to: /tmp or ~/UI/install_files):
chmod +x Anaconda3-2020.02-Linux-x86_64.sh
# start the installer:
./Anaconda3-2020.02-Linux-x86_64.sh
# Long press Enter; then "yes"; then "yes" (again)
# when completed - reload shell:
. ~/.bashrc
# install requirements for Flask API (maybe you need to update the path to requirements folder: check "flask_wapi_UAT")
# the glob replaces $(ls) so file names with spaces survive; && skips pip when the cd fails
cd ~/UI/flask_wapi_UAT/requirements && for file in ./*; do pip install "${file}"; done
    # check if all are required (wheel; tar.gz.. some might be duplicates)
. ../.flaskenv
# NOTE(review): the next line sources a file named ../.flask with the argument "run";
# it looks like a leftover of "flask run" (see below) - confirm and remove if so.
. ../.flask run
# dev/test instance
flask run --host 0.0.0.0
# prod
IP="$(hostname -I | awk '{print $1}')"
nohup flask run --host "$IP" &
# to close it type "fg" and press ctrl+c; #or "kill %1" #but be sure that is the only background process that is running!

[VDI]
> open preferred web browser

> open the UI: http://<mark>XX.X.XX.XX</mark>:8000 #replace with correct IP address <mark># note that the firewall port 8000 has to be opened!</mark>

[Telegraf host]

Backend & <b>cronjobs</b>: <mark>monitoring_services.sh</mark>

In [None]:
# create backend script:
vi monitoring_services.sh

##### paste the below // current version = v1.06; 2022.08.18 (Author: Michal Márkus)

In [None]:
#!/bin/bash
# monitoring_services.sh
#
# Checks the monitored services, records their state in log files inside
# $flask_path and derives ticket / uptime / incident files from those logs.


# TIMER -start
# measure runtime of this script
res1=$(date +%s.%N)

# Take the time once and derive the printable form from it, so DATE and
# EPOCHNOW always describe the same instant (no re-parsing of a local-time string).
EPOCHNOW=$(date +%s)
DATE=$(date -d "@${EPOCHNOW}" +'%m/%d/%Y %H:%M:%S')
err_msg="not running @$DATE"
ok_msg="OK @$DATE"

flask_path=/home/qqky020/UI/flask_wapi_UAT
# All log files are written relative to the current directory: stop instead
# of scattering them into whatever directory cron started the script in.
cd "$flask_path" || { echo "cannot cd to $flask_path" >&2; exit 1; }



# Define function to check service status:
#######################################
# Probe every monitored service once and append one line per service to
# <service>_uptime.log in the current directory:
#   "<service> OK @<date>"  or  "<service> not running @<date>"
# Globals read:    DATE, ok_msg, err_msg
# Globals written: grafana_url, grafana_check, grafana_latency, influx_url,
#                  influx_check, influx_latency, telegraf_check,
#                  service (loop variable); harvest_check when
#                  harvest_status is called
# Outputs:         appends to <service>_uptime.log and, for the HTTP probed
#                  services, to <service>_high_latency.log
#######################################
services_check()
{

    # Append the OK / "not running" line of a service to its uptime log.
    #   $1 - service name   $2 - "up" when healthy, anything else when down
    _log_service_state()
    {
        if [[ "$2" == "up" ]]; then
            echo "${1} $ok_msg" >> "${1}_uptime.log"
        else
            echo "${1} $err_msg" >> "${1}_uptime.log"
        fi
    }

    # Evaluate one HTTP probed service and write its log entries.
    #   $1 - service name          $2 - status word returned by the health URL
    #   $3 - status word expected  $4 - latency line (empty when below 1 second)
    # An explicit if/else replaces the former "a && b || c && d || e" chain,
    # which logged "OK" even when the status check had failed.
    _evaluate_http_service()
    {
        if [[ "$2" == "$3" ]]; then
            _log_service_state "$1" up
        else
            _log_service_state "$1" down
        fi
        # keep a record whenever the service is unhealthy or slow
        if [[ "$2" != "$3" || -n "$4" ]]; then
            printf '\n%s\n%s\n\n###\n' "$DATE" "$2" >> "${1}_high_latency.log"
        fi
    }

# ACTIVE_IQ
    # not defined yet

# GRAFANA:
    grafana_status()
    {
        #grafana_url="https://grafana.apps.kynprodocp.bmwgroup.net/api/health"  # PROD (port is 443)
        grafana_url="http://itahdnasrep.bmwgroup.net:3000/api/health"  # UAT
        # checks if status is "ok"; --max-time keeps a hung endpoint from stacking up cron runs
        grafana_check="$(curl -s --max-time 30 "$grafana_url" | grep -oh '[[:alpha:]]*ok[[:alpha:]]*')"
        # non-empty only when the total request time is 1 second or more
        # NOTE(review): this URL differs from grafana_url (/ping/api/health) - confirm it is intended
        grafana_latency=$(curl -s --max-time 30 -w 'Establish Connection: %{time_connect}s\nTTFB: %{time_starttransfer}s\nTotal: %{time_total}s\n' itahdnasrep.bmwgroup.net:3000/ping/api/health | grep -E "Total: [1-9]")
        # log status:
        _evaluate_http_service grafana "$grafana_check" ok "$grafana_latency"
        export grafana_latency
    }

# HARVEST:

# QQ USER NEEDS TO BE ADDED TO REMOTE HOST & set up PWLESS SSH (or info needs to be posted to this host via Ansible...)
    harvest_status()
    {
        harvest_check="$(ssh -tt michal@itahdnasuathar.bmwgroup.net 'systemctl status harvest')"  # to be replaced with qq user!
        # the unit counts as up when its status output reports "running"
        if grep -q 'running' <<< "$harvest_check"; then
            _log_service_state harvest up
        else
            _log_service_state harvest down
        fi
    }

# INFLUX:
    influx_status()
    {
        #influx_url="https://influxdb.apps.kynprodocp.bmwgroup.net/health"  # PROD (port is 443)
        influx_url="http://itahdnasrep.bmwgroup.net:8086/health"  # UAT
        # check if status is "pass"
        influx_check="$(curl -s --max-time 30 "$influx_url" | grep status | grep -oh '[[:alpha:]]*pass[[:alpha:]]*')"
        # non-empty only when the total request time is 1 second or more
        influx_latency=$(curl -s --max-time 30 -w 'Establish Connection: %{time_connect}s\nTTFB: %{time_starttransfer}s\nTotal: %{time_total}s\n' itahdnasrep.bmwgroup.net:8086 | grep -E "Total: [1-9]")
        _evaluate_http_service influx "$influx_check" pass "$influx_latency"
        export influx_latency
    }

# NodeRed
    # not defined yet

# TELEGRAF:
    telegraf_status()
    {
        # number of telegraf units listed by systemctl that are NOT running
        telegraf_check="$(systemctl | grep telegraf | sort -u | grep -vc running)"
        if [[ "$telegraf_check" -eq 0 ]]; then
            _log_service_state telegraf up
        else
            _log_service_state telegraf down
        fi
        export telegraf_check
    }

# LOOP OVER SERVICES:
    for service in grafana influx telegraf; do  #active_iq harvest nodered
        "${service}_status"
    done
}



# send ticket if service is down for 5 consecutive minutes
#######################################
# For every monitored service: when none of the last five entries of
# <service>_uptime.log is an OK entry, either write a ticket file (all five
# are "not running" entries at most 360 seconds old) or print
# "<DATE> <service> - OK".  Nothing is written or printed for a service with
# an OK entry among its last five.
# Globals read:    DATE, EPOCHNOW, grafana_check, grafana_latency,
#                  influx_check, influx_latency, telegraf_check
# Globals written: service, eventID, resource, state, latency, severity,
#                  header, message, c, file
# Outputs:         <service>_monitoring_ticket_<YYYYmmddHHMM>.json in the
#                  current directory (header line + one comma separated line)
#######################################
ticket()
{

    # ticket details of the service named in $service
    # (formerly an if/elif chain on "service=grafana", which is an assignment
    # and therefore always picked the grafana branch)
    ticket_details()
    {
        eventID="123456..."
        resource="itahdnasrep"  # UAT
        severity="1"
        header="Date, Service, Status, EventID, Resource, Severity"
        case "$service" in
            grafana)
                #state=$(serviceup5min || echo  "OK")
                state=$grafana_check
                latency=$grafana_latency
                ;;
            influx)
                state=$influx_check
                latency=$influx_latency
                ;;
            telegraf)
                state=$telegraf_check
                #telegraf_jobs="$telegraf_check"
                latency=""
                ;;
            #...
            *)
                return 1
                ;;
        esac
        message="$DATE, $service, $state, $eventID, $resource, $severity"
    }


    # Send info to NodeRed when service is down:
    # Count in $c how many of the last five log entries are "not running"
    # entries that are at most 360 seconds old.
    # Returns 1 as soon as an OK entry is found among them, 0 otherwise.
    serviceup5min()
    {
        local -a recent=()
        local entry stamp epoch_dat
        c=0
        if [[ -r "${service}_uptime.log" ]]; then
            mapfile -t recent < <(tail -n 5 -- "${service}_uptime.log")
        fi
        for entry in "${recent[@]}"; do
            case "$entry" in
                *" OK @"*)
                    return 1
                    ;;
                *" not running @"*)
                    stamp=${entry#*@}
                    # less or equal to 360 seconds AKA 6 min (5min +1min grace time due to latency)
                    if epoch_dat=$(date -d "$stamp" +%s 2>/dev/null) \
                        && (( EPOCHNOW - epoch_dat <= 360 )); then
                        c=$((c + 1))
                    fi
                    ;;
            esac
        done
        return 0
    }


    # Write the ticket file once all five entries counted as recent outages.
    create_ticket()
    {
        file=${service}_monitoring_ticket_$(date +%Y%m%d%H%M).json
        if [[ $c -ge 5 ]]; then
            printf '%s\n%s\n' "$header" "$message" > "$file"
        else
            echo "$DATE $service - OK"
        fi
    }

    for service in grafana influx telegraf; do  #harvest #ansible nodered
        ticket_details
        serviceup5min && create_ticket
    done



    send_ticket()
    {
        echo
        # *NEEDS TO BE DISCUSSED*
        # POST json TO NODERED
    }

    #send_ticket

}

#######################################
# GENERATE UPTIME LOGS FOR FLASK
# Derives, in the current directory:
#   telegraf_<sub>_uptime.txt  "<Running|Down> <n> minutes" per telegraf unit
#   <service>_up_since.txt     last outage of grafana / influx
#                              (Down / Up / Outage Time / Uptime)
#   <service>_uptime.txt       "UNKNOWN" when the log holds no outage
#   today.csv, weekly.csv, montly.csv
#                              "not running" log entries of the last
#                              day / week / month
# Globals read:    EPOCHNOW
# Globals written: service, sub (loop variables)
#######################################
manage_logs()
{
    # Uptime of one telegraf unit; the unit suffix is taken from $sub.
    # The absolute "since <timestamp>;" part of the status output is parsed:
    # the relative "... ago" text uses forms such as "1h 30min" that
    # date(1) does not read as a duration.
    telegraf_sub_service_upt()
    {
        local stat r since epoch_since uptime
        #set -x
        stat=$(systemctl status "telegraf_${sub}.service")
        if grep -q "running" <<< "$stat"; then r="Running"; else r="Down"; fi
        since=$(grep -m 1 -Po 'since \K[^;]+' <<< "$stat")
        if [[ -n "$since" ]] && epoch_since=$(date --date="$since" +%s 2>/dev/null); then
            uptime=$(( (EPOCHNOW - epoch_since) / 60 ))  # minutes
        else
            uptime="UNKNOWN"
        fi

        echo "$r $uptime minutes" > "telegraf_${sub}_uptime.txt"
    }

    # Uptime report of the service named in $service.
    service_uptime()  # Grafana & Influx (telegraf: one file per sub service)
    {
        if [[ "$service" = "telegraf" ]]; then
            for sub in broadcom cisco esx storage system traps; do
                telegraf_sub_service_upt
            done
            return
        fi

        # log sample: grafana OK @08/11/2022 17:28:03
        # One chronological pass finds the last outage: its first
        # "not running" entry and the first OK entry that follows it
        # (empty while the service is still down).
        local -a outage=()
        local down up epoch_down epoch_up outage_minutes uptime_minutes
        mapfile -t outage < <(awk '
            /not running @/ { if (!down_run) { first_down = $0; first_up = "" } down_run = 1; next }
            / OK @/         { if (down_run) first_up = $0; down_run = 0 }
            END             { if (first_down != "") { print first_down; print first_up } }
        ' "${service}_uptime.log" 2>/dev/null)
        down=${outage[0]:-}
        up=${outage[1]:-}

        # no outage on record (or unreadable timestamp): nothing to report
        if [[ -z "$down" ]] || ! epoch_down=$(date -d "${down#*@}" +%s 2>/dev/null); then
            echo "UNKNOWN" > "${service}_uptime.txt"
            return
        fi

        if [[ -n "$up" ]] && epoch_up=$(date -d "${up#*@}" +%s 2>/dev/null); then
            outage_minutes=$(( (epoch_up - epoch_down) / 60 ))
            uptime_minutes=$(( (EPOCHNOW - epoch_up) / 60 ))
        else
            # still down: the outage lasts until now and there is no uptime yet
            outage_minutes=$(( (EPOCHNOW - epoch_down) / 60 ))
            uptime_minutes=0
        fi

        printf 'Down: %s\nUp:%s\nOutage Time: %s minutes\nUptime: %s\n' \
            "$down" "$up" "$outage_minutes" "$uptime_minutes" > "${service}_up_since.txt"
    }


    for service in grafana influx telegraf; do  #.....
        service_uptime
    done


    # Collect the "not running" entries of all *uptime.log files that fall
    # into the last day / week / month.  A report file exists only while its
    # range holds at least one incident, so an old report is never left
    # behind as if it were current.
    past_incidents()
    {
        # report file names are kept as they are ("montly.csv" included)
        local -A target=( [days]=today.csv [weeks]=weekly.csv [month]=montly.csv )
        local -A range=()
        local t line y

        for t in days weeks month; do
            range[$t]=$(date -d "-1 ${t}" +%s)
            rm -f -- "incidents_${t}.csv"
        done

        while IFS= read -r line; do
            [[ "$line" == *@* ]] || continue
            # each timestamp is converted once and tested against all ranges
            y=$(date -d "${line#*@}" +%s 2>/dev/null) || continue
            for t in days weeks month; do
                if (( ${range[$t]} <= y )); then
                    printf '%s\n' "$line" >> "incidents_${t}.csv"
                fi
            done
        done < <(cat -- *uptime.log 2>/dev/null | grep -v OK | sort -u)

        for t in days weeks month; do
            if [[ -s "incidents_${t}.csv" ]]; then
                mv -f -- "incidents_${t}.csv" "${target[$t]}"
            else
                rm -f -- "incidents_${t}.csv" "${target[$t]}"
            fi
        done
    }

    # call past incidents subfunction
    past_incidents

}

# Two-level maintenance check:
#        1# local offline query, once a day
#        2# live check right before sending
#        -- I WILL DO THIS WITH A PYTHON SCRIPT... because handling the cockpit data is easier with pandas


# Run main parts of the script:
services_check
ticket
manage_logs


# TIMER STOP (calculate runtime):
# The elapsed wall-clock time (seconds, with fraction) is split into
# days : hours : minutes : seconds, largest unit first; bc is used because
# the timestamps carry a fractional part.
res2=$(date +%s.%N)
elapsed=$(bc <<< "$res2 - $res1")
days=$(bc <<< "$elapsed/86400")
rest=$(bc <<< "$elapsed-86400*$days")
hours=$(bc <<< "$rest/3600")
rest=$(bc <<< "$rest-3600*$hours")
mins=$(bc <<< "$rest/60")
secs=$(bc <<< "$rest-60*$mins")
printf '\n'
printf "script run for: %d:%02d:%02d:%02.4f\n" "$days" "$hours" "$mins" "$secs"
printf '\n'

exit 0


###########################################################################################################
#  PENDING:

#--Sandor: qq user for querying the harvest status
#--Send info to NodeRed
#--Michael Flesh: maintenance (if not as CSV, can he grant access to MSSQL?)
    
###########################################################################################################  


In [None]:
# Edit the crontab via:
crontab -e
# paste the following (this script runs every minute):
*/1 * * * * source ~/.bashrc; /home/qq_XX/monitoring_services.sh

## 2022.08.25: STATUS for P1: `80% #progress`

### P1 (pending items):

> <b>monitoring_services.sh</b> `[1,5%]`: generate_report() --json part; <b>UI</b>: 


> Enable passwordless ssh for qq user; or copy required files ('systemctl status harvest' output; maintenance.csv [resource,service,start,end] from NodeRed to telegraf host) `[5%]`

--currently working on this:
> Maintenance mode `[10%]` ; check Active IQ export; update monitoring_services.sh & UI (*check_service.py subprocesses) for maintenance


> `+5% extra` <mark>UPTIME</mark> reporting via `curl` or telegraf plugin... | backend completed @2022.08.22; need to add UI elements

#### P1 completed items:
> Check/debug: <b>monitoring_services.sh</b> `[1,5%]` @2022.08.09

> infra.html ` [2%]` @2022.08.09

> 'Past Incidents' tab: add uptime `[5%]` @2022.08.18 # completed in terms of backend; need to add to UI as well...

#### P2 plan:
- Ansible jobs 
- Performance metrics/alerts/jobs...
- Add http<mark>s</mark> & cert to flask

#### P3 plan:
- CSS formatting
- Check redundant packages at <mark>requirements</mark> & check which modules are being used, so that on prod we don't have to install anaconda (minimalistic approach)
- Configure <mark>X11</mark> for jupyter on telegraf host

Decommission:

rm -rf ~/anaconda3  # also remove anaconda stuff from ~/.bashrc

### ODBC config for maintenance & reporting

###### - Bash command to connect to the MS SQL Server database (via ODBC):
> (server, username and password is omitted here - replace with real values!)


$cat query_maint
> #!/bin/bash
isql -k "DRIVER={ODBC Driver 18 for SQL Server};SERVER=XXX,1433;UID=XXX;PWD=XXX;Authentication=SqlPassword;TrustServerCertificate=Yes" -v -b -d, -q < /home/qqky020/scripts/maint.sql

$cat maint.sql
> SELECT * FROM BMW_Common_View.monitoring.V_Infrastructure_Maintenance

$query_maint  # output:
> 1,"Component_Grafana","Grafana",2022-08-17 00:00:00.0000000,2022-08-18 00:00:00.0000000,"12324","test chl"



###### - How to export data (to export in html form, add "-w" after isql in the 'query' script file)
query < maint.sql  # >/dev/null 2>&1



###### - Proxy for package installation
> (proxy credentials are omitted here - replace `<user>:<password>` with real values!)

> pip install <package>  --proxy "http://<user>:<password>@192.109.190.88:8080"  #HTTP PROXY

> pip install <package>  --proxy "http://<user>:<password>@192.109.190.88:8080"  #HTTPS PROXY

> pip install <package>  --proxy ".bmwgroup.net" #NOPROXY

#### Requirements (after installing miniconda)
```
sudo yum install unixODBC-devel
sudo yum -y install gcc gcc-c++ kernel-devel
sudo yum -y install python-devel libxslt-devel libffi-devel openssl-devel
# proxy credentials are omitted here - replace <user>:<password> with real values
pip install Flask --proxy "http://<user>:<password>@192.109.190.88:8080"
pip install python-dotenv  --proxy "http://<user>:<password>@192.109.190.88:8080"
pip install pandas --proxy "http://<user>:<password>@192.109.190.88:8080"
pip install pyodbc --proxy "http://<user>:<password>@192.109.190.88:8080"
```

# Tool configurations

# Telegraf

### Info from Florian:
---
```
I prepared the Grid configuration already, so that the Telegraf can start to collect the data. 
Please take care that it can take up to 15 minutes with the initial data collection till it get reflected into the InfluxDB with the “prometheus” _measurements. 

StorageGRID requires a certificate authentication, so in addition I attached you the required certificates. 
Move them in the /etc/telegraf directory or subdirectory (modify tls_ca/tls_ca_cert & tls_key path in this case).

There are 3 configuration parts to be modified / checked. 

#1  modify the common Telegraf config (at the beginning of the config file)
[agent]
   interval = "60s"
   metric_batch_size = 5000
   metric_buffer_limit = 75000


#2  add the Storagegrid Input config
 [[inputs.prometheus]] 
   urls = ['https://10.2.62.68:9091/federate?match%5B%5D=%7Bjob%3D~%22.%2B%22%7D']
   metric_version = 2
   tls_ca = "/etc/telegraf/cacert.pem"
   tls_cert = "/etc/telegraf/cert.pem"
   tls_key = "/etc/telegraf/key.pem"
   insecure_skip_verify = true
   response_timeout = "59s"


#3 check your [[outputs.influxdb_v2]] configuration. 
Telegraf will write the data into the according bucket you set here. 


After this restart the Telegraf (via cmd # sudo systemctl stop telegraf & # sudo systemctl start telegraf). 
15 Minutes after this, the InfluxDB will reflect the StorageGRID data.
``` 

> Source: mail @Fri 22/05/13 13:11


## UAT Telegraf config steps
Telegraf configuration
Telegraf agents located on: <b>ITAHDNASUATTEL</b>
There are 6 instances running (one per service listed below), including:
- SNMP trap receiver
- SNMP query for Cisco Switches
- VM server data receiver
installation folder:
`/etc/telegraf`
each telegraf has its own service
#### systemctl status telegraf_broadcom.service
`/usr/bin/telegraf -config /etc/telegraf/telegraf_broadcom.conf -config-directory /etc/telegraf/telegraf_broadcom`
#### systemctl status telegraf_cisco.service
`/usr/bin/telegraf -config /etc/telegraf/telegraf_cisco.conf -config-directory /etc/telegraf/telegraf_cisco`
#### systemctl status telegraf_esx.service
`/usr/bin/telegraf -config /etc/telegraf/telegraf_esx.conf -config-directory /etc/telegraf/telegraf_esx`
#### systemctl status telegraf_storage.service 
`/usr/bin/telegraf -config /etc/telegraf/telegraf_storage.conf -config-directory /etc/telegraf/telegraf_storage`
#### systemctl status telegraf_traps.service
`/usr/bin/telegraf -config /etc/telegraf/telegraf_traps.conf -config-directory /etc/telegraf/telegraf_traps`
#### systemctl status telegraf_system.service
`/usr/bin/telegraf -config /etc/telegraf/telegraf_system.conf -config-directory /etc/telegraf/telegraf_system`

To receive SNMP traps from AIQ UM, two MIB files must be copied to the configured path; the MIB file names are important:
- NETAPP.MIB
- OCUM.MIB (this is a renamed aiqum_9.9.mib)

# Influx

## Install Influx CLI and Modify <mark>bucket's retention</mark>
Install Influx CLI/Modify bucket's retention:
Download package from the following URL: https://docs.influxdata.com/influxdb/cloud/tools/influx-cli/?t=Windows
Install CLI to VDI: Because we haven't permission on the 'C:\Program Files' folder, need modify the original command:
Ori: Expand-Archive .\influxdb2-client-2.3.0-windows-amd64.zip -DestinationPath 'C:\Program Files\InfluxData' mv 'C:\Program Files\InfluxData\influxdb2-client-2.3.0-windows-amd64' 'C:\Program Files\InfluxData\influx'
Modified: Expand-Archive .\influxdb2-client-2.3.0-windows-amd64.zip -DestinationPath 'C:\InfluxData' mv 'C:\InfluxData\influxdb2-client-2.3.0-windows-amd64' 'C:\InfluxData\influx'
Use Powershell for the following
Before issuing the above command, navigate to the folder where you downloaded the CLI package. For example:
```
cd C:\Users\"USERNAME"\Downloads
mkdir C:\InfluxData
Expand-Archive .\influxdb2-client-2.3.0-windows-amd64.zip -DestinationPath 'C:\InfluxData'
mv 'C:\InfluxData\influxdb2-client-2.3.0-windows-amd64' 'C:\InfluxData\influx'
Navigate to the C:\InfluxData\influx // because we cannot modify the 'path' variable, need to go to the folder where the influx.exe exists
Create an influx CLI's config for the remote host: .\influx config create -a -n CONFIGNAME -u URL -t TOKEN_WHICH_HAS_PROPER_PRIVILEGES -o ORGANIZATION
List bucket's current settings:
PS C:\InfluxData\influx> .\influx.exe bucket list ID Name Retention Shard group duration Organization ID Schema Type 834ba3f797c35789 BroadcomBES 1440h0m0s 24h0m0s f24c8a8d0e5f36e8 implicit f827bf73e326118b CiscoBackend 1440h0m0s 24h0m0s f24c8a8d0e5f36e8 implicit
Modify bucket's retention: Command reference: https://docs.influxdata.com/influxdb/v2.2/organizations/buckets/update-bucket/
.\influx bucket update -i BUCKET_ID -r NEW_RETENTION_TIME
```
Done
