In [None]:
# endpoints.txt (one per line)
# api1.example.com
# api2.example.com
# api3.example.com
# api4.example.com
# api5.example.com
# api6.example.com
# api7.example.com
# api8.example.com
# api9.example.com
# api10.example.com

# Assume these are the p95_ms values returned by each host's /metrics JSON:
# api1: 120, api2: 180, api3: timeout, api4: 150, api5: 90,
# api6: 210, api7: 200, api8: 160, api9: 140, api10: 170
#
# Rule for this exercise:
# - timeouts count as missing data (skip them, but report how many timeouts)
# - ALERT if avg_p95_ms >= threshold_ms
#
# With threshold_ms=160:
# avg of [120,180,150,90,210,200,160,140,170] = 1420/9 = 157.78 => OK
# Output:
# OK: avg_p95_ms=157.8 threshold_ms=160 samples=9 timeouts=1
#
# With threshold_ms=150:
# ALERT (157.8 >= 150)
# ALERT: avg_p95_ms=157.8 threshold_ms=150 samples=9 timeouts=1

In [None]:
import sys
import json
import subprocess

def fetch_metrics(host, timeout="2"):
    # Uses curl to simplify (pretend curl is available)
    url = "https://" + host + "/metrics"
    out = subprocess.check_output(["curl", "-sS", "--max-time", timeout, url], text=True)
    return json.loads(out)

def main(argv):
    if len(argv) != 2:
        raise ValueError("usage: p95mon.py <endpoints_file> <threshold_ms>")

    path = argv[0]
    threshold = argv[1]

    values = []
    timeouts = 0

    with open(path) as f:
        for line in f:
            host = line.strip()
            if not host or host.startswith("#"):
                continue
            try:
                data = fetch_metrics(host)
                p95 = data.get("p95_ms", "0")
                values.append(p95)
            except subprocess.CalledProcessError:
                timeouts += 1

    avg = sum(values) / len(values)

    if avg >= threshold:
        print(f"ALERT: avg_p95_ms={avg:.1f} threshold_ms={threshold} samples={len(values)} timeouts={timeouts}")
    else:
        print(f"OK: avg_p95_ms={avg:.1f} threshold_ms={threshold} samples={len(values)} timeouts={timeouts}")

if __name__ == "__main__":
    main(sys.argv[1:])

In [None]:
# # endpoints.txt (one per line)
# api1.example.com
# api2.example.com
# api3.example.com
# api4.example.com
# api5.example.com
# api6.example.com
# api7.example.com
# api8.example.com
# api9.example.com
# api10.example.com

# Assume these are the p95_ms values returned by each host's /metrics JSON:
# api1: 120, api2: 180, api3: timeout, api4: 150, api5: 90,
# api6: 210, api7: 200, api8: 160, api9: 140, api10: 170
#
# Rule for this exercise:
# - timeouts count as missing data (skip them, but report how many timeouts)
# - ALERT if avg_p95_ms >= threshold_ms
#
# With threshold_ms=160:
# avg of [120,180,150,90,210,200,160,140,170] = 1420/9 = 157.78 => OK
# Output:
# OK: avg_p95_ms=157.8 threshold_ms=160 samples=9 timeouts=1
#
# With threshold_ms=150:
# ALERT (157.8 >= 150)
# ALERT: avg_p95_ms=157.8 threshold_ms=150 samples=9 timeouts=1

import sys
import json
import subprocess

def fetch_metrics(host, timeout="2"): #this takes the host and default value of timeout in "str""
    # Uses curl to simplify (pretend curl is available)
    url = "https://" + host + "/metrics" #string concatination
    
    
    out = subprocess.check_output(["curl", "-sS", "--max-time", timeout, url], text=True) #text=True gives us python string output, 
    #1iter: url="https://api1.example.com/metrics"
    
    #for handling exception from timeouts, it is handled in the main fn
    
    
    return json.loads(out) #returns a output with is a dict

def main(argv):
    if len(argv) != 2: #we are assuming 2 args of length, if not raise error
        raise ValueError("usage: p95mon.py <endpoints_file> <threshold_ms>")

    path = argv[0] #""
    threshold = float(argv[1]) # ""

    values = [] #
    timeouts = 0 #int

    with open(path) as f: #it does not have permissions like r,r+,w
    #(PP1* - write a try, except blocks to handle missing file with a FileNotFound exception)
        for line in f: # api1.example.com
            host = line.strip() #strips empty spaces before and after
            if not host or host.startswith("#"): #this handles/skips empty lines or lines with #
                continue
            try:
                data = fetch_metrics(host) #calling helper function
                # api1: 120, api2: 180, api3: timeout, api4: 150, api5: 90,
                
                
                p95 = data.get("p95_ms", 0) #we are trying to get the value in the key p95, and also given a default value of "0" STR, so I made 0 as int
                
                
                values.append(p95) #value=[120,180,150,190....] #there wont be timeouts in the list because we are handling using except below
            except subprocess.CalledProcessError:
                timeouts += 1 #1 as per assumed input data

    avg = sum(values) / len(values) #here the length could be zero if the list is empty. so do, if len(values) > 0: then do the avg operation

    if avg >= threshold:
        print(f"ALERT: avg_p95_ms={avg:.1f} threshold_ms={threshold} samples={len(values)} timeouts={timeouts}")
        #ALERT: avg_p95_ms=(prints avg with 1 decimal) threshold_ms=(prints threshold given in command) samples=(prints number of values) timeouts=(final number of timeouts)
    else:
        print(f"OK: avg_p95_ms={avg:.1f} threshold_ms={threshold} samples={len(values)} timeouts={timeouts}")
        #similarly like above ALERT comment

if __name__ == "__main__":
    main(sys.argv[1:]) #slice first two args of command and take the rest
    


You’re on the right track here — this one is *exactly* the kind of snippet where datatype discipline matters, and you fixed the most obvious one.

## What you did well ✅

* ✅ Converted `threshold` to `float(argv[1])` (original would compare float to str and crash).
* ✅ Changed default `p95_ms` to numeric `0` (better than `"0"`).
* ✅ Noted the divide-by-zero risk when `values` is empty.

## What you missed (what the interviewer expects)

### 1) Major bug (types inside `values`)

Even after your changes, `p95` might still be a **string** coming from JSON (common: `"p95_ms": "120"`). Then:

```python
avg = sum(values) / len(values)
```

will crash if `values` contains strings, or produce nonsense if mixed.

**Fix:** force numeric:

```python
p95 = float(data.get("p95_ms"))
values.append(p95)
```

and handle missing/null with guard.

### 2) Logic error (timeouts classification)

The code assumes:

```python
except subprocess.CalledProcessError:
    timeouts += 1
```

But `curl --max-time` timeout usually returns exit code **28**, but `CalledProcessError` also happens for many other things (DNS failure, SSL, connection refused depending on flags). So counting *all* CalledProcessError as “timeouts” is a logic error.

**Fix:** check the return code or stderr text and classify properly:

* if returncode == 28 → timeout
* else → failure (could count separately or treat as skip, depending on requirement)

### 3) Edge case where it breaks

You already identified one: **all endpoints fail/timeout** → `len(values)==0` → division by zero.

Another common edge case: invalid JSON (HTML error page, proxy block page) → `json.loads(out)` raises `JSONDecodeError` (not caught).

## One more small thing

In your final comment:

```python
main(sys.argv[1:])  # slice first two args
```

That comment is slightly off: `sys.argv[1:]` removes only the program name, not “first two args.” (Minor, but avoid confusing yourself.)

---

## Rating: **7/10**

You fixed the `threshold` type issue and spotted empty-list risk. To hit 8–9, you need to consistently:

* cast JSON numbers to float/int,
* distinguish timeout vs other curl failures,
* handle invalid JSON.

---

## What a clean “TSE answer” would look like (high-level patch)

* Convert threshold to float ✅
* Convert p95 to float (and skip if missing)
* Handle:

  * `CalledProcessError` returncode 28 as timeout, others as `failures`
  * `JSONDecodeError`
* Guard zero samples

Example minimal change set:

```python
threshold = float(argv[1])

...
try:
    data = fetch_metrics(host)
    p95_raw = data.get("p95_ms", None)
    if p95_raw is None:
        continue
    values.append(float(p95_raw))
except subprocess.CalledProcessError as e:
    if e.returncode == 28:
        timeouts += 1
    else:
        failures += 1
except json.JSONDecodeError:
    failures += 1

if not values:
    print(f"ALERT: no_samples threshold_ms={threshold} timeouts={timeouts}")
    return
```

---

If you want the next one (same difficulty, still network/TSE), I’ll give a snippet where the bug is **HTTPError vs URLError ordering + mis-parsing status codes**, with ~12 inputs and a threshold calculation again.
