Skip to content

Commit

Permalink
mac80211: add AQL support for broadcast/multicast packets
Browse files Browse the repository at this point in the history
Should improve performance/reliability with lots of mcast packets

Signed-off-by: Felix Fietkau <nbd@nbd.name>
  • Loading branch information
nbd168 committed Feb 21, 2024
1 parent fb45887 commit 95e633e
Showing 1 changed file with 226 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 9 Feb 2024 19:43:40 +0100
Subject: [PATCH] mac80211: add AQL support for broadcast packets

Excessive broadcast traffic with little competing unicast traffic can easily
flood hardware queues, leading to throughput issues. Additionally, filling
the hardware queues with too many packets breaks FQ for broadcast data.
Fix this by enabling AQL for broadcast packets.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3324,6 +3324,7 @@ enum wiphy_params_flags {
/* The per TXQ device queue limit in airtime */
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
+#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000

/* The per interface airtime threshold to switch to lower queue limit */
#define IEEE80211_AQL_THRESHOLD 24000
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -215,11 +215,13 @@ static ssize_t aql_pending_read(struct f
"VI %u us\n"
"BE %u us\n"
"BK %u us\n"
+ "BC/MC %u us\n"
"total %u us\n",
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VO]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VI]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BE]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BK]),
+ atomic_read(&local->aql_bc_pending_airtime),
atomic_read(&local->aql_total_pending_airtime));
return simple_read_from_buffer(user_buf, count, ppos,
buf, len);
@@ -245,7 +247,8 @@ static ssize_t aql_txq_limit_read(struct
"VO %u %u\n"
"VI %u %u\n"
"BE %u %u\n"
- "BK %u %u\n",
+ "BK %u %u\n"
+ "BC/MC %u\n",
local->aql_txq_limit_low[IEEE80211_AC_VO],
local->aql_txq_limit_high[IEEE80211_AC_VO],
local->aql_txq_limit_low[IEEE80211_AC_VI],
@@ -253,7 +256,8 @@ static ssize_t aql_txq_limit_read(struct
local->aql_txq_limit_low[IEEE80211_AC_BE],
local->aql_txq_limit_high[IEEE80211_AC_BE],
local->aql_txq_limit_low[IEEE80211_AC_BK],
- local->aql_txq_limit_high[IEEE80211_AC_BK]);
+ local->aql_txq_limit_high[IEEE80211_AC_BK],
+ local->aql_txq_limit_bc);
return simple_read_from_buffer(user_buf, count, ppos,
buf, len);
}
@@ -279,6 +283,11 @@ static ssize_t aql_txq_limit_write(struc
else
buf[count] = '\0';

+ if (sscanf(buf, "mcast %u", &q_limit_low) == 1) {
+ local->aql_txq_limit_bc = q_limit_low;
+ return count;
+ }
+
if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
return -EINVAL;

--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1328,10 +1328,12 @@ struct ieee80211_local {
spinlock_t handle_wake_tx_queue_lock;

u16 airtime_flags;
+ u32 aql_txq_limit_bc;
u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
u32 aql_threshold;
atomic_t aql_total_pending_airtime;
+ atomic_t aql_bc_pending_airtime;
atomic_t aql_ac_pending_airtime[IEEE80211_NUM_ACS];

const struct ieee80211_ops *ops;
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -788,6 +788,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_
spin_lock_init(&local->rx_path_lock);
spin_lock_init(&local->queue_stop_reason_lock);

+ local->aql_txq_limit_bc = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC;
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
INIT_LIST_HEAD(&local->active_txqs[i]);
spin_lock_init(&local->active_txq_lock[i]);
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2341,28 +2341,27 @@ void ieee80211_sta_update_pending_airtim
struct sta_info *sta, u8 ac,
u16 tx_airtime, bool tx_completed)
{
+ atomic_t *counter;
int tx_pending;

if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
return;

- if (!tx_completed) {
- if (sta)
- atomic_add(tx_airtime,
- &sta->airtime[ac].aql_tx_pending);
+ if (sta)
+ counter = &sta->airtime[ac].aql_tx_pending;
+ else
+ counter = &local->aql_bc_pending_airtime;

+ if (!tx_completed) {
+ atomic_add(tx_airtime, counter);
atomic_add(tx_airtime, &local->aql_total_pending_airtime);
atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]);
return;
}

- if (sta) {
- tx_pending = atomic_sub_return(tx_airtime,
- &sta->airtime[ac].aql_tx_pending);
- if (tx_pending < 0)
- atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
- tx_pending, 0);
- }
+ tx_pending = atomic_sub_return(tx_airtime, counter);
+ if (tx_pending < 0)
+ atomic_cmpxchg(counter, tx_pending, 0);

atomic_sub(tx_airtime, &local->aql_total_pending_airtime);
tx_pending = atomic_sub_return(tx_airtime,
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3958,9 +3958,8 @@ begin:
encap_out:
IEEE80211_SKB_CB(skb)->control.vif = vif;

- if (tx.sta &&
- wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
- bool ampdu = txq->ac != IEEE80211_AC_VO;
+ if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
+ bool ampdu = txq->sta && txq->ac != IEEE80211_AC_VO;
u32 airtime;

airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
@@ -4026,6 +4025,7 @@ struct ieee80211_txq *ieee80211_next_txq
struct ieee80211_txq *ret = NULL;
struct txq_info *txqi = NULL, *head = NULL;
bool found_eligible_txq = false;
+ bool aql_check;

spin_lock_bh(&local->active_txq_lock[ac]);

@@ -4049,26 +4049,26 @@ struct ieee80211_txq *ieee80211_next_txq
if (!head)
head = txqi;

+ aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
+ if (aql_check)
+ found_eligible_txq = true;
+
if (txqi->txq.sta) {
struct sta_info *sta = container_of(txqi->txq.sta,
struct sta_info, sta);
- bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
- s32 deficit = ieee80211_sta_deficit(sta, txqi->txq.ac);
-
- if (aql_check)
- found_eligible_txq = true;
-
- if (deficit < 0)
+ if (ieee80211_sta_deficit(sta, txqi->txq.ac) < 0) {
sta->airtime[txqi->txq.ac].deficit +=
sta->airtime_weight << AIRTIME_QUANTUM_SHIFT;
-
- if (deficit < 0 || !aql_check) {
- list_move_tail(&txqi->schedule_order,
- &local->active_txqs[txqi->txq.ac]);
- goto begin;
+ aql_check = false;
}
}

+ if (!aql_check) {
+ list_move_tail(&txqi->schedule_order,
+ &local->active_txqs[txqi->txq.ac]);
+ goto begin;
+ }
+
if (txqi->schedule_round == local->schedule_round[ac])
goto out;

@@ -4133,7 +4133,8 @@ bool ieee80211_txq_airtime_check(struct
return true;

if (!txq->sta)
- return true;
+ return atomic_read(&local->aql_bc_pending_airtime) <
+ local->aql_txq_limit_bc;

if (unlikely(txq->tid == IEEE80211_NUM_TIDS))
return true;
@@ -4182,15 +4183,15 @@ bool ieee80211_txq_may_transmit(struct i

spin_lock_bh(&local->active_txq_lock[ac]);

- if (!txqi->txq.sta)
- goto out;
-
if (list_empty(&txqi->schedule_order))
goto out;

if (!ieee80211_txq_schedule_airtime_check(local, ac))
goto out;

+ if (!txqi->txq.sta)
+ goto out;
+
list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac],
schedule_order) {
if (iter == txqi)

138 comments on commit 95e633e

@nxhack
Copy link
Contributor

@nxhack nxhack commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, after applying this patch, the same issue occurred.
95e633e#commitcomment-139794215

GL-MT6000 MT7986AV

==> /sys/kernel/debug/ieee80211/phy0/aql_pending <==
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us

==> /sys/kernel/debug/ieee80211/phy1/aql_pending <==
AC     AQL pending
VO     0 us
VI     0 us
BE     1464 us
BK     0 us
BC/MC  0 us
total  1464 us

I went out with my iPhone and when I came back, the value increased. For the sake of experimentation, I went out again in the same way, turned off the iPhone's Wi-Fi on the way back, and when I returned home, the values increased again. It seems possible that this behavior occurs when the iPhone goes out of range with the Wi-Fi connected.

@nxhack
Copy link
Contributor

@nxhack nxhack commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Continuation of the experiment. The iPhone was placed in a shielded case to prevent radio waves from reaching it. The values changed as expected.

==> /sys/kernel/debug/ieee80211/phy0/aql_pending <==
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us

==> /sys/kernel/debug/ieee80211/phy1/aql_pending <==
AC     AQL pending
VO     0 us
VI     0 us
BE     8080 us
BK     0 us
BC/MC  0 us
total  8080 us

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rany2, could you please post a full crash log? I only see the stack trace, but not the context that comes before it.

@nbd168 well, those patches did nothing; I guess it was just a coincidence that it lasted for this long. For context about all those wlX-apY.Z, I have per_sta_vif enabled.

Panic#2 Part1
<6>[25709.572040] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[25709.578218] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[25709.584344] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered allmulticast mode
<6>[25709.591704] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered promiscuous mode
<6>[25709.598964] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[25709.605056] br-vlan1: port 6(wl0-ap0.4097) entered forwarding state
<6>[25840.966772] br-vlan200: port 4(wl0-ap0.4098) entered disabled state
<6>[25841.006235] mt7622-wmac 18000000.wmac wl0-ap0.4098 (unregistering): left allmulticast mode
<6>[25841.014524] mt7622-wmac 18000000.wmac wl0-ap0.4098 (unregistering): left promiscuous mode
<6>[25841.022733] br-vlan200: port 4(wl0-ap0.4098) entered disabled state
<6>[25841.819641] br-vlan200: port 2(wl0-ap0.4098) entered blocking state
<6>[25841.825972] br-vlan200: port 2(wl0-ap0.4098) entered disabled state
<6>[25841.832276] mt7622-wmac 18000000.wmac wl0-ap0.4098: entered allmulticast mode
<6>[25841.839629] mt7622-wmac 18000000.wmac wl0-ap0.4098: entered promiscuous mode
<6>[25841.846836] br-vlan200: port 2(wl0-ap0.4098) entered blocking state
<6>[25841.853098] br-vlan200: port 2(wl0-ap0.4098) entered forwarding state
<6>[26013.875018] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26013.934657] mt7622-wmac 18000000.wmac wl0-ap0.4097 (unregistering): left allmulticast mode
<6>[26013.942941] mt7622-wmac 18000000.wmac wl0-ap0.4097 (unregistering): left promiscuous mode
<6>[26013.951180] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26014.689948] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[26014.696085] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26014.702206] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered allmulticast mode
<6>[26014.709567] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered promiscuous mode
<6>[26014.716810] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[26014.722898] br-vlan1: port 6(wl0-ap0.4097) entered forwarding state
<6>[26099.844793] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26099.883994] mt7622-wmac 18000000.wmac wl0-ap0.4097 (unregistering): left allmulticast mode
<6>[26099.892301] mt7622-wmac 18000000.wmac wl0-ap0.4097 (unregistering): left promiscuous mode
<6>[26099.900567] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26100.652621] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[26100.658760] br-vlan1: port 6(wl0-ap0.4097) entered disabled state
<6>[26100.664914] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered allmulticast mode
<6>[26100.672258] mt7622-wmac 18000000.wmac wl0-ap0.4097: entered promiscuous mode
<6>[26100.679449] br-vlan1: port 6(wl0-ap0.4097) entered blocking state
<6>[26100.685548] br-vlan1: port 6(wl0-ap0.4097) entered forwarding state
<1>[26100.690706] Unable to handle kernel read from unreadable memory at virtual address 0000000000000000
<1>[26100.700856] Mem abort info:
<1>[26100.703659]   ESR = 0x0000000096000005
<1>[26100.707400]   EC = 0x25: DABT (current EL), IL = 32 bits
<1>[26100.712703]   SET = 0, FnV = 0
<1>[26100.715755]   EA = 0, S1PTW = 0
<1>[26100.718887]   FSC = 0x05: level 1 translation fault
<1>[26100.723761] Data abort info:
<1>[26100.726631]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
<1>[26100.732106]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
<1>[26100.737163]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
<1>[26100.742467] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000043cc3000
<1>[26100.748906] [0000000000000000] pgd=0800000043cc7003, p4d=0800000043cc7003, pud=0800000043cc7003, pmd=0000000000000000
<0>[26100.759522] Internal error: Oops: 0000000096000005 [#1] SMP
<7>[26100.765087] Modules linked in: pppoe ppp_async nft_fib_inet nf_flow_table_inet wireguard pppox ppp_generic nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject_bridge nft_reject nft_redir nft_quota nft_numgen nft_nat nft_meta_bridge nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack_bridge nf_conntrack mt7915e(O) mt7615e(O) mt7615_common(O) mt76_connac_lib(O) mt76(O) mac80211(O) libchacha20poly1305 chacha_neon cfg80211(O) slhc poly1305_neon nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 libcurve25519_generic libcrc32c libchacha hwmon compat(O) ip_tunnel vxlan udp_tunnel ip6_udp_tunnel sha512_arm64 seqiv geniv leds_gpio gpio_button_hotplug(O)
<7>[26100.834518] CPU: 0 PID: 939 Comm: napi/phy0-5 Tainted: G           O       6.6.20 #0
<7>[26100.842255] Hardware name: Xiaomi Redmi Router AX6S (DT)
<7>[26100.847557] pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
<7>[26100.854513] pc : eth_type_trans+0x44/0x198
<7>[26100.858608] lr : ieee80211_sta_ps_transition+0x8e8/0x1d74 [mac80211]
<7>[26100.865012] sp : ffffffc080f6ba90
<7>[26100.868317] x29: ffffffc080f6ba90 x28: ffffff8002433bc8 x27: 0000000000000001
<7>[26100.875448] x26: ffffff8003486600 x25: ffffff80056ac852 x24: 0000000000000000
<7>[26100.882580] x23: ffffff80056acec0 x22: ffffffc080f6bc30 x21: ffffffc080f6bb48
<7>[26100.889712] x20: ffffff8005782800 x19: ffffff8003486600 x18: 0000000000000000
<7>[26100.896843] x17: ffffffbf8f3d1000 x16: ffffffc080000000 x15: 0000000000000000
<7>[26100.903975] x14: 00043251b88711f0 x13: 0000000000000000 x12: 0000000000000002
<7>[26100.911106] x11: 0000000000000000 x10: ffffff80034b8735 x9 : ffffff8003486650
<7>[26100.918238] x8 : 0000000000010d66 x7 : ffffff80056ac852 x6 : 0000000097f99896
<7>[26100.925369] x5 : 0000000000000007 x4 : ffffff80056ac852 x3 : 39d02aba97912000
<7>[26100.932500] x2 : ffffff8003486600 x1 : ffffff800680c000 x0 : 0000000000000000
<7>[26100.939632] Call trace:
<7>[26100.942070]  eth_type_trans+0x44/0x198
<7>[26100.945813]  ieee80211_rx_list+0x2bc/0xb60 [mac80211]
<7>[26100.950891]  mt76_rx_complete+0x1c8/0x27c [mt76]
<7>[26100.955518]  mt76_rx_poll_complete+0x2f0/0x4e0 [mt76]
<7>[26100.960576]  mt76_dma_rx_poll+0x2b4/0x7e0 [mt76]
<7>[26100.965200]  mt7615_unregister_device+0x374/0x490 [mt7615e]
<7>[26100.970777]  __napi_poll+0x34/0x184
<7>[26100.974264]  napi_threaded_poll+0xac/0x1d4
<7>[26100.978355]  kthread+0xe0/0xec
<7>[26100.981404]  ret_from_fork+0x10/0x20
<0>[26100.984977] Code: 91003880 f9006440 f9418820 f9400083 (f9400000) 
<4>[26100.991062] ---[ end trace 0000000000000000 ]---
<3>[26101.007640] pstore: backend (ramoops) writing error (-28)
<0>[26101.013034] Kernel panic - not syncing: Oops: Fatal exception in interrupt
<2>[26101.019899] SMP: stopping secondary CPUs
<0>[26101.023814] Kernel Offset: disabled
<0>[26101.027293] CPU features: 0x0,00000004,00000000,0000400b
<0>[26101.032596] Memory Limit: none

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 No idea if this is helpful, but it takes minimal effort on my part

(gdb) l *ieee80211_rx_list+0x2bc
0x2e15c is in ieee80211_rx_list (../mac80211-regular/backports-6.6.15/net/mac80211/rx.c:5084).
5079            fast_rx = rcu_dereference(rx.sta->fast_rx);
5080            if (!fast_rx)
5081                    goto drop;
5082
5083            ieee80211_rx_8023(&rx, fast_rx, skb->len);
5084            return;
5085
5086    drop:
5087            dev_kfree_skb(skb);
5088    }
(gdb) l *mt76_rx_complete+0x1c8
0x5d54 is in mt76_rx_complete (../mt76-2024-02-03-6124ea91/mac80211.c:1374).
1369                    skb_shinfo(skb)->frag_list = NULL;
1370                    mt76_rx_convert(dev, skb, &hw, &sta);
1371                    ieee80211_rx_list(hw, sta, skb, &list);
1372
1373                    /* subsequent amsdu frames */
1374                    while (nskb) {
1375                            skb = nskb;
1376                            nskb = nskb->next;
1377                            skb->next = NULL;
1378
(gdb) l *mt76_rx_poll_complete+0x2f0
0x60f8 is in mt76_rx_poll_complete (../mt76-2024-02-03-6124ea91/mac80211.c:1413).
1408                    else
1409                            mt76_rx_aggr_reorder(skb, &frames);
1410            }
1411
1412            mt76_rx_complete(dev, &frames, napi);
1413    }
1414    EXPORT_SYMBOL_GPL(mt76_rx_poll_complete);
1415
1416    static int
1417    mt76_sta_add(struct mt76_phy *phy, struct ieee80211_vif *vif,
(gdb) l *mt76_dma_rx_poll+0x2b4
0x2d4c is in mt76_dma_rx_poll (../mt76-2024-02-03-6124ea91/dma.c:956).
951
952             do {
953                     cur = mt76_dma_rx_process(dev, &dev->q_rx[qid], budget - done);
954                     mt76_rx_poll_complete(dev, qid, napi);
955                     done += cur;
956             } while (cur && done < budget);
957
958             rcu_read_unlock();
959
960             if (done < budget && napi_complete(napi))
(gdb) l *mt7615_unregister_device+0x374
0xca8 is in mt7615_poll_rx (../mt76-2024-02-03-6124ea91/mt7615/dma.c:99).
94              if (!mt76_connac_pm_ref(&dev->mphy, &dev->pm)) {
95                      napi_complete(napi);
96                      queue_work(dev->mt76.wq, &dev->pm.wake_work);
97                      return 0;
98              }
99              done = mt76_dma_rx_poll(napi, budget);
100             mt76_connac_pm_unref(&dev->mphy, &dev->pm);
101
102             return done;
103     }

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like the line numbers don't make much sense, should I compile with -O0 so they make sense?

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rany2 the line numbers make perfect sense and this helped me a lot to get some context for this bug.
Please test if this mac80211 patch fixes the crash: https://nbd.name/p/f42fb22e

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 I will test now, thank you so much for your patience with me :)

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise. Without the help I'm getting from you guys, this would take much longer to resolve.
@rany2, @Fail-Safe, @nxhack - here's another attempt at resolving the pending tx airtime issue (mac80211 patch): https://nbd.name/p/5b0b4a4a
It's based on the theory that maybe packets aren't getting stuck in hardware at all, but there might be a race condition on counting completed airtime vs deleting stations.

@Fail-Safe
Copy link
Contributor

@Fail-Safe Fail-Safe commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 Always happy to help out! I'm grabbing the new pending tx airtime patch and will get to testing it in a few minutes. Thanks!

Just to confirm, you want us to also keep yesterday's patch (https://nbd.name/p/005d44af) in place in addition to this new one, yes?

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 I'll give it a spin now and will give a status update after a day or so if all goes well (or sooner if it just didn't help). Thanks again for the effort you're putting into this; I'm more than happy to help you out with testing :)

@graysky2
Copy link
Contributor

@graysky2 graysky2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, you want us to also keep yesterday's patch (https://nbd.name/p/005d44af) in place in addition to this new one, yes?

  1. Actually, is the correct test condition to apply all 3 patches? https://nbd.name/p/005d44af and https://nbd.name/p/f42fb22e and https://nbd.name/p/5b0b4a4a
  2. Is it right to simply build HEAD of openwrt dropping each of these three into /package/kernel/mt76/patches/? I believe the build system will just apply them when it compiles mt76 right?

@Fail-Safe
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2. Is it right to simply build HEAD of openwrt dropping each of these three into /package/kernel/mt76/patches/? I believe the build system will just apply them when it compiles mt76 right?

FWIW, I am running all three patches at this point. This one (https://nbd.name/p/005d44af) I placed in package/kernel/mt76/patches/ and the other two I placed into package/kernel/mac80211/patches/subsys/.

@graysky2
Copy link
Contributor

@graysky2 graysky2 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @Fail-Safe - I see now that the last two needed to be applied to mac80211

wget https://nbd.name/p/005d44af -O package/kernel/mt76/patches/200-005d44af.patch
wget https://nbd.name/p/f42fb22e -O package/kernel/mac80211/patches/subsys/800-f42fb22e.patch
wget https://nbd.name/p/5b0b4a4a -O package/kernel/mac80211/patches/subsys/801-5b0b4a4a.patch

@Fail-Safe
Copy link
Contributor

@Fail-Safe Fail-Safe commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here's another attempt at resolving the pending tx airtime issue (mac80211 patch): https://nbd.name/p/5b0b4a4a
It's based on the theory that maybe packets aren't getting stuck in hardware at all, but there might be a race condition on counting completed airtime vs deleting stations.

@nbd168 I'm seeing much better behavior overall with the latest patch (5b0b4a4a) in place. However, something is still going on where phy1 on both MT6000s seems to be hanging on to 50k+ for BC/MC for some amount of seconds before clearing out and heading back to 0. Then the behavior repeats--back upwards of 50k for several seconds before dropping back to 0.

image

Is this due to 5ghz devices and power saving modes on the STAs?


Just to level-set on where I stand, especially for those just coming into the thread:

Device: 2x GL-MT6000 (MT7986)
Running snapshot build: r25541-af860c4dbf
Patches:

With this current combination of patches, the major latency spikes I was seeing (300+ ms, at times upwards of multiple seconds) are no longer occurring, and generally (other than my notes above regarding BC/MC on phy1) there are no longer stuck packets in aql_pending.

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Fail-Safe Thanks for reporting back. I'm pretty sure that the stuck broadcast airtime (which I can also reproduce myself) is a pre-existing bug that was simply hidden. Before my AQL changes, it simply wasn't counted as pending airtime at all. There were quite a few reports over time that could be explained by broadcast packets not getting transmitted properly anymore (showing up as things like ARP failure, or failure to ping wifi clients from LAN).
I've figured out that the issues are triggered by beacon updates after client connect/disconnect, but so far I haven't been able to figure out a solution. I probably need to contact MTK about this. If I skip beacon updates, the issue disappears.

@graysky2 You can skip the mt76 patch - I don't think it makes any difference based on reports so far.

By the way, I've pushed some more fixes from the MTK tree to mt76 master. I will update the mt76 package in OpenWrt soon.

@graysky2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @nbd168 - I am currently running the latest HEAD plus the 5 patches @Fail-Safe called out several posts above. I will post back in the OpenWrt thread with the result to keep this conversation cleaner since I do not know if the bug reported here is the same one affecting me.

@graysky2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168

By the way, I've pushed some more fixes from the MTK tree to mt76 master. I will update the mt76 package in OpenWrt soon.

Once you update the OpenWrt tree, which, if any, of the aforementioned patches do you recommend I apply?

@Fail-Safe
Copy link
Contributor

@Fail-Safe Fail-Safe commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@graysky2 I know I'm not @nbd168, but FWIW I'll share what I've done.

I upgraded my snapshot build to pick up the mt76 repo updates that Felix pushed, up through openwrt/mt76@b4a9174.

From what I can tell, his updates include the 0016 and 0017 patches that Bo Jiao introduced, so I've removed them. I also removed the additional mt76 patch (005d44af) that Felix said could be dropped per:

You can skip the mt76 patch - I don't think it makes any difference based on reports so far.

I am still running the two mac80211 patches, f42fb22e and 5b0b4a4a.


So, to summarize what I am running right now, and is working well:

@richardkendi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@graysky2 I know I'm not @nbd168, but FWIW I'll share what I've done.

I upgraded my snapshot build to pick up the mt76 repo updates that Felix pushed, up through openwrt/mt76@b4a9174.

From what I can tell, his updates include the 0016 and 0017 patches that Bo Jiao introduced, so I've removed them. I also removed the additional mt76 patch (005d44af) that Felix said could be dropped per:

You can skip the mt76 patch - I don't think it makes any difference based on reports so far.

I am still running the two mac80211 patches, f42fb22e and 5b0b4a4a.

So, to summarize what I am running right now, and is working well:

Could you share your firmware so I can test it on my MT6000?

@nxhack
Copy link
Contributor

@nxhack nxhack commented on 95e633e Mar 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Excellent. @nbd168

I put my iPhone in the radio shield and tested it. No problem.
Thank you.

@rany2
Copy link
Contributor

@rany2 rany2 commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So far it seems promising on my end. I think the (unrelated) per_sta_vif kernel panic is solved (https://nbd.name/p/f42fb22e) and I haven't had any issues with stuck packets after applying https://nbd.name/p/5b0b4a4a.

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for testing. Both mac80211 fixes are in OpenWrt now.

@PussAzuki
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it now possible to just recompile the firmware? Or do I need to wait a little longer for your tests? 🤔

@Fail-Safe
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PussAzuki I would tend to say with Felix’s commits in tree now, it would be a good time to recompile an image:

163c87d
dea42f6

I don’t see a commit yet for syncing up to HEAD on the mt76 repo. But I’m sure that will be coming in due time.

@Gingernut1978
Copy link
Contributor

@Gingernut1978 Gingernut1978 commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168

Sorry for hijacking, but just to inform you that I had to revert the mt76 b4a917417c856307fe19fb6a1a2819e3270df22e commit.

Gives a mt798-mac: probe of 18000000.wifi failed with error -2 on my Asus filogic mt7986 device.

Thank you for your support.

@Fail-Safe
Copy link
Contributor

@Fail-Safe Fail-Safe commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gives a mt798-mac: probe of 18000000.wifi failed with error -2 on my Asus filogic mt7986 device.

Are you seeing any bad behavior from that error, or just noting the presence of it?

I also see these messages on both of my MT6000s (mt7986) a little after a reboot, but have found them to be benign to this point:

AP1:

...
[   22.712250] br-lan: port 11(phy1-ap1) entered forwarding state
[   23.667338] mt7530-mdio mdio-bus:1f lan4: Link is Up - 1Gbps/Full - flow control off
[   23.675104] br-lan: port 5(lan4) entered blocking state
[   23.680340] br-lan: port 5(lan4) entered forwarding state
[   24.666988] mt7530-mdio mdio-bus:1f lan5: Link is Up - 100Mbps/Full - flow control off
[   24.674945] br-lan: port 6(lan5) entered blocking state
[   24.680177] br-lan: port 6(lan5) entered forwarding state
[  284.480589] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.489473] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.498338] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.506080] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.513821] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.522703] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.530455] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.538184] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.547064] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.554808] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.562543] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.571413] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.579155] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.586891] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  310.044086] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  310.051837] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  310.059592] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  310.067354] mt798x-wmac 18000000.wifi phy1-ap0: failed (err=-2) to del object (id=3)
[  310.075093] mt798x-wmac 18000000.wifi phy0-ap0: failed (err=-2) to del object (id=3)
[  310.082844] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  310.090574] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  310.098304] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  310.107185] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  310.114959] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)

AP2:

...
[   22.744490] br-lan: port 11(phy1-ap1) entered disabled state
[   22.778723] br-lan: port 11(phy1-ap1) entered blocking state
[   22.784379] br-lan: port 11(phy1-ap1) entered disabled state
[   22.790309] device phy1-ap1 entered promiscuous mode
[   22.880074] IPv6: ADDRCONF(NETDEV_CHANGE): phy1-ap1: link becomes ready
[   22.886983] br-lan: port 11(phy1-ap1) entered blocking state
[   22.892641] br-lan: port 11(phy1-ap1) entered forwarding state
[  284.471469] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.480356] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.489208] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.496945] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.504679] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.513545] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.521281] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.529007] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.537875] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.545608] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.553344] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  284.562215] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  284.569945] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  284.577682] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  289.586367] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  289.594112] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  289.601860] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  289.609630] mt798x-wmac 18000000.wifi phy1-ap0: failed (err=-2) to del object (id=3)
[  289.617374] mt798x-wmac 18000000.wifi phy0-ap0: failed (err=-2) to del object (id=3)
[  289.625118] mt798x-wmac 18000000.wifi phy0-ap1: failed (err=-2) to del object (id=3)
[  289.632851] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  289.640583] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  289.649471] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)
[  289.657249] mt798x-wmac 18000000.wifi phy0-ap2: failed (err=-2) to del object (id=3)
[  289.665006] mt798x-wmac 18000000.wifi phy1-ap1: failed (err=-2) to del object (id=3)

@Fail-Safe
Copy link
Contributor

@Fail-Safe Fail-Safe commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 In looking at my kernel log for the prior message I sent, I did happen to just notice this sitting in one of my AP's logs:

[20559.072434] platform 15010000.wed: Message 00000006 (seq 4880) timeout
[20559.079017] ------------[ cut here ]------------
[20559.083616] WARNING: CPU: 0 PID: 8990 at ___ieee80211_stop_tx_ba_session+0x2f4/0x340 [mac80211]
[20559.092338] Modules linked in: nft_fib_inet nf_flow_table_inet iptable_nat xt_state xt_nat xt_conntrack xt_REDIRECT xt_MASQUERADE nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_objref nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack mt7915e mt76_connac_lib mt76 mac80211 iptable_mangle iptable_filter ipt_REJECT ip_tables cfg80211 xt_time xt_tcpudp xt_multiport xt_mark xt_mac xt_limit xt_comment xt_TCPMSS xt_LOG x_tables tcp_bbr nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c compat cls_flower act_vlan crypto_safexcel cls_bpf act_bpf sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact sha512_arm64 sha1_ce sha1_generic seqiv md5 des_generic libdes authencesn authenc leds_gpio xhci_plat_hcd xhci_pci
[20559.092490]  xhci_mtk_hcd xhci_hcd gpio_button_hotplug usbcore usb_common aquantia
[20559.186187] CPU: 0 PID: 8990 Comm: kworker/u8:3 Not tainted 6.1.81 #0
[20559.192608] Hardware name: GL.iNet GL-MT6000 (DT)
[20559.197296] Workqueue: events_unbound wiphy_rfkill_set_hw_state_reason [cfg80211]
[20559.204785] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[20559.211727] pc : ___ieee80211_stop_tx_ba_session+0x2f4/0x340 [mac80211]
[20559.218347] lr : ___ieee80211_stop_tx_ba_session+0x1f0/0x340 [mac80211]
[20559.224958] sp : ffffffc0092fbc70
[20559.228255] x29: ffffffc0092fbc70 x28: 0000000000000000 x27: 000000000000001e
[20559.235371] x26: ffffff8004270ec0 x25: ffffff80042708a0 x24: ffffff80042708a0
[20559.242487] x23: ffffffc000b2f4e8 x22: ffffff80057a60e8 x21: 0000000000000002
[20559.249602] x20: ffffff8005d65200 x19: ffffff80039d6000 x18: 0000000000000160
[20559.256717] x17: ffffffffffffc800 x16: 00000000000067f8 x15: ffffffc008b399c0
[20559.263832] x14: 0000000000000420 x13: 0000000000000160 x12: 00000000ffffffea
[20559.270947] x11: 0000000000000040 x10: ffffffc008b36c70 x9 : ffffffc008b36c68
[20559.278062] x8 : 0000000000000002 x7 : 0000000000000000 x6 : 000000353beb8343
[20559.285177] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
[20559.292292] x2 : 0000000000000001 x1 : 0000000000000002 x0 : 00000000ffffff92
[20559.299408] Call trace:
[20559.301839]  ___ieee80211_stop_tx_ba_session+0x2f4/0x340 [mac80211]
[20559.308105]  __ieee80211_stop_tx_ba_session+0x38/0x60 [mac80211]
[20559.314111]  ieee80211_process_delba+0x38/0x40 [mac80211]
[20559.319508]  ieee80211_ibss_leave+0xbd8/0x1970 [mac80211]
[20559.324905]  wiphy_rfkill_set_hw_state_reason+0x124/0x180 [cfg80211]
[20559.331250]  process_one_work+0x210/0x394
[20559.335247]  worker_thread+0x160/0x4b4
[20559.338980]  kthread+0xd4/0xe0
[20559.342021]  ret_from_fork+0x10/0x20
[20559.345582] ---[ end trace 0000000000000000 ]---

However, the AP seems to be up and functional still, so it doesn't seem to have been an unrecoverable error. If this is unrelated to any of the changes from this current issue, and if it warrants an mt76 repo issue, please let me know and I will open one there. Thanks!

@Gingernut1978
Copy link
Contributor

@Gingernut1978 Gingernut1978 commented on 95e633e Mar 16, 2024 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Fail-Safe
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are without WiFi

Interesting—as I mentioned this is not the case for me.

@graysky2
Copy link
Contributor

@graysky2 graysky2 commented on 95e633e Mar 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nor I so far on filogic/xiaomi_redmi-router-ax6000-ubootmod. I patched mt76 with the latest from that repo, and built openwrt master branch from the latest commit. So far, I do not see any problems.

@Gingernut1978
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strange then.

I'll need to test again but openwrt master and mt76 with commit e5fb6995e7eb99763a98e687376315281f47b220 as tip works.

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gingernut1978 what's the model name of your device?

@Gingernut1978
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 Asus tuf-ax4200

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gingernut1978 the reason for the probe failure is the fact that the .dts for your device is missing the pre-calibration data. I've added a graceful fallback in this patch: https://nbd.name/p/79b39778
Please try it on top of mt76 master and show me the warning that it will print at boot regarding the missing precal data.

@Gingernut1978
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nbd168 are these few lines enough?

Mon Mar 18 07:01:51 2024 kern.info kernel: [   15.246416]
Mon Mar 18 07:01:51 2024 kern.info kernel: [   15.595532] mt798x-wmac 18000000.wifi: WM Firmware Version: ____000000, Build Time: 20221012174725
Mon Mar 18 07:01:51 2024 kern.info kernel: [   15.761099] mt798x-wmac 18000000.wifi: WA Firmware Version: DEV_000000, Build Time: 20221012174937
Mon Mar 18 07:01:51 2024 kern.warn kernel: [   15.868544] mt798x-wmac 18000000.wifi: missing precal data, size=403472
Mon Mar 18 07:01:51 2024 kern.info kernel: [   15.875205] mt798x-wmac 18000000.wifi: registering led 'mt76-phy0'
Mon Mar 18 07:01:51 2024 kern.info kernel: [   15.882779] mt798x-wmac 18000000.wifi: registering led 'mt76-phy1'

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gingernut1978 yes, thanks. Does mt76 work properly for you now?

@Gingernut1978
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gingernut1978 yes, thanks. Does mt76 work properly for you now?

Yes it does.

Thank you once again.

@peterbarta
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still have some problems with packets stuck in the AQL queue while using a build with the patches and also the latest mt76 driver. Device: GL-MT6000

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peterbarta please show me the stats while the issue occurs

@graysky2
Copy link
Contributor

@graysky2 graysky2 commented on 95e633e Mar 18, 2024 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peterbarta
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@graysky2 yes, this is the case. I thought it might be connected with the problem that I have trouble loading pages on Wifi. Sometimes Github, Facebook pics, Insta feed and sometimes even Reddit feed. The symptoms felt like the issue described above. Also I have problems in online games (LoL) when lots of packets are being exchanged; some of them are dropped, causing lag. 5 GHz AX, 160 MHz. Channels in use are not congested.

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peterbarta please try 80 MHz to see if it improves things.

@nxhack
Copy link
Contributor

@nxhack nxhack commented on 95e633e Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference.

I use 80MHz regularly. (Because DFS is too slow for AP to be enabled and throughput is no different than 80MHz.)
The previous problem has been resolved at 80MHz.

@graysky2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just so I am understanding. When I do a speed test, I am finding the following. Is it normal or abnormal?

# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
AC     AQL pending
VO     0 us
VI     0 us
BE     4816 us
BK     0 us
BC/MC  488 us
total  4816 us

@nbd168
Copy link
Member Author

@nbd168 nbd168 commented on 95e633e Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@graysky2 while doing the test, it's normal. When idle, counters should return to 0.

@graysky2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @nbd168 - I am seeing this behavior on both my MT6000 and AX6000. Great job with the drivers!

@JiaY-shi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     1680 us
BE     6928 us
BK     516 us
BC/MC  0 us
total  9124 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# 
root@OpenWrt:~# 
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     1680 us
BE     6928 us
BK     516 us
BC/MC  0 us
total  9124 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# 

@nbd168 This data is from my GL-MT6000, using the latest mainline code and the latest mt76 driver. The only adjustment I made was to use the latest mt7986 firmware from mtk-openwrt-feeds. I will roll back the mt7986 firmware, continue testing, and report the results.

@PussAzuki
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @nbd168 - I am seeing this behavior on both my MT6000 and AX6000. Great job with the drivers!

You mean redmi ax6000 doesn't have the high ping latency problem anymore?
🤔 Do I have to wait a little longer?

@graysky2
Copy link
Contributor

@graysky2 graysky2 commented on 95e633e Mar 19, 2024 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JiaY-shi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# 
root@OpenWrt:~# 
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us

This is the latest value of my gl-mt6000. I have been observing it for a while and it seems to have remained unchanged. The current running time is less than 24 hours. Is it normal or abnormal?

@graysky2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JiaY-shi - are your results from a busy network or an idle network? As I understand it, positive values are normal when traffic is flowing but they should return to 0 when no traffic is flowing:

while doing the test, it's normal. When idle, counters should return to 0.

@JiaY-shi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JiaY-shi - are your results from a busy network or an idle network? As I understand it, positive values are normal when traffic is flowing but they should return to 0 when no traffic is flowing:

while doing the test, it's normal. When idle, counters should return to 0.

From the idle time, there were less than 3 wireless devices connected to the router. I observed the same value for more than an hour. Later, when I paid attention to it, it had become 0.

@PussAzuki
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My Redmi AX6000 mesh looks like it works well now ☺️

@taylorkline
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# 
root@OpenWrt:~# 
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us

This is the latest value of my gl-mt6000. I have been observing it for a while and it seems to have remained unchanged. The current running time is less than 24 hours. Is it normal or abnormal?

What release are you running?

@JiaY-shi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# 
root@OpenWrt:~# 
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     512 us
BE     3568 us
BK     2396 us
BC/MC  0 us
total  6476 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us

This is the latest value of my gl-mt6000. I have been observing it for a while and it seems to have remained unchanged. The current running time is less than 24 hours. Is it normal or abnormal?

What release are you running?

The compiled version is this f84ed09
This is the latest value I've seen, and it's also a low load. There seems to be no problem with it so far, I will continue to test it.

root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     6880 us
BK     0 us
BC/MC  0 us
total  6880 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# date
Thu Mar 21 09:49:16 CST 2024
root@OpenWrt:~# 
root@OpenWrt:~# 
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     6880 us
BK     0 us
BC/MC  0 us
total  6880 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# date
Thu Mar 21 09:49:50 CST 2024
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     6900 us
BK     0 us
BC/MC  0 us
total  6900 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# date
Thu Mar 21 09:53:09 CST 2024
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     6880 us
BK     0 us
BC/MC  0 us
total  6880 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# date
Thu Mar 21 09:54:46 CST 2024
root@OpenWrt:~# cat /sys/kernel/debug/ieee80211/*/aql_pending
AC     AQL pending
VO     0 us
VI     0 us
BE     6880 us
BK     0 us
BC/MC  0 us
total  6880 us
AC     AQL pending
VO     0 us
VI     0 us
BE     0 us
BK     0 us
BC/MC  0 us
total  0 us
root@OpenWrt:~# date
Thu Mar 21 10:00:49 CST 2024
root@OpenWrt:~# 

@nxhack
Copy link
Contributor

@nxhack nxhack commented on 95e633e Mar 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JiaY-shi

I'm just guessing and not sure.

Do you have disassoc_low_ack '0' set?

If so, try this: what happens if you try to explicitly disconnect any remaining unconnected clients?

@peterbarta
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflecting to my earlier issue: WED was causing the problem.

@PussAzuki
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have wed on snapshot and don't have any problem.

@peterbarta
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have wed on snapshot and don't have any problem.

I have lots of retransmitted TCP packets and TCP ACKed unseen segment messages when WED was enabled. When I disabled all the issues went away and never came back since.

@Fail-Safe
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have wed on snapshot and don't have any problem.

I have lots of retransmitted TCP packets and TCP ACKed unseen segment messages when WED was enabled. When I disabled all the issues went away and never came back since.

Open a new issue with logs and additional info, perhaps?

Please sign in to comment.