# 基于ps及pandas的服务器内存数据统计

## 采集数据

In [1]:
# Run the commands below on the server that is being analysed.

# Record when the sample was taken (command, then its sample output):
# date
# 2023年 01月 17日 星期二 14:30:19 CST

# Use ps and its output-format options to write the data we need as CSV.
# "-o %c," prints the command name followed by a literal comma, "-o rss" adds
# the resident set size, and "--sort=-rss" orders the rows largest RSS first:
# ps -e -o %c, -o rss --sort=-rss > mems.csv

In [2]:
# 原数据be like：
! cat mems.csv | head -n 5

COMMAND        ,  RSS
qemu-system-x86,33823984
qemu-system-x86,16924120
qemu-system-x86,16866464
qemu-system-x86,8627980


## 导入数据

In [3]:
import pandas as pd

In [4]:
# Load the ps capture: comma separated, first line is the header.
df = pd.read_csv("mems.csv", sep=",")
# Number the rows from 1. The capture was written with --sort=-rss, so the
# index then doubles as each process's rank by memory use.
df.index = df.index + 1
# ps pads every column to a fixed width, so the header names carry stray
# spaces ("COMMAND        ", "  RSS"). Strip them so df.RSS / df.COMMAND work.
df.columns = df.columns.str.strip()
# The same padding is present on the command names themselves; strip it too,
# otherwise exact matches such as df.COMMAND == "mysqld" never hit.
df["COMMAND"] = df["COMMAND"].str.strip()
# Preview: 1026 processes in total
df

Unnamed: 0,COMMAND,RSS
1,qemu-system-x86,33823984
2,qemu-system-x86,16924120
3,qemu-system-x86,16866464
4,qemu-system-x86,8627980
5,mysqld,7683188
...,...,...
1022,kworker/50:1-mm,0
1023,kworker/55:1-mm,0
1024,kworker/0:0-eve,0
1025,kworker/47:0-i4,0


## 处理数据

In [5]:
# Drop the rows whose RSS is 0; 324 processes remain
df = df.loc[df["RSS"] > 0]
# Dividing the RSS total by 1024 twice expresses it in GB:
# these 324 processes hold about 115 GB between them
df["RSS"].sum() / 1024 / 1024

115.3391342163086

In [6]:
# Show the frame again now that the zero-RSS rows have been removed
df

Unnamed: 0,COMMAND,RSS
1,qemu-system-x86,33823984
2,qemu-system-x86,16924120
3,qemu-system-x86,16866464
4,qemu-system-x86,8627980
5,mysqld,7683188
...,...,...
320,kolla_start,4
321,kolla_start,4
322,kolla_start,4
323,kolla_start,4


In [7]:
# Add two derived columns: RSS expressed in MB and in GB.
# assign() builds a new frame, so the filtered slice is never modified in place.
df = df.assign(
    RSSMB=lambda frame: frame["RSS"] / 1024,
    RSSGB=lambda frame: frame["RSS"] / 1024 / 1024,
)
df.describe()

Unnamed: 0,RSS,RSSMB,RSSGB
count,324.0,324.0,324.0
mean,373277.3,364.528622,0.355985
std,2402061.0,2345.762337,2.290784
min,4.0,0.003906,4e-06
25%,3825.0,3.735352,0.003648
50%,11166.0,10.904297,0.010649
75%,106137.0,103.649414,0.10122
max,33823980.0,33031.234375,32.257065


In [8]:
# Add a running total of memory use in GB.
# The rows are still in the order ps wrote them (--sort=-rss, largest first),
# so row k holds the combined memory of the k largest processes.
df = df.assign(RSS_GB_CUM=df["RSSGB"].cumsum())
df.describe()

Unnamed: 0,RSS,RSSMB,RSSGB,RSS_GB_CUM
count,324.0,324.0,324.0,324.0
mean,373277.3,364.528622,0.355985,111.379911
std,2402061.0,2345.762337,2.290784,8.440401
min,4.0,0.003906,4e-06,32.257065
25%,3825.0,3.735352,0.003648,110.188729
50%,11166.0,10.904297,0.010649,114.734324
75%,106137.0,103.649414,0.10122,115.253636
max,33823980.0,33031.234375,32.257065,115.339134


## 统计分析

In [9]:
# Rows are ordered by memory use, largest first: the top 15 processes account
# for about 100 GB, and the remaining ~15 GB is spread over a few hundred others
df.iloc[:20]

Unnamed: 0,COMMAND,RSS,RSSMB,RSSGB,RSS_GB_CUM
1,qemu-system-x86,33823984,33031.234375,32.257065,32.257065
2,qemu-system-x86,16924120,16527.460938,16.140099,48.397163
3,qemu-system-x86,16866464,16471.15625,16.085114,64.482277
4,qemu-system-x86,8627980,8425.761719,8.228283,72.71056
5,mysqld,7683188,7503.113281,7.327259,80.037819
6,qemu-system-x86,4412644,4309.222656,4.208225,84.246044
7,ceph-osd,3034632,2963.507812,2.894051,87.140095
8,ceph-osd,2644688,2582.703125,2.522171,89.662266
9,ceph-osd,2379332,2323.566406,2.269108,91.931374
10,ceph-osd,2281156,2227.691406,2.17548,94.106853


In [10]:
# Count how many processes share each command name, most frequent first (top 20)
df["COMMAND"].value_counts().iloc[:20]

httpd              67
containerd-shim    42
kolla_start        36
neutron-server     13
nova-api           11
heat-api-cfn        6
heat-api            6
neutron-metadat     6
glance-registry     6
heat-engine         6
nova-conductor      6
glance-api          6
qemu-system-x86     6
ceph-osd            6
bash                4
dnsmasq             4
sleep               4
sshd                3
haproxy             3
keepalived          3
Name: COMMAND, dtype: int64

In [11]:
# Lift pandas' row limit so the whole result is shown.
# Note: this option stays in effect for the rest of the kernel session.
pd.options.display.max_rows = None
# Total memory per command name, largest consumers first
(
    df.groupby("COMMAND")[["RSSMB", "RSSGB"]]
    .sum()
    .sort_values("RSSMB", ascending=False)
)

Unnamed: 0_level_0,RSSMB,RSSGB
COMMAND,Unnamed: 1_level_1,Unnamed: 2_level_1
qemu-system-x86,80609.707031,78.720417
ceph-osd,13684.550781,13.363819
mysqld,7503.113281,7.327259
httpd,3424.902344,3.344631
neutron-server,1675.730469,1.636456
nova-api,1672.753906,1.633549
ceph-mon,930.753906,0.908939
ovs-vswitchd,713.925781,0.697193
glance-api,682.386719,0.666393
nova-conductor,674.988281,0.659168


## 可疑进程排查
接下来就可以结合各进程的数量及其内存消耗总量，逐个排查可疑进程。