## 处理数据的脚本（从头到尾）— An end-to-end data-processing workflow

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import BoundaryNorm
import time

# Load the raw readings from the text file into a DataFrame.
# A raw string keeps the Windows backslash literal: '\d' is not a valid escape
# sequence, and newer Python versions warn about it in a normal string.
data = np.loadtxt(r'F:\data.txt')
df = pd.DataFrame(data)

# Drop the first row only, because the first reading may be inaccurate.
# NOTE(review): the original comment said "drop the first three rows" while
# the code drops one; confirm which is intended.
df = df.iloc[1:]

In [48]:
# Separate the face readings from the background. The minimum of each row is
# used as the ambient temperature, and a reading counts as "face" when it is
# more than 7 above that minimum.
ta = df.min(axis=1)

readings = df.values
ambient = ta.values[:, np.newaxis]
# Comparisons with NaN are False, so NaN readings are treated as background.
is_face = (readings - ambient) > 7

# df_face: same width as df, with every non-face reading replaced by the
# ambient temperature of its row.
df_face = pd.DataFrame(np.where(is_face, readings, ambient))

# df_onlyface: only the face readings of each row, packed to the left; rows
# have different lengths and pandas pads the short ones with NaN.
# Both frames are built in one go instead of growing them with pd.concat
# inside a loop, which copied the whole frame on every iteration, gave every
# row the index label 0 and dropped rows that had no face reading at all
# (leaving df_onlyface with fewer rows than ta). Such rows are now kept as
# all-NaN rows so that row i always matches ta[i].
df_onlyface = pd.DataFrame(
    [row[mask].tolist() for row, mask in zip(readings, is_face)]
)

In [49]:
# Give both objects fresh row labels 0, 1, 2, 3, ...
df_onlyface.index = pd.RangeIndex(len(df_onlyface))
ta.index = pd.RangeIndex(len(ta))

In [50]:
# Combine the face readings and the ambient temperature into one frame, with
# the ambient temperature as the last column.
if df_onlyface.shape[0] != ta.shape[0]:
    # Fail loudly: silently skipping the concat would leave df_face_ta
    # undefined (or, in a notebook, stale from an earlier run) for the
    # cells that follow.
    raise ValueError(
        "row count mismatch: df_onlyface has {} rows but ta has {}".format(
            df_onlyface.shape[0], ta.shape[0]
        )
    )
df_face_ta = pd.concat([df_onlyface, ta], axis=1, ignore_index=True)

In [51]:
# Keep only the rows whose number of non-NaN values is strictly between 300
# and 500. The count is taken over every column of df_face_ta.
valid_counts = df_face_ta.count(axis=1)
df_face_ta = df_face_ta[(valid_counts > 300) & (valid_counts < 500)]

In [77]:
# Display the filtered frame as the notebook cell output.
df_face_ta

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,413,414,415,416,417,418,419,420,421,422
0,33.42545,33.09329,33.26901,32.93668,33.38947,34.06543,34.73529,35.43115,35.20297,34.65469,...,35.31952,35.6059,35.8161,36.07593,35.53232,36.13458,36.27026,36.9578,33.95194,25.76224
1,33.06983,33.46301,33.36023,34.47962,33.95212,35.04477,34.31589,34.53183,33.05396,33.32291,...,,,,,,,,,,25.96768
3,32.99136,33.28964,33.16177,33.43442,32.66815,33.27585,33.88684,33.86365,33.50427,33.88623,...,,,,,,,,,,25.54703
4,33.01471,32.7171,32.88144,33.38309,33.3457,32.77686,33.23718,33.52106,32.88538,33.39148,...,,,,,,,,,,25.68735
5,33.17325,33.50562,33.047,32.95013,33.67352,33.04413,33.88745,33.85254,33.38236,33.1221,...,,,,,,,,,,25.82947
6,33.17697,33.23956,33.35843,33.55725,34.01843,34.23636,33.8909,34.29001,33.30472,33.74808,...,,,,,,,,,,26.12711
7,33.15878,33.34018,33.19791,33.7442,33.88306,34.22022,34.10184,33.54553,33.53937,33.19751,...,,,,,,,,,,25.97864
8,33.35385,33.47577,33.7822,33.59946,34.17221,33.53363,32.82806,33.54163,33.47534,33.60016,...,,,,,,,,,,25.80859
9,32.68195,32.74173,32.91168,33.13211,33.67697,34.30597,34.43607,34.53314,35.01764,34.83777,...,,,,,,,,,,25.61929


In [78]:
# Set the last column (the ambient temperature) aside so the frequency count
# covers the face readings only; it is joined back in a later cell.
df_face_ = df_face_ta.iloc[:, :-1]

# Bin edges used for the histogram of every row.
bins = [
    28.3, 28.6, 28.9, 29.2, 29.5,
    29.8, 30.1, 30.4, 30.7, 31.0, 31.3,
    31.6, 31.9, 32.2, 32.5, 32.8, 33.1,
    33.4, 33.7, 34.0, 34.3, 34.6, 34.9,
    35.2, 35.5, 35.8, 36.1, 36.4, 36.7,
]

# One histogram per row, ignoring the NaN padding. The per-row counts are
# collected first and the frame is built once: growing it with pd.concat
# inside the loop copied the frame on every iteration and left every row
# with the index label 0.
face_rows = df_face_.values
temDistribution = pd.DataFrame(
    [np.histogram(row[~np.isnan(row)], bins=bins)[0] for row in face_rows]
)

In [85]:
# Histogram (numpy's default binning) of the natural log of row 5 of df_face.
# NOTE: `bin` shadows the builtin of the same name; the name is kept because
# a later cell displays it.
log_row = np.log(df_face.iloc[5])
N, bin = np.histogram(log_row)

In [87]:
# Display the bin edges returned by np.histogram in the previous cell.
bin

array([3.25151609, 3.28644676, 3.32137743, 3.3563081 , 3.39123877,
       3.42616945, 3.46110012, 3.49603079, 3.53096146, 3.56589213,
       3.6008228 ])

In [55]:
# Convert each row of counts into fractions of that row's total.
# This is done as a single vectorised division: writing float rows back into
# the integer count frame one at a time relies on pandas implicitly upcasting
# the columns, which newer pandas versions warn about. Dividing by a plain
# array also avoids any label alignment. A row whose total is 0 becomes NaN.
row_totals = temDistribution.sum(axis=1).values
temDistribution = temDistribution.div(row_totals, axis=0)

In [57]:
# pd.concat(axis=1) lines rows up by index label, so both objects are
# renumbered 0, 1, 2, ... before joining.
ta = df_face_ta.iloc[:, -1].reset_index(drop=True)
temDistribution.index = range(temDistribution.shape[0])

# Append the ambient temperature as the last column of the distribution.
if temDistribution.shape[0] != ta.shape[0]:
    # Fail loudly: silently skipping the concat would let the next cell
    # rename the last histogram column to "ta".
    raise ValueError(
        "row count mismatch: temDistribution has {} rows but ta has {}".format(
            temDistribution.shape[0], ta.shape[0]
        )
    )
temDistribution = pd.concat([temDistribution, ta], axis=1)

In [58]:
# Relabel the rows 0..n-1, name every column but the last p0, p1, ... and
# name the last column "ta"; then display the frame.
n_rows, n_cols = temDistribution.shape
temDistribution.index = list(range(n_rows))
temDistribution.columns = ["p{}".format(k) for k in range(n_cols - 1)] + ["ta"]
temDistribution

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p16,p17,p18,p19,p20,p21,p22,p23,p24,ta
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.042105,0.047368,0.068421,0.021053,0.110526,0.094737,0.173684,0.178947,0.263158,25.76224
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006173,0.04321,0.074074,0.067901,0.067901,0.111111,0.141975,0.209877,0.277778,25.96768
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044025,0.044025,0.050314,0.037736,0.081761,0.100629,0.169811,0.207547,0.251572,25.54703
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037433,0.032086,0.074866,0.058824,0.074866,0.13369,0.106952,0.208556,0.262032,25.68735
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028736,0.04023,0.068966,0.045977,0.132184,0.114943,0.114943,0.218391,0.235632,25.82947
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.020979,0.034965,0.090909,0.076923,0.125874,0.181818,0.230769,0.237762,26.12711
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0625,0.075,0.05,0.08125,0.1375,0.11875,0.2,0.275,25.97864
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025157,0.025157,0.075472,0.069182,0.106918,0.119497,0.163522,0.169811,0.245283,25.80859
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041237,0.051546,0.061856,0.061856,0.061856,0.128866,0.149485,0.21134,0.226804,25.61929


在此输入投票结果

In [59]:
# Ask for the thermal-comfort vote. The prompt lists the valid values 0-4.
# Surrounding whitespace is stripped because `inp` is also used verbatim in
# the output file names built by later cells.
inp = input('请输入热舒适度：0过冷 1冷 2舒适 3热 4过热').strip()
label = int(inp)  # raises ValueError when the input is not an integer
if not 0 <= label <= 4:
    # Reject votes outside the range offered by the prompt instead of
    # writing them into the feature file as labels.
    raise ValueError(
        "thermal comfort vote must be an integer from 0 to 4, got {!r}".format(inp)
    )

# Repeat the vote once per row and append it as an extra column, so the
# temperature distribution and the label end up in one frame.
n_rows = temDistribution.shape[0]
label = pd.DataFrame(np.array([label] * n_rows))
temDistribution_label = pd.concat([temDistribution, label], axis=1)

请输入热舒适度：0过冷 1冷 2舒适 3热 4过热3


保存原始数据和处理后的特征数据

In [60]:
import time

# Save the raw frame under a file name that carries the current local time
# (to the minute) and the value of `inp`.
# mode='a' appends: saving twice to the same name adds a second header and a
# second copy of the rows to the file.
stamp = time.strftime("%Y%m%d_%H%M", time.localtime())
raw_path = "raw_data/data_{}_{}.csv".format(stamp, inp)
df.to_csv(raw_path, mode='a', index=False, header=True)

In [65]:
# pd.read_csv("raw_data/data_1.csv")
# Read one saved raw-data file back and display it as the cell output.
pd.read_csv("raw_data/data_20190309_1008_3.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,26.87506,26.51291,27.15756,26.62268,27.06442,27.46872,29.36157,30.08496,29.73963,30.12613,...,35.8161,36.07593,35.53232,36.13458,36.27026,36.9578,33.95194,32.40799,30.84384,31.78369
1,27.25494,26.50464,26.05676,26.8558,27.38785,27.68478,28.17657,28.61163,30.23135,30.02164,...,35.80579,36.92401,36.20203,36.42963,36.1528,36.7366,32.53928,31.78748,30.69211,30.6712
2,26.8689,26.65448,26.17401,26.74829,26.26636,26.45593,26.20114,26.15668,26.37006,26.10404,...,35.61172,35.27771,32.05652,31.48602,30.07593,29.85034,28.60718,28.42551,28.66776,28.30533
3,27.24146,26.22723,26.40344,26.8299,26.47513,26.53235,26.40817,26.33594,27.48367,26.68039,...,36.31894,36.40784,34.88699,34.97943,33.86539,32.87775,31.79373,31.02652,30.52762,30.37903
4,26.96414,27.12607,26.14145,26.46396,26.56772,26.97613,26.93659,26.65335,27.66879,27.58392,...,36.22431,36.57785,35.08136,35.1629,33.86826,33.52353,30.72769,31.74475,30.53622,30.35895
5,27.07169,26.46295,25.99789,26.92734,27.20938,26.62021,26.80655,27.49027,28.05368,28.16745,...,36.11228,36.17786,35.55078,36.05093,34.70935,34.60196,31.65991,31.59973,30.38101,30.61191
6,27.3374,27.23874,26.97723,27.05103,26.55329,27.29599,27.24841,27.38669,28.76505,30.15332,...,36.3981,36.75305,36.03943,36.35892,34.6066,34.17169,31.42761,31.11798,29.97723,31.17572
7,27.44705,26.58447,26.71619,26.31653,26.53445,26.72403,26.68497,26.94766,27.63541,29.05447,...,36.47839,36.63968,36.21897,36.44037,35.87125,35.24561,31.88946,31.22177,30.64948,30.6041
8,26.26685,27.20407,26.09372,26.89627,25.9646,26.58945,26.8895,27.24649,27.82495,28.43732,...,36.27002,36.43597,36.29575,35.72235,36.27097,35.55569,31.74555,30.95819,30.3468,30.58505
9,26.40875,27.3465,26.09885,26.66241,26.18216,26.36331,26.33698,26.49094,26.99838,28.52515,...,36.06689,35.75308,36.08966,35.81171,36.58212,36.09015,33.15891,32.06753,30.50354,29.90689


In [61]:
# temDistribution_label.to_csv("test.csv", index=False,header=True)
# Save the labelled feature frame under a file name that carries the current
# local time (to the minute) and the value of `inp`.
# mode='a' appends: saving twice to the same name adds a second header and a
# second copy of the rows to the file.
stamp = time.strftime("%Y%m%d_%H%M", time.localtime())
feature_path = "test_{}_{}.csv".format(stamp, inp)
temDistribution_label.to_csv(feature_path, mode='a', index=False, header=True)

处理后的特征数据如下所示

In [64]:
# Read one saved feature file back and display it as the cell output.
pd.read_csv("test_20190309_1008_3.csv")

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p17,p18,p19,p20,p21,p22,p23,p24,ta,0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047368,0.068421,0.021053,0.110526,0.094737,0.173684,0.178947,0.263158,25.76224,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.04321,0.074074,0.067901,0.067901,0.111111,0.141975,0.209877,0.277778,25.96768,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044025,0.050314,0.037736,0.081761,0.100629,0.169811,0.207547,0.251572,25.54703,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032086,0.074866,0.058824,0.074866,0.13369,0.106952,0.208556,0.262032,25.68735,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.04023,0.068966,0.045977,0.132184,0.114943,0.114943,0.218391,0.235632,25.82947,3
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.020979,0.034965,0.090909,0.076923,0.125874,0.181818,0.230769,0.237762,26.12711,3
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0625,0.075,0.05,0.08125,0.1375,0.11875,0.2,0.275,25.97864,3
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025157,0.075472,0.069182,0.106918,0.119497,0.163522,0.169811,0.245283,25.80859,3
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.051546,0.061856,0.061856,0.061856,0.128866,0.149485,0.21134,0.226804,25.61929,3
