# Получение данных после введения метана (удаление точек до введения метана в камеру)

In [1]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt

## Настроить параметры в следующих 4-х блоках

Настройки обрезки данных (можно варьировать значения)

In [29]:
shift_minutes = 2 # minutes added to the detected cut-off time before trimming
dif_period = 5 # spacing between the two compared points, in number of samples
dif_value = 0.05 # difference in ppm between the compared points; smaller absolute changes are treated as flat

Пути к референсным файлам (задать вручную все имена)

In [4]:
# Name of the folder that holds the reference data.
data = "2022-05-28"
# Path to the folder with the reference data.
referenceDataPath = f"../Референсные_данные/{data}"
# Name of the file with the raw reference data.
referenceDataFile = f"{referenceDataPath}/micro_{data}_f0000.txt"

Unnamed: 0,Time,[CH4]_ppm,[CH4]_ppm_sd,[CO2]_ppm,[CO2]_ppm_sd,[H2O]_ppm,[H2O]_ppm_sd,[CH4]d_ppm,[CH4]d_ppm_sd,[CO2]d_ppm,...,HZ_sd,Batt_v,Batt_v_sd,BattPer,BattPer_sd,Temp_Status_mA,Analyzer_Status_mA,Fit_Flag,MIU_VALVE,MIU_DESC
0,27/05/2022 11:23:27.290,28.0400,0.0,530.616,0.0,16596.1,0.0,28.5132,0.0,539.571,...,0.0,5.51041,0.0,116.488,0.0,20.0,20.0,2,3,
1,27/05/2022 11:23:28.284,28.2400,0.0,540.198,0.0,17699.5,0.0,28.7488,0.0,549.931,...,0.0,5.51045,0.0,116.491,0.0,20.0,20.0,2,3,
2,27/05/2022 11:23:29.279,28.3198,0.0,540.754,0.0,17740.0,0.0,28.8313,0.0,550.520,...,0.0,5.51043,0.0,116.490,0.0,20.0,20.0,2,3,
3,27/05/2022 11:23:30.273,28.2966,0.0,540.611,0.0,17721.9,0.0,28.8071,0.0,550.365,...,0.0,5.51030,0.0,116.482,0.0,20.0,20.0,2,3,
4,27/05/2022 11:23:31.268,28.3879,0.0,541.175,0.0,17709.4,0.0,28.8997,0.0,550.931,...,0.0,5.51044,0.0,116.490,0.0,20.0,20.0,2,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37320,27/05/2022 21:44:46.765,89.1507,0.0,424.069,0.0,18601.4,0.0,90.8405,0.0,432.107,...,0.0,5.50734,0.0,116.301,0.0,20.0,20.0,3,3,
37321,27/05/2022 21:44:47.764,89.1431,0.0,423.881,0.0,18631.1,0.0,90.8355,0.0,431.928,...,0.0,5.50730,0.0,116.299,0.0,20.0,20.0,3,3,
37322,27/05/2022 21:44:48.763,89.1300,0.0,423.995,0.0,18580.6,0.0,90.8175,0.0,432.022,...,0.0,5.50729,0.0,116.298,0.0,20.0,20.0,3,3,
37323,27/05/2022 21:44:49.763,89.1698,0.0,424.159,0.0,18595.8,0.0,90.8594,0.0,432.196,...,0.0,5.50735,0.0,116.302,0.0,20.0,20.0,3,3,



Задать вручную список из id серий с данными и имя папки, куда сложатся результаты обрезки 


In [22]:
seriesIds = [40, 41] # ids of the measurement series to process
folder_name = "/processed_auto/" # name of the folder that receives the results
MS_DATA_path = "../../MS_DATA" # path to the folder that contains the series

Названия столбцов в файлах с данными (проверить)

In [40]:
ch4_name = '[CH4]d_ppm' # methane column selected from the reference file
time_name = 'Time' # time column selected from the reference file
timestamp = 'timestamp' # time column of the measurement files; the reference time column is renamed to this
voltage = 'V' # measurement column plotted on the "voltage" axis

## Функции


Функции для интерполяции данных


In [41]:
def interpolation(t:np.ndarray, ch4:np.ndarray, t_new:np.ndarray) -> np.ndarray:
    """Linearly interpolate ``ch4`` sampled at ``t`` onto the points ``t_new``.

    Args:
        t: sample positions of the known values.
        ch4: known values, one per entry of ``t``.
        t_new: positions at which values are wanted.

    Returns:
        The result of ``np.interp(t_new, t, ch4)``, or ``None`` when any of
        the three arguments is not a ``np.ndarray``.
    """
    for candidate in (t, ch4, t_new):
        # Anything that is not already an ndarray is rejected, not converted.
        if not isinstance(candidate, np.ndarray):
            return None
    return np.interp(t_new, t, ch4)
def convertTimestampsToFloats(times: np.ndarray) -> np.ndarray:
    """Return an array with the ``.timestamp()`` value of every item in ``times``."""
    seconds = []
    for moment in times:
        seconds.append(moment.timestamp())
    return np.array(seconds)
def convertFloatsToTimestamps(times: np.ndarray):
    """Convert POSIX timestamps (seconds) into ``datetime`` objects.

    ``datetime.fromtimestamp`` only accepts a single number, so passing it an
    array directly fails; the values are therefore converted one by one.

    Args:
        times: a single number or an array-like of numbers.

    Returns:
        A ``datetime.datetime`` for scalar input; otherwise an object-dtype
        ``np.ndarray`` of ``datetime.datetime`` with the same shape as
        ``times``. Values are naive local times, as produced by
        ``datetime.fromtimestamp``.
    """
    if np.ndim(times) == 0:
        # Scalar (or 0-d array): keep the original single-value behaviour.
        return dt.datetime.fromtimestamp(float(times))
    flat = np.ravel(times)
    converted = np.empty(flat.shape, dtype=object)
    for position, seconds in enumerate(flat):
        converted[position] = dt.datetime.fromtimestamp(float(seconds))
    return converted.reshape(np.shape(times))

Функция для конкатенации

In [42]:
def concatRefDataIntoFrame(dirPath):
    """Read header-less CSV file(s) and stack them into one DataFrame.

    Args:
        dirPath: either a directory, in which case every ``refData*.csv``
            file directly inside it is read, or a path that is treated as a
            single CSV file.

    Returns:
        The rows of all readable files concatenated with a fresh integer
        index. Files are read in sorted name order so the row order is
        reproducible. An empty DataFrame is returned when no file could be
        read (``pd.concat`` would otherwise raise on an empty list).
    """
    if os.path.isdir(dirPath):
        # glob returns files in filesystem order; sort for a stable result.
        all_files = sorted(glob.glob(dirPath + "/refData*.csv"))
    else:
        all_files = [dirPath]

    frames = []
    for filename in all_files:
        try:
            frames.append(pd.read_csv(filename, delimiter=',', header=None))
        except Exception:
            # Best effort: report the unreadable file and go on with the rest.
            print(f"Problems with file {filename}")

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

Функция для построения графиков

In [46]:
def plotRefAndMeasure(x_ref, y_ref, x, y, title):
    """Build a two-row scatter figure: reference data on top, sensor data below.

    Args:
        x_ref, y_ref: coordinates scattered on the upper axes.
        x, y: coordinates scattered on the lower axes.
        title: text used as the figure-level title.

    Returns:
        The created matplotlib figure (16x10 inches at 300 dpi, shared x axis).
    """
    figure, (ref_axes, sensor_axes) = plt.subplots(
        nrows=2, ncols=1, sharex=True, figsize=(16, 10), dpi=300
    )
    panels = (
        (ref_axes, x_ref, y_ref, "Газоанализатор ABB (эталонный прибор)", 'Содержание метана, ppm'),
        (sensor_axes, x, y, "Калибруемый датчик", 'Напряжение, В'),
    )
    for axes, xs, ys, heading, y_label in panels:
        axes.scatter(xs, ys)
        axes.set_title(heading)
        axes.set_ylabel(y_label)
    figure.suptitle(title)
    return figure

## Обрезка и сохранение данных

In [44]:
# For every requested series: trim each measurement (and the matching
# reference data) to the part after the last steep change of the reference CH4
# signal, save plots and CSV files, and update the aggregated reference files.
for seriesId in seriesIds:
    try:
        seriesDataTemplate = f"{MS_DATA_path}/series{seriesId}_*"
        # glob order is filesystem dependent; sort so the pick is reproducible.
        seriesDataPath = sorted(glob.glob(seriesDataTemplate))[-1]
        seriesName = os.path.splitext(os.path.basename(seriesDataPath))[0]

        # Without this check the measurement count would be undefined (or left
        # over from the previous series) when the match is not a directory.
        if not os.path.isdir(seriesDataPath):
            raise NotADirectoryError(f"{seriesDataPath} is not a directory")
        measurementsCount = len(glob.glob(seriesDataPath + "/measure*.csv"))

        dirName = seriesDataPath + folder_name

        if not os.path.exists(dirName):
            os.makedirs(dirName)
            print("Directory " , dirName ,  " Created ")

        # Read the reference data, keeping only the time and CH4 columns.
        df_ref = pd.read_csv(referenceDataFile, delimiter=',', skiprows=[0], skipinitialspace=True)
        df_ref = pd.DataFrame(df_ref.loc[:, [time_name, ch4_name]])
        df_ref[time_name] = pd.to_datetime(df_ref[time_name], format="%d/%m/%Y %H:%M:%S.%f")
        # Plain assignment instead of set_axis(inplace=True), which pandas 2.0 removed.
        df_ref.columns = [timestamp, ch4_name]

        for measureId in range(1, measurementsCount + 1):
            try:
                measureTemplate = seriesDataPath + f"/measure{measureId}_*.csv"
                measureDataPath = glob.glob(measureTemplate)[0]
                csv_name = os.path.basename(measureDataPath)
                measure_name = os.path.splitext(csv_name)[0]

                # Read the measurement data.
                df_data = pd.read_csv(measureDataPath, delimiter=',')
                df_data[timestamp] = pd.to_datetime(df_data[timestamp])
                # df_data[timestamp] = df_data[timestamp] + pd.Timedelta(seconds=104) # Correction of the time shift between the sensor and the reference instrument
                start = df_data[timestamp].iloc[0]
                stop = df_data[timestamp].iloc[-1]

                # Keep only reference rows inside the measurement time span.
                outside = (df_ref[timestamp] < start) | (df_ref[timestamp] > stop)
                refData = df_ref[~outside].reset_index(drop=True)

                # Plot and save the untrimmed data.
                figure = plotRefAndMeasure(refData[timestamp], refData[ch4_name], df_data[timestamp], df_data[voltage],f"Необработанные данные измерения №{measureId} из серии №{seriesId}")
                figure.savefig(dirName + measure_name + '_row.png')
                plt.close(figure)
                plt.ioff()

                # Interpolate the reference CH4 values onto the sensor timestamps.
                data_timestamps = df_data[timestamp].to_list()
                ref_unix_times = convertTimestampsToFloats(refData[timestamp].to_list())
                unix_times = convertTimestampsToFloats(data_timestamps)
                interploated_ch4_reference = interpolation(t=ref_unix_times, t_new=unix_times, ch4=refData[ch4_name].to_numpy())
                refData_interpolated = pd.DataFrame(list(zip(data_timestamps, interploated_ch4_reference)), columns=[timestamp, ch4_name])

                # Change of the interpolated signal over dif_period samples.
                difference = refData_interpolated[ch4_name].diff(periods=dif_period)
                # Rows with a small change are discarded. The first dif_period
                # rows have a NaN difference and are deliberately kept, so a
                # cut-off point exists even when no steep change is found.
                flat = difference.abs() < dif_value
                # Last time with a large change, plus the configured margin.
                point = refData_interpolated.loc[~flat, timestamp].iloc[-1] + pd.Timedelta(minutes=shift_minutes)

                # Drop everything recorded up to and including the cut-off point.
                refData_interpolated = refData_interpolated[~(refData_interpolated[timestamp] <= point)].reset_index(drop=True)
                df_data = df_data[~(df_data[timestamp] <= point)].reset_index(drop=True)

                # Plot and save the trimmed data.
                figure = plotRefAndMeasure(refData_interpolated[timestamp], refData_interpolated[ch4_name], df_data[timestamp], df_data[voltage],f"Обрезанные данные измерения №{measureId} из серии №{seriesId},")
                figure.savefig(dirName + measure_name + '.png')
                refData_interpolated.to_csv(dirName + '/refData_{}.csv'.format(measure_name), header=False, index=False)
                df_data.to_csv(dirName + measure_name + ".csv", index=False)
                plt.close(figure)
                plt.ioff()

                # Rebuild the reference file of the whole series from the
                # per-measurement reference files written so far.
                seriesRefDf = concatRefDataIntoFrame(dirName)
                seriesRefDf.columns = [timestamp, ch4_name]
                seriesRefDf = seriesRefDf.sort_values(by=[timestamp]).round({ch4_name: 5})
                seriesRefDf.to_csv(dirName + "/referenceData_{}.csv".format(seriesName), header=False, index=False)

                # Merge the series reference data into the day-level file kept
                # next to the raw reference data.
                endRefPath = referenceDataPath + "/referenceData_{}.csv".format(data)
                if os.path.exists(endRefPath):
                    dayRefDf = pd.read_csv(endRefPath, delimiter=',', header=None)
                    dayRefDf.columns = [timestamp, ch4_name]
                    finalDf = pd.concat([seriesRefDf, dayRefDf], ignore_index=True)
                    # The series data is merged in again after every
                    # measurement (and on every re-run); without this the
                    # same rows pile up in the day-level file.
                    finalDf = finalDf.drop_duplicates(subset=[timestamp], keep="first")
                    finalDf = finalDf.sort_values(by=[timestamp]).round({ch4_name: 5})
                    finalDf.to_csv(endRefPath, header=False, index=False)
                else:
                    seriesRefDf.to_csv(endRefPath, header=False, index=False)
            except Exception as e:
                # Best effort: report the failed measurement and go on.
                print(measureId, e)
                # Release any figure left open by the failed iteration.
                plt.close("all")
    except Exception as e:
        # Best effort: report the failed series and go on with the next one.
        print(seriesId, e)
