In [1]:
import os
import pandas as pd

In [2]:
# Root folder of the raw data; it is handed to Parser below, which lists its
# entries and treats each one as a per-date sub-folder.
path_to_data = 'dane/zadanie2/'

class Parser:
    """Load per-date CSV exports and combine them into one time-indexed frame.

    path_to_data -> path to a folder whose sub-folders are segregated by date.
    Inside every date folder, each file whose name contains one of the
    strings in ``self.variabletypes`` is read as a CSV.

    example:

    >>> path_to_data = 'zadanie2/'
    >>> os.listdir(path_to_data)
    ['2021-04-19', '2021-05-08', '2021-05-27']

    Typical use: call ``parse()`` once, then read the result with
    ``get_data()``.
    """

    def __init__(self, path_to_data):
        """Store the data root and the variable types to look for.

        Nothing is read from disk here; ``self.data`` stays ``None`` until
        ``parse()`` (or ``data_join()``) has run.
        """
        self.path = path_to_data
        # Not used by any method of this class; kept as a public attribute.
        self.tempdata_path = 'dane/tempdata.csv'
        # A file belongs to a variable type when the type is a substring of
        # the file name.
        self.variabletypes = ['manipulowane', 'straty', 'zaklocajace', 'zaklocane']
        self.data = None

    def get_file(self, path_to_file):
        """Read one CSV file and return it indexed by its parsed time column.

        The time column is expected to be called ``Czas``; a lower-case
        ``czas`` header is accepted as well and renamed. The column is
        converted with ``pd.to_datetime``, installed as the index (named
        ``Czas``) and removed from the regular columns.

        Raises:
            KeyError: if the file has neither a ``Czas`` nor a ``czas`` column.
        """
        df = pd.read_csv(path_to_file)

        # Explicit header check instead of a bare ``except``: only the known
        # lower-case variant is tolerated, every other error propagates.
        if 'Czas' not in df.columns:
            if 'czas' not in df.columns:
                raise KeyError(
                    "no 'Czas' or 'czas' column in {}".format(path_to_file)
                )
            df = df.rename(columns={'czas': 'Czas'})

        df.index = pd.to_datetime(df['Czas'])
        df = df.drop(columns='Czas')

        # Some files carry a stray 'Unnamed: 5' column; presumably produced by
        # a trailing delimiter in the header -- TODO confirm against raw files.
        if 'Unnamed: 5' in df.columns:
            df = df.drop(columns='Unnamed: 5')

        return df

    def merge_data2(self, data, data2):
        """Stack ``data2`` below ``data``, keeping the union of their columns.

        ``parse()`` no longer calls this in its loop (it concatenates all
        parts at once); the method is kept for existing callers.
        """
        return pd.concat([data, data2], join='outer')

    def data_join(self, frames):
        """Inner-join all frames on ``Czas`` and store the result in ``self.data``.

        Frames are merged left to right, so only timestamps present in every
        frame survive.

        Raises:
            ValueError: if ``frames`` is empty.
        """
        if len(frames) == 0:
            raise ValueError('data_join() needs at least one frame to join')

        data = frames[0]
        for frame in frames[1:]:
            data = pd.merge(data, frame, on='Czas', how='inner')

        self.data = data

    def get_data(self):
        """Return the joined frame, or ``None`` if nothing has been parsed yet."""
        return self.data

    def parse(self):
        """Read every matching file under ``self.path`` and join the results.

        For each variable type, all files whose name contains that type are
        read from every date folder and stacked row-wise; the per-type frames
        are then inner-joined on ``Czas`` via ``data_join()``. Progress is
        printed per variable type and per date.

        Raises:
            ValueError: if no file matches one of the variable types.
        """
        # os.listdir returns entries in arbitrary order, so sort for a
        # reproducible result. Entries that are not directories (stray files
        # in the root folder) are skipped instead of crashing os.listdir below.
        dates = sorted(
            entry for entry in os.listdir(self.path)
            if os.path.isdir(os.path.join(self.path, entry))
        )
        processed = 0
        frames = []
        for variable in self.variabletypes:
            # Collect the parts and concatenate once: calling concat inside
            # the loop re-copies all accumulated rows for every file.
            parts = []
            print(variable)
            for date in dates:
                # os.path.join works whether or not self.path ends with a
                # separator (plain string concatenation did not).
                date_dir = os.path.join(self.path, date)
                for file in sorted(os.listdir(date_dir)):
                    if variable in file:
                        parts.append(self.get_file(os.path.join(date_dir, file)))
                        processed += 1

                print('date {} processed'.format(date))

            if not parts:
                raise ValueError(
                    "no files matching '{}' found under {}".format(variable, self.path)
                )
            frames.append(pd.concat(parts, join='outer'))
            print('--------------------------')
        print('processed data: {}'.format(processed))

        self.data_join(frames)
        

In [3]:
# Build the parser for the configured data folder, then read and join every
# matching CSV. parse() prints one line per variable type and per date folder,
# followed by the total number of files read.
parser = Parser(path_to_data)
parser.parse()

manipulowane
date 2021-04-19 processed
date 2021-05-08 processed
date 2021-05-27 processed
--------------------------
straty
date 2021-04-19 processed
date 2021-05-08 processed
date 2021-05-27 processed
--------------------------
zaklocajace
date 2021-04-19 processed
date 2021-05-08 processed
date 2021-05-27 processed
--------------------------
zaklocane
date 2021-04-19 processed
date 2021-05-08 processed
date 2021-05-27 processed
--------------------------
processed data: 464


In [4]:
# Fetch the joined frame produced by parse(); the bare expression on the last
# line makes the notebook render it.
data = parser.get_data()
data

Unnamed: 0_level_0,001FCx00285_SPPV.PV,001XXXCALC01.NUM.PV[3],001SCx00274_SPPV.PV,001FCx00241_sppv.pv,001NIR0SZR0.daca.pv,001NIR0SZRG.daca.pv,001NIR0S600.daca.pv,001NIR0S500.daca.pv,001NIR0S300.daca.pv,001NIR0S100.daca.pv,...,001XXXCALC01.NUM.PV[2],prob_corg,prob_s,sita_nadziarno,sita_podziarno,poziom_zuzel,001UCx00274.pv,001NIR0ODS0.daca.pv,temp_zuz,007SxR00555.daca1.pv
Czas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-19 00:00:00,2700.0,81.0,50.0,31.0,23.298206,0.870960,1.513844,7.206612,7.292219,6.445967,...,303.0,8.61,11.27,2.10,62.000000,1250,13.727884,9.327152,1306,-0.128922
2021-04-19 00:00:01,2700.0,81.0,50.0,31.0,23.303154,0.870934,1.513818,7.207745,7.292501,6.446149,...,303.0,8.61,11.27,2.34,62.799999,1360,13.726639,9.329683,1305,-0.328362
2021-04-19 00:00:02,2700.0,81.0,50.0,31.0,23.308102,0.870908,1.513792,7.208877,7.293985,6.446331,...,303.0,8.61,11.27,2.34,62.799999,1360,13.725391,9.332214,1305,-0.113587
2021-04-19 00:00:03,2700.0,81.0,50.0,31.0,23.313053,0.870881,1.513766,7.210010,7.295469,6.446513,...,303.0,8.61,11.27,2.34,62.799999,1360,13.724146,9.334154,1305,0.101188
2021-04-19 00:00:04,2700.0,81.0,50.0,31.0,23.318001,0.870855,1.513740,7.211143,7.296953,6.446695,...,303.0,8.61,11.27,2.34,62.799999,1360,13.722900,9.335503,1305,-0.098252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-06 22:49:55,3000.0,72.0,50.0,24.0,17.705795,0.858735,1.462337,4.117548,5.318237,5.966559,...,276.0,8.81,10.18,3.32,59.400002,1180,17.086479,10.113976,1301,0.622794
2021-06-06 22:49:56,3000.0,72.0,50.0,24.0,17.707394,0.858851,1.462500,4.117713,5.318606,5.966569,...,276.0,8.81,10.18,3.32,59.400002,1180,17.096081,10.115042,1301,0.684169
2021-06-06 22:49:57,3000.0,72.0,50.0,24.0,17.708994,0.858968,1.462663,4.117879,5.318974,5.966580,...,276.0,8.81,10.18,3.32,59.400002,1180,17.105686,10.116107,1301,0.711000
2021-06-06 22:49:58,3000.0,72.0,50.0,24.0,17.710592,0.859084,1.462826,4.118045,5.319343,5.966590,...,276.0,8.81,10.18,3.32,59.400002,1180,17.115292,10.117172,1301,0.581000


In [5]:
# Print the frame summary: index range, column names and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3954000 entries, 2021-04-19 00:00:00 to 2021-06-06 22:49:59
Data columns (total 26 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   001FCx00285_SPPV.PV     float64
 1   001XXXCALC01.NUM.PV[3]  float64
 2   001SCx00274_SPPV.PV     float64
 3   001FCx00241_sppv.pv     float64
 4   001NIR0SZR0.daca.pv     float64
 5   001NIR0SZRG.daca.pv     float64
 6   001NIR0S600.daca.pv     float64
 7   001NIR0S500.daca.pv     float64
 8   001NIR0S300.daca.pv     float64
 9   001NIR0S100.daca.pv     float64
 10  001FYx00206_SPSUM.pv    float64
 11  001FCx00231_SPPV.PV     float64
 12  001FCx00251_SPPV.PV     float64
 13  001FCx00281.PV          float64
 14  001FCx00262.PV          float64
 15  001FCx00261.PV          float64
 16  001XXXCALC01.NUM.PV[2]  float64
 17  prob_corg               float64
 18  prob_s                  float64
 19  sita_nadziarno          float64
 20  sita_podziarno          float64
 21

In [6]:
# Persist the joined frame as CSV so later steps can reuse it without
# re-parsing the raw files.
data.to_csv('dane/processed_data_zad2.csv')