# Data analysis can be split into three elementary steps:
## 1. Importing the data
All the methods necessary to **read** the file contents, parse data and output it in desired format. Ie. something like that:
 
```
import pathlib
import pandas as pd

def data_import(self, path: pathlib.Path) -> pd.DataFrame:
    # ... 
    # dataframe =
    return dataframe
```

## 2. Processing the data
All the smoothing, normalization, interpolation, etc. should happen here. Implemented using composite design pattern.

## 3. Showing the data
ble ble

# Implementation using bridge design pattern/dependency injection:


In [5]:
import pathlib
from pathlib import Path
from IPython.display import display
from abc import ABC, abstractmethod
from typing import Callable, Optional, List
import pandas as pd
from pandas.errors import ParserError
import numpy as np
import itertools

class Handler(ABC):
    @abstractmethod
    def handle(self, *args, **kwargs):
        raise NotImplementedError("You should implement this!")
        
    def update_handler(self, new_handler: Callable) -> None:
        self.handle = new_handler
        
class FileHandler(Handler):
    def handle(self, path: pathlib.Path) -> pd.DataFrame:
        with open(path) as f:
            for i, row in enumerate(f):
                for delimiter, decimal in (itertools.product([';',',','\t'], ['.',','])):
                    try:
                        df = pd.read_csv(path ,delimiter=delimiter, decimal=decimal, header=i,on_bad_lines='skip', dtype=np.float64)
                        return df
                    except Exception:
                        pass
    
class Hitachi(Handler):
    def handle(self, path: pathlib.Path) -> pd.DataFrame:
        return pd.read_csv(path ,delimiter='\t',header=27,on_bad_lines='skip')

class Avantes(Handler):
    def handle(self, path: pathlib.Path) -> pd.DataFrame:
        return pd.read_csv(path,delimiter=';',decimal=',',header=5,on_bad_lines='skip')

class DataHandler(Handler):
    def __init__(self, handler: Optional[Callable]=None):
        super().__init__()
        
        if handler is not None:
            self.update_handler(handler)
        
    def handle(self,  data: pd.DataFrame) -> pd.DataFrame:
        return data

class Composite(DataHandler):
    def __init__(self) -> None:
        self.operations: List[DataHandler] = []
        
    def handle(self,  data: pd.DataFrame) -> pd.DataFrame:
        for o in self.operations:
            data = o.handle( data )
        return data

    def add(self, *operations: DataHandler) -> None:
        for o in operations:
            self.operations.append(o)

    def remove(self,*operations: DataHandler) -> None:
        for o in operations:
            self.operations.remove(o)
            
class DataModel(ABC):
    def __init__(self, path: pathlib.Path, reader: Optional[FileHandler]=FileHandler(), pipeline: Optional[DataHandler]=Composite()) -> None:
        self._reader=reader
        self._pipeline=pipeline
        self.path=path
    @property
    def path(self):
        return self._path
    
    @path.setter
    def path(self, path: pathlib.Path):
        self._path = path
    @property
    def raw_data(self):
        try:
            return self._reader.handle(self.path)
        except ParserError as e:
            display(repr(e))
            return pd.DataFrame()
    @property
    def data(self):
        return self._pipeline.handle(self.raw_data)
    
        
dm = DataModel(path=Path('../patka-pomiary/CPE45_PFOBPy_comocat.txt', reader=Hitachi()))
display(dm.data[:5])
dm._pipeline.add(DataHandler(handler=lambda df: df+5))
dm._pipeline.add(DataHandler(handler=lambda df: df+5))
dm._pipeline.add(DataHandler(handler=lambda df: df+5))
display(dm.data[:5])

dm2 = DataModel(path=Path('../patka-pomiary/func39.txt', reader=Avantes()))
display(dm2.data)
dm2._pipeline.add(DataHandler(handler=lambda df: df+5))
display(dm2.data)



Unnamed: 0,nm,Abs
0,1100.0,0.042
1,1099.0,0.042
2,1098.0,0.0419
3,1097.0,0.0418
4,1096.0,0.0417


Unnamed: 0,nm,Abs
0,1115.0,15.042
1,1114.0,15.042
2,1113.0,15.0419
3,1112.0,15.0418
4,1111.0,15.0417


Unnamed: 0,[nm],[counts],[counts] .1,[counts] .2
900.12,15.000,15.000,15.000,15.00000
903.33,6722.667,6407.333,6617.000,330.33333
906.55,6609.467,6431.933,6564.600,192.53333
909.76,11183.933,10940.533,11117.867,258.40000
912.97,11065.667,10822.867,11013.467,257.80000
...,...,...,...,...
1692.65,10393.667,10305.600,10318.933,103.06667
1695.73,10648.000,10537.600,10562.733,125.40000
1698.80,9681.400,9564.800,9603.533,131.60000
1701.87,10627.000,10462.667,10482.667,179.33333


Unnamed: 0,[nm],[counts],[counts] .1,[counts] .2
900.12,20.000,20.000,20.000,20.00000
903.33,6727.667,6412.333,6622.000,335.33333
906.55,6614.467,6436.933,6569.600,197.53333
909.76,11188.933,10945.533,11122.867,263.40000
912.97,11070.667,10827.867,11018.467,262.80000
...,...,...,...,...
1692.65,10398.667,10310.600,10323.933,108.06667
1695.73,10653.000,10542.600,10567.733,130.40000
1698.80,9686.400,9569.800,9608.533,136.60000
1701.87,10632.000,10467.667,10487.667,184.33333


In [53]:
import re
import numpy as np
import itertools

def force_open(path):
    with open(path) as f:
        for i, row in enumerate(f):
            for delimiter, decimal in (itertools.product([';',',','\t'], ['.',','])):
                try:
                    df = pd.read_csv(path ,delimiter=delimiter, decimal=decimal, header=i,on_bad_lines='skip', dtype=np.float64)
                    return df
                except Exception:
                    pass
            

df1=force_open(Path('../patka-pomiary/CPE45_PFOBPy_comocat.txt'))
df2=force_open(Path('../patka-pomiary/func39.txt'))

display(df1,df2)


Unnamed: 0,nm,Abs
0,1100.0,0.0420
1,1099.0,0.0420
2,1098.0,0.0419
3,1097.0,0.0418
4,1096.0,0.0417
...,...,...
696,404.0,0.0988
697,403.0,0.1040
698,402.0,0.1092
699,401.0,0.1146


Unnamed: 0,[nm],[counts],[counts] .1,[counts] .2
900.12,0.000,0.000,0.000,0.00000
903.33,6707.667,6392.333,6602.000,315.33333
906.55,6594.467,6416.933,6549.600,177.53333
909.76,11168.933,10925.533,11102.867,243.40000
912.97,11050.667,10807.867,10998.467,242.80000
...,...,...,...,...
1692.65,10378.667,10290.600,10303.933,88.06667
1695.73,10633.000,10522.600,10547.733,110.40000
1698.80,9666.400,9549.800,9588.533,116.60000
1701.87,10612.000,10447.667,10467.667,164.33333


In [1]:
import pandas as pd

pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 5f648bf1706dd75a9ca0d29f26eadfbb595fe52b
python           : 3.9.7.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19042
machine          : AMD64
processor        : AMD64 Family 23 Model 24 Stepping 1, AuthenticAMD
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : English_Europe.1252

pandas           : 1.3.2
numpy            : 1.21.2
pytz             : 2021.1
dateutil         : 2.8.2
pip              : 22.0.3
setuptools       : 57.4.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.4
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.1
IPython          : 7.27.0
pandas_datareader: None
bs4              : 4.9.3
bottleneck       : None
fsspec          