# d_summary
Algorithm that does some vary basic validation of the remote dataset. Which includes:
* Check header labels
* Count the number of rows
* Average of `int64` and `float64` columns
* Categories of `category` columns

Limits of the algorithm:
* The entire dataset needs at least 10 records.
* `category` columns needs at least 2 different values

## 1. Algorithm

In [39]:
import pandas
import json

### input.txt
The input.txt is mounted by the docker-container, and contains input to the algorithm.

The input for this algorithm include the method name that is called in the docker-container `summarize` and a `dict` containing column names and dtypes. The allowed types are: `object`, `int64`, `float64`, `bool`, `datetime64`, `category`

In [40]:
input_ = {
    "method":"summarize", 
    "columns":{
        "patient_id": 'Int64',
        "age": 'Int64',
        "weight": 'float64', 
        "stage": 'category',
        "cat": 'category',
        "hot_encoded": 'Int64'
    }
}

### database.csv
The database csv-file is mounted in the docker-container.

In [41]:
%pwd

'C:\\Users\\FMa1805.36838\\Repositories\\dSummary'

In [42]:
dataframe = pandas.read_csv("./local/database.csv", sep=";",decimal=",", dtype=input_.get("columns"))
dataframe

Unnamed: 0,patient_id,age,weight,stage,cat,hot_encoded
0,1,41,73.2,IV,Q,1.0
1,2,37,65.9,I,Q,
2,3,45,84.1,II,Q,0.0
3,4,47,83.1,II,Q,0.0
4,5,33,,,Q,1.0
5,6,34,,,Q,1.0


In [43]:
dataframe.dtypes

patient_id        Int64
age               Int64
weight          float64
stage          category
cat            category
hot_encoded       Int64
dtype: object

### algorithm.py

In [44]:
# retrieve column names from the dataset
columns_series = pandas.Series(data=input_.get("columns"))
column_names = list(columns_series)

# compare column names from dataset to the input column names
column_names_correct = column_names == list(input_.get("columns").keys())
print(f"column_names_correct={column_names_correct}")

column_names_correct=False


In [45]:
# count the number of rows in the dataset
number_of_rows = len(dataframe)
print(f"number_of_rows={number_of_rows}")

number_of_rows=6


In [46]:
# compute the avarage of the numeric columns
columns = {}
numeric_colums = columns_series.loc[columns_series.isin(['Int64','float64'])]

for column_name in numeric_colums.keys():
    column_values = dataframe[column_name]
    q1, median, q3 = column_values.quantile([0.25,0.5,0.75]).values
    mean = column_values.mean()
    minimum = column_values.min()
    maximum = column_values.max()
    nan = column_values.isna().sum()
    columns[column_name] = {
        "min": int(minimum),
        "q1": int(q1),
        "median": int(median),
        "mean": int(mean),
        "q3": int(q3),
        "max": int(maximum),
        "nan": int(nan)
    }
pandas.DataFrame.from_dict(columns,orient='index')

Unnamed: 0,min,q1,median,mean,q3,max,nan
age,33,34,39,39,44,47,0
hot_encoded,0,0,1,0,1,1,1
patient_id,1,2,3,3,4,6,0
weight,65,71,78,76,83,84,2


In [47]:
dataframe["stage"].cat.categories

Index(['I', 'II', 'IV'], dtype='object')

In [48]:
dataframe["stage"].value_counts().to_dict()

{'II': 2, 'IV': 1, 'I': 1}

In [49]:
# return the categories in categorial columns
categoral_colums = columns_series.loc[columns_series.isin(['category'])]

for column_name in categoral_colums.keys():
    
    columns[column_name] = dataframe[column_name].value_counts().to_dict()
    
    display(pandas.Series(columns[column_name]))

II    2
IV    1
I     1
dtype: int64

Q    6
dtype: int64

In [50]:
output = {
    "column_names_correct": column_names_correct,
    "number_of_rows": number_of_rows,
    "statistics": columns
}
output

{'column_names_correct': False,
 'number_of_rows': 6,
 'statistics': {'patient_id': {'min': 1,
   'q1': 2,
   'median': 3,
   'mean': 3,
   'q3': 4,
   'max': 6,
   'nan': 0},
  'age': {'min': 33,
   'q1': 34,
   'median': 39,
   'mean': 39,
   'q3': 44,
   'max': 47,
   'nan': 0},
  'weight': {'min': 65,
   'q1': 71,
   'median': 78,
   'mean': 76,
   'q3': 83,
   'max': 84,
   'nan': 2},
  'hot_encoded': {'min': 0,
   'q1': 0,
   'median': 1,
   'mean': 0,
   'q3': 1,
   'max': 1,
   'nan': 1},
  'stage': {'II': 2, 'IV': 1, 'I': 1},
  'cat': {'Q': 6}}}

### output.txt

In [51]:
with open("./local/output.txt", "w") as fp:
    json.dump(output,fp)