# Setting up a proof-of-concept (POC) machine learning model from scratch

### Task

Create a machine learning model capable of predicting the validity of a meter reading. The model will be used to take over the decision-making that is currently performed manually by a user. 

In [1]:
import pandas
import numpy as np
import datetime

# Disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pandas.options.mode.chained_assignment = None

## Solution

### 1. Conception

- Answer questions:
    - What does the user's current validity-check process look like?
    - Which **data** does the user use to make the decision?
    - Where do I find this **data**?

### 2. Acquire Data

- Get access to DBs providing **data**
- Write SQLs to access **data**

In [2]:
# Only a single CSV file is read here; in reality this would be a set of DB resources.
# The first CSV column becomes the index and the rows are ordered by "readAt".
data = (
    pandas.read_csv("./data/readings.csv", index_col=0)
    .sort_values(by="readAt")
)

In [3]:
data

Unnamed: 0,contractId,valid,validityChangedAt,readAt,value,priority,qualifier,origin,createdAt,reason,param,code,counter
6164,4443,0,2020-01-07 13:38:34,2018-09-25 00:00:00,12496.0,2,read,customer,2018-09-25 18:41:11,,,1-1:1.8.0,7796478
9487,9857,0,2018-10-18 11:50:06,2018-10-01 00:00:00,36.0,3,estimated,vnb,2018-10-18 11:50:06,COS,SMV,7-20:3.0.0,89913
9359,5905,1,2018-10-08 06:30:07,2018-10-01 00:00:00,22925.0,1,read,vnb,2018-10-08 06:30:07,COS,SMV,7-20:3.0.0,3322005
9354,393,0,2018-11-14 16:25:32,2018-10-01 00:00:00,32376.0,3,estimated,vnb,2018-11-14 16:25:32,COS,SMV,1-1:1.8.0,470000340043
9353,393,0,2018-11-13 06:05:24,2018-10-01 00:00:00,32376.0,3,estimated,vnb,2018-11-13 06:05:24,COS,SMV,1-1:1.8.0,470000340043
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,3918,0,2020-03-27 09:11:56,2020-03-26 23:59:59,12057.6,1,read,msb,2020-03-27 09:11:56,PMR,MRV,1-1:1.8.0,1APADA90917567
13637,4159,1,2020-03-27 09:07:24,2020-03-26 23:59:59,2561.0,1,read,msb,2020-03-27 09:07:24,PMR,MRV,1-0:1.8.0,1LOG0065083099
9019,6111,1,2020-03-27 09:24:57,2020-03-27 23:59:59,44501.0,2,read,customer,2020-03-27 09:24:57,,,1-1:1.8.0,4100186942
11165,3227,0,2020-03-28 00:10:37,2020-03-27 23:59:59,33260.0,1,read,vnb,2020-03-28 00:10:37,PMR,MRV,7-20:3.0.0,889902


### 3. Analyze data

- What is the meaning of individual columns?
- Which columns are suitable for decision-making? (e.g. columns with too many NA values are not)
- Is **data** assumed to be sufficient? If not, start over with **1. Conception**

In [None]:
# E.g. count the valid (1) and invalid (0) readings to see how the two classes
# are distributed.
# [TASK]: Check number of valid / invalid entries in data
...

### 4. Data preprocessing

#### 4.1 Data Aggregation Strategy

How to group **data** belonging together?

In [None]:
# Readings sharing contract, code and counter belong to the same group.
# [TASK]: Define grouper list based on items belonging together
grouper = [...]
# Every column that is not a grouping key is kept as payload of a group.
select = [name for name in data if name not in grouper]

In [None]:
# One sub-frame per group, restricted to the non-grouping columns.
aggregated = [frame[select] for _, frame in data.groupby(grouper)]

In [None]:
aggregated[102]

#### 4.2 Structure and clean data

- Structure **data** so you have a clear view how to clean it
- Clean data: Remove insufficiencies

In [None]:
# Split every group into its history (past readings) and the single reading
# whose validity is to be assessed
past = []
assess = []
for x in aggregated:
    
    # Last row of the group: the reading to be assessed
    assess_ = x.iloc[-1]
    
    # All remaining (n-1) rows of the group form its history
    past_ = x.iloc[0:-1]
    
    # [TASK] : Mask data not available @ assess_["createdAt"]
    # Unfortunately, some values have to be dropped because they
    # were not yet available at decision-making time
    ...
    
    # -> Problem with DB updates! ...
    
    past.append(past_)
    assess.append(assess_)
    
# Inspect the history of one example group
past[102]

In [None]:
# Build one raw feature list per group plus its target label.
# Feature layout: [assessed reading without "valid"] + [past readings, newest first].
X = []  # Features for predicting
y = []  # Targets: "valid" flag of the assessed reading

# Rebind instead of calling select.remove("valid"): remove() mutates the list in
# place and raises ValueError as soon as this cell is executed a second time.
select = [column for column in select if column != "valid"]
for past_, assess_ in zip(past, assess):
    # past_ was sliced before "valid" was dropped from select, so the validity of
    # earlier readings is part of the features.
    X.append(assess_[select].tolist() + past_[::-1].values.flatten().tolist())
    y.append(assess_["valid"])

In [None]:
# Target width of the feature matrix. The assessed reading contributes
# len(select) values; each of 3 past readings contributes len(select) + 1 values,
# because past rows still contain the "valid" column that was removed from select.
# Groups hold differing numbers of past readings, hence the padding step below.
n_features = 3 * (len(select) + 1) + len(select)
print(n_features)

#### 4.3 Create training data

- Decompose data into feature matrix X and target vector y

In [None]:
# The feature matrix needs to be 2D. Since the number of past readings varies per
# group, rows that are too long must be truncated and rows that are too short
# must be padded (with NA) so that every row ends up with n_features entries.
Xout = []
for Xi in X:
    
    # Current length of this feature row (input for the task below)
    n = len(Xi)

    # [TASK] : Modify elements in Xi such that list have n_features elements
    ...
    
    Xout.append(Xi)

# Feature matrix: Features characterizing the past reading history
X = pandas.DataFrame(Xout) 
# Target vector: Binary vector (1 -> valid, 0 -> invalid)
y = np.array(y)

### 5. Preparation for machine learning: Normalization

- Conversion: Features must be floats. Think of how to convert
    - dates
    - strings

In [None]:
# Use scikit-learn: Library containing a great number of ML utilities
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Criterion for filtering float-like columns from x
def is_float_convertable(x):
    """Return True if every element of *x* can be cast to float.

    Parameters
    ----------
    x : pandas.Series (or any object exposing ``astype``)
        Column to probe.

    Returns
    -------
    bool
        True when ``x.astype(float)`` succeeds, False when the cast is rejected.
    """
    try:
        x.astype(float)
    except (ValueError, TypeError):
        # ValueError: non-numeric strings. TypeError: objects that have no float
        # conversion at all (e.g. timestamps). Both mean "not a numerical column".
        return False

    return True

# Criterion for filtering date-like columns from x
def is_datelike(x):
    """Return True if *x* can be parsed as dates and is not float-convertible.

    Parameters
    ----------
    x : pandas.Series
        Column to probe.

    Returns
    -------
    bool
        True when ``pandas.to_datetime(x)`` succeeds and ``is_float_convertable(x)``
        is False; False otherwise, including when either check raises.
    """
    try:
        pandas.to_datetime(x)
        # Columns that also cast to float are treated as numerical, not as dates.
        return not is_float_convertable(x)
    except Exception:
        # Any parsing/conversion failure means "not date-like". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
        return False

In [None]:
# Obviously, we have multi-type data available. All types have to be converted into float.
# For converting categorical data, there are special encoding methods available.

# Decompose data by type
numerical = [column for column in X if is_float_convertable(X[column])]
dates = [column for column in X if is_datelike(X[column])]
strings = [column for column in X if column not in numerical + dates]

# [TASK] Convert dates to float: Total seconds since millennium
null_date = datetime.datetime(2000, 1, 1)
...

# Convert str columns: One-Hot-Encoding
Xstr = X[strings].fillna("nan")
# Densify with .toarray() instead of OneHotEncoder(sparse=False): the `sparse`
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed later,
# whereas the default (sparse) output plus .toarray() works on every version.
Xstr = pandas.DataFrame(OneHotEncoder().fit_transform(Xstr).toarray())
X.drop(columns=strings, inplace=True)
X.columns = range(len(X.columns))
# Append the encoded columns behind the remaining ones with consecutive integer
# labels. X.shape[1] equals max(X.columns) + 1 after the relabeling above and
# also works when no non-string column is left.
string_columns = np.arange(X.shape[1], X.shape[1] + Xstr.shape[1])
X[string_columns] = Xstr

- na-fill strategy: Imputation

In [None]:
# Fill the remaining NaNs column-wise; SimpleImputer() defaults to the column mean.
X = SimpleImputer().fit_transform(X)

- Scaling: Normalize **data** features such that each has a similar impact, e.g. (-1, 1) normalization of features

In [None]:
# RobustScaler (library defaults) centers each feature on its median and scales it
# by its interquartile range, which limits the influence of outliers.
# NOTE: unlike a min-max scaling, the result is not bounded to (-1, 1).
X = RobustScaler().fit_transform(X)

### 6. Model Training

- Select suitable algorithm
- Test if training technically works

In [None]:
# Use scikit-learn: Library containing a great number of ML utilities
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

In [None]:
# Init classifier
tree = DecisionTreeClassifier()

# Data splitting: Given subset of X, train to be able to predict associated subset y
# [TASK] : Train using the first 2300 data points ...
# NOTE: as long as the placeholders are not replaced, X[...] / y[...] select ALL
# rows, i.e. nothing is held out for testing.
tree.fit(X[...], y[...])

In [None]:
# Classifier prediction for data NOT used for training
# [TASK] ... and predict target for the rest
pred = tree.predict(X[...])
# Ground-truth labels for the same rows, used to score the prediction
true = y[...]

In [None]:
pred

In [None]:
true

- Measure quality of model: Precision and Confusion matrix

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped this call
# would report recall instead of precision.
precision_score(true, pred)

In [None]:
# Arguments are (y_true, y_pred): rows = true class, columns = predicted class,
# both ordered as [1 (valid), 0 (invalid)].
confusion_matrix(true, pred, labels=[1, 0])

### 7. Systematic optimization

- Algorithms have parameters to be chosen by the user: Apply optimization
- Split data systematically among different configurations and select the "best" model (requires definition of metric) 

In [None]:
# Parameters taken by DecisionTree classifier
# NOTE: "auto" is intentionally not listed for max_features. For
# DecisionTreeClassifier it was an alias of "sqrt", deprecated in scikit-learn 1.1
# and removed in 1.3, so dropping it keeps the search space and avoids the error.
params = {"max_depth" : [None, 5, 10, 20, 50],
          "min_samples_split" : [2, 5, 10],
          "max_features" : ["sqrt", "log2"]}

# Create data split strategy: 5 shuffled folds with a fixed seed
cv = KFold(5, random_state=42, shuffle=True)

# Init grid search for optimum parameters; models are ranked by precision
grd = GridSearchCV(tree, params, cv=cv, scoring="precision")

# Train on all possible combinations of parameters
grd.fit(X, y)

In [None]:
# Best classifier: estimator with the best parameter combination found by the
# grid search (refit on the full data by GridSearchCV's default refit=True)
clf = grd.best_estimator_

# Mean cross-validated precision of that best parameter combination
grd.best_score_

In [None]:
# Compute total score: accumulate one confusion matrix per cross-validation fold
confusion = []
for train, test in cv.split(X): # Provides arrays of indices
    
    # [TASK] : "fit clf" using train and "pred" using test indices
    clf.fit(X[...], y[...])
    pred = clf.predict(X[...])
    # confusion_matrix expects (y_true, y_pred): rows = true class,
    # columns = predicted class, both ordered as [1, 0]
    confusion.append(confusion_matrix(y[...], pred, labels=[1, 0]))
    
# Sum the per-fold matrices element-wise into one overall confusion matrix
confusion = np.array(confusion).sum(axis=0)
confusion

### 8. Bring to application

- Transformer implementation: Wrap the custom preprocessing in a transformer class
- Transformer chain: Chain all processing and classification items together

In [None]:
# Transformer object: Steps 4. and 5. in one class 
from utils import CustomPreprocessing

In [None]:
# Transformer chain: custom preprocessing -> imputation -> scaling -> classifier.
# The final step is a fresh DecisionTreeClassifier configured with the
# parameters of the best estimator found by the grid search.
chain = Pipeline([("custom", CustomPreprocessing()),
                  ("fillna", SimpleImputer()),
                  ("scale", RobustScaler()),
                  ("clf", DecisionTreeClassifier(**clf.get_params()))])

In [None]:
# Train on complete data set
# NOTE(review): the raw frame `data` is passed together with the per-group target
# `y`; presumably CustomPreprocessing reproduces the grouping of steps 4 and 5 so
# that both line up. Confirm against utils.CustomPreprocessing.
chain.fit(data, y)

In [None]:
# Load the application data set (two counters belonging to a single contract).
file = 1 # Choose from 1 or 2
# Same treatment as the training data: order by "readAt" and keep exactly the
# columns of `data`, in the same order.
appl_data = (
    pandas.read_csv(f"./data/readings_application_{file}.csv", index_col=0)
    .sort_values(by="readAt")
    [data.columns]
)

In [None]:
appl_data

In [None]:
# Predict validity of readings with valid == nan
# [TASK] : Apply predict on appl_data
# NOTE: the literal `...` is only a placeholder for the input to predict on.
pred = chain.predict(...)

pred