In [1]:
import pandas as pd

Dataset filename .csv decoding

e.g. **Faults_Cleaning_Counters_100_D_20_1.csv**

Field | Description
------|-----------------
Faults_Cleaning_Counters | what is in the datset
100 |  number of machines Serial Numbers in the datset. 
D | time period of summarization D=Days W=Weeks
10 | depth of each row in periods
1 | time periods between two consecutive rows for the same Serial Number

Read the dataset

In [35]:
dataset_root = "Faults_Cleaning_Counters_100_D_20_1"

In [36]:
df = pd.concat([pd.read_csv(dataset_root+"_0000.csv",parse_dates = ['Target Timestamp']),
                pd.read_csv(dataset_root+"_0001.csv",parse_dates = ['Target Timestamp']),
                pd.read_csv(dataset_root+"_0002.csv",parse_dates = ['Target Timestamp']),
                pd.read_csv(dataset_root+"_0003.csv",parse_dates = ['Target Timestamp'])])

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62300 entries, 0 to 15574
Columns: 605 entries, Unnamed: 0 to TARGET
dtypes: datetime64[ns](1), float64(601), int64(2), object(1)
memory usage: 288.0+ MB


## Dataset rearranging

The dataset comes with the TARGET column being the number of faults in the target day. I can make the prediction more general (and more useful) builind a target as the occurrence of a fault in one of the last N days.

To keep the preprocessing general I get the information about the categories and the time period from the data itself

In [41]:
#List of root names of categories
#For each column name, get the ones with a '-' in the name, take the part of the names before the '-' and make it a set.
categories_roots = list(set([s.split('-')[0] for s in df.columns if '-' in s]))
print(categories_roots)

['numacqua', 'numcaffegenerale', 'ngr1', 'ngr4', 'numcaffegr3', 'nummac2gr1', 'numcaffegr1', 'portatagr1', 'tempogr4', 'numcaffegr2', 'ngr2', 'numcioccolato', 'tempogr2', 'numsolubile', 'ngr3', 'numcicligr1', 'numlattefr', 'tempogr1', 'numlattegr1', 'nummac1gr1', 'FAULT', 'portatagr2', 'CLEANING', 'numvapore', 'tempogr3', 'numvaporeariats', 'portatagr4', 'numcaffegr4', 'numvaporets', 'portatagr3']


In [42]:
#List of root names of categories
#For each column name, get the ones with a '-' in the name, take the part of the names After the '-' and make it a set.
time_period=list(set([s.split('-')[1][-1] for s in df.columns if '-' in s]))[0]
print(time_period)

D


Timestamp column is in pd.Timestamp type - transform it in a cardinal number starting from the first date of the dataset

In [49]:
df['Target Timestamp'].min()
df['Target day'] = (df['Target Timestamp']-df['Target Timestamp'].min()).apply(lambda d: d.days)

## Prediction width parameter

How many days to aggregate to build the TARGET

In [52]:
# Target of the prediction is the aggregation of N days
target_width = 5

In [53]:
columns_to_sum = ['FAULT-%d%s'%(n,time_period) for n in range(target_width-1,0,-1)]
columns_to_sum.append('TARGET')
columns_to_sum

['FAULT-4D', 'FAULT-3D', 'FAULT-2D', 'FAULT-1D', 'TARGET']

I have to drop the columns relevant to the period I want to predict - all type of columns, not just faults

In [54]:
from itertools import chain
columns_to_drop = [['%s-%d%s'%(root,n,time_period) for n in range(target_width-1,0,-1)] for root in categories_roots]
columns_to_drop = list(chain(*columns_to_drop))
#columns_to_drop


Build the new TARGET column

In [55]:

df['TARGET']=df[columns_to_sum].sum(axis=1)
# For now only boolean prediction
df['TARGET'] = (df['TARGET']>0).astype(int)


Check distribution of target values

In [56]:
df['TARGET'].value_counts()

0    40929
1    21371
Name: TARGET, dtype: int64

In [57]:
print(df.columns)

Index(['Unnamed: 0', 'Serial', 'Model', 'Target Timestamp', 'CLEANING-20D',
       'CLEANING-19D', 'CLEANING-18D', 'CLEANING-17D', 'CLEANING-16D',
       'CLEANING-15D',
       ...
       'tempogr4-8D', 'tempogr4-7D', 'tempogr4-6D', 'tempogr4-5D',
       'tempogr4-4D', 'tempogr4-3D', 'tempogr4-2D', 'tempogr4-1D', 'TARGET',
       'Target day'],
      dtype='object', length=606)


Remove unwanted columns either because part of the target days or because useless (Serial, Timestamp, Unnamed)

In [58]:
columns_to_drop.extend(['Unnamed: 0','Target Timestamp','Serial'])
df = df.drop(columns_to_drop,axis=1)

Make the "Model" column Categorical

In [60]:
df['Model'] = df['Model'].astype('category')

import Machine Learning stuff

In [61]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.ensemble import RandomForestClassifier

from datetime import datetime
import timeit

Light GBM classifier is used to allow eventually the uasge of Grid Search for params tuning

In [62]:
# Use lgbm_classifier to use Gridsearch
def lgbm_classifer(df,learning_rate = 0.1, num_leaves = 31,n_estimators=100,num_iterations=100,verbose=False):

    X = df.drop('TARGET',axis=1)
    y = df['TARGET'].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

    lgbmc = lgb.LGBMClassifier(num_leaves     = num_leaves,
                               n_estimators   = n_estimators,
                               learning_rate  = learning_rate,
                               num_iterations = num_iterations,
                               objective      = 'binary',
                               boosting_type  = 'gbdt' ,
                               is_unbalance   = True,
                               metric         = ['binary_logloss'])

    start=timeit.default_timer()
    lgbmc.fit(X_train,y_train,
                   #categorical_feature =[0],
                   feature_name = list(X.columns))
    stop=timeit.default_timer()
    
    prob_pred = lgbmc.predict(X_test)
    predictions = (prob_pred > 0.5).astype(int)
    
    cm = confusion_matrix(y_test,predictions)
    f1 = f1_score(y_test,predictions)
    if verbose:
        print("Fitting time: {} s".format(stop-start))
        print("Classification Report")
        print(classification_report(y_test,predictions))
        print("Confusion Matrix")
        print(cm)
        print('F1:',f1)

    feature_importance = pd.DataFrame({'f_name':list(X.columns),'importance':lgbmc.feature_importances_}).dropna()
    feature_importance = feature_importance[feature_importance['importance']>0].\
            sort_values('importance',ascending=False).set_index('f_name')['importance'][:30]

    return(f1,feature_importance,lgbmc)   
 

In [63]:
f1,ranks,classifier = lgbm_classifer(df.drop('Target day',axis=1),num_leaves  = 350,
                           n_estimators   = 100,
                           learning_rate  = 0.01,
                           num_iterations = 100,
                           verbose=True)

print("F1:",f1)
print(ranks)


Fitting time: 11.119897696969701 s
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     13397
           1       0.82      0.86      0.84      7162

   micro avg       0.89      0.89      0.89     20559
   macro avg       0.87      0.88      0.88     20559
weighted avg       0.89      0.89      0.89     20559

Confusion Matrix
[[12095  1302]
 [ 1025  6137]]
F1: 0.840627354290802
F1: 0.840627354290802
f_name
FAULT-5D                559
FAULT-7D                538
FAULT-6D                533
Model                   471
FAULT-11D               469
FAULT-17D               448
FAULT-20D               433
FAULT-18D               429
FAULT-10D               409
FAULT-8D                393
numcaffegenerale-5D     392
FAULT-9D                390
numcaffegenerale-8D     382
FAULT-16D               375
FAULT-15D               362
numacqua-15D            357
numcaffegenerale-19D    352
numcaffegr1-17D         352
numcaffegener