In [2]:
# Connect to the database by executing the shared setup notebook in this kernel.
# NOTE(review): presumably that notebook also registers the %%execsql / %%showsql
# cell magics used by the cells below — confirm in 00_database_connectivity_setup.ipynb.
%run '00_database_connectivity_setup.ipynb'





In [6]:
%%execsql
/* Clear any earlier copy of the summary table so the summary cell below can
   write to nate.bic_azpp_o0200_sum again when the notebook is re-run. */
DROP TABLE IF EXISTS nate.bic_azpp_o0200_sum

First, we generate a summary of the table using the MADlib library, which is available to us through HAWQ, our primary database for data science purposes.

In [7]:
%%showsql
/* Profile sap_bi.bic_azpp_o0200_int with MADlib; the per-column statistics
   are written to nate.bic_azpp_o0200_sum. */
SELECT madlib.summary('sap_bi.bic_azpp_o0200_int', 'nate.bic_azpp_o0200_sum');

Unnamed: 0,summary
0,"(nate.bic_azpp_o0200_sum,229,339.746952057)"


In [3]:
%%showsql
/* Row count of the source table — this is a large table. */
SELECT count(*) AS numberofrows FROM sap_bi.bic_azpp_o0200_int;

Unnamed: 0,numberofrows
0,646665


This cell uses the output of the MADlib summary to filter out columns that are poorly populated or unlikely to be useful. 
For each remaining column it reports the data type, min, max, and most frequent values (for text columns, min and max appear to be string lengths rather than values).

In [16]:
%%showsql
/* Keep only columns that are not almost entirely blank, that take more than
   one distinct value, and whose name does not start with 'bic'. */
SELECT
    target_column,
    data_type,
    distinct_values,
    missing_values,
    blank_values,
    min,
    max,
    most_frequent_values
FROM nate.bic_azpp_o0200_sum
WHERE fraction_blank < 0.999
  AND distinct_values > 1
  AND target_column NOT LIKE 'bic%'


Unnamed: 0,target_column,data_type,distinct_values,missing_values,blank_values,min,max,most_frequent_values
0,plant,text,43,0,0,4.0,4.0,"[EWHG, EWHG, EWHG, EWHG, 4024, 4024, 4024, 5100, KOS1, KOS1]"
1,workcenter,text,1501,0,0,1.0,8.0,"[SE03, SE03, 0000, 0000, 0000, 1600-PA, FINLINSP, FINLINSP, 1355-PA, 1355-PA]"
2,pp_cap_ctg,text,3,0,0,3.0,3.0,"[002, 002, 002, 002, 001, 001, 001, 001, 008, 008]"
3,coorder,text,22120,0,284340,0.0,12.0,"[, , K20052900125, K20052900125, K20043900110, K10007900130, 000301430420, K20052900130, K10007900125, 000301333215]"
4,priority,text,9,0,509896,0.0,1.0,"[, , , , Q, Q, Q, 2, 2, 2]"
5,pp_capid,text,1872,0,0,8.0,8.0,"[10003398, 10003398, 10000639, 10000639, 10000639, 10000196, 10001125, 10001125, 10001125, 10000231]"
6,capa_unit,text,4,0,0,1.0,3.0,"[STD, STD, STD, STD, H, H, H, H, 10, 10]"
7,pur_aufpl,text,21413,0,0,10.0,10.0,"[0000000000, 0000000000, 0001175931, 0001175931, 0001007088, 0000938564, 0001175932, 0001505749, 0001403911, 0000938562]"
8,pur_aplzl,text,316,0,0,8.0,8.0,"[00000000, 00000000, 00000004, 00000004, 00000004, 00000004, 00000005, 00000005, 00000005, 00000003]"
9,me_order,text,25533,0,362325,0.0,10.0,"[, , , 0153860445, 0153420845, 0153931645, 0153931773, 0153222655, 0153016986, 0153864160]"


In [None]:
%%execsql

/*
 * my_to_date(raw_value text, date_format text) RETURNS date
 *
 * Converts a text date to a DATE using the given to_date() format, returning
 * NULL instead of a date when the input carries no real value:
 *   - NULL input
 *   - empty or whitespace-only input
 *   - input whose leading digits evaluate to 0 (e.g. '00000000')
 *
 * Any other input is passed to to_date(raw_value, date_format) unchanged, so
 * malformed non-blank strings still surface whatever error to_date/to_number
 * raise.
 */
CREATE OR REPLACE FUNCTION my_to_date(text, text) RETURNS date AS $$
DECLARE
    raw_value   ALIAS FOR $1;
    date_format ALIAS FOR $2;
BEGIN
    /* Guard first: blank strings are not valid numeric input, so they must
       never reach to_number() below. */
    IF raw_value IS NULL OR btrim(raw_value) = '' THEN
        RETURN NULL;
    END IF;

    /* All-zero placeholder dates are treated as "no date". */
    IF to_number(raw_value, '99999999') = 0 THEN
        RETURN NULL;
    END IF;

    RETURN to_date(raw_value, date_format);
END;

$$ LANGUAGE plpgsql

The query below includes the columns that have good data.

In [4]:
%%showsql

/* Preview 100 rows of the well-populated columns. Text date columns are
   converted to DATE with my_to_date(..., 'YYYYMMDD').
   There is no ORDER BY, so which 100 rows are returned is not guaranteed. */
SELECT
    plant,
    my_to_date(finishdate, 'YYYYMMDD') AS finishdate,
    workcenter,
    coorder,
    priority,
    bic_zcanum,
    bic_zharbid,
    bic_ztypkz,
    pp_capid,
    bic_kruesoll,
    my_to_date(basicstart, 'YYYYMMDD') AS basicstart,
    my_to_date(schedreldt, 'YYYYMMDD') AS schedreldt,
    my_to_date(schedfindt, 'YYYYMMDD') AS schedfindt,
    my_to_date(schedstart, 'YYYYMMDD') AS schedstart,
    me_order,
    plan_type,
    s_ord_item,
    operation_text,
    oper_qty,
    oper_unit,
    duration,
    dur_unit,
    conf_scrap,
    confrm_qty,
    comp_code,
    equipment,
    funct_loc,
    objnr,
    my_to_date(actstartdt, 'YYYYMMDD') AS actstartdt,
    quantity,
    unit,
    oi_matnrr,
    tlist_use,
    plgrp,
    mrp_contrl,
    coord_type,
    ordcateg,
    my_to_date(cpr_relon, 'YYYYMMDD') AS cpr_relon,
    costcenter,
    p_plant,
    ord_typ,
    sales_unit,
    my_to_date(expl_dat, 'YYYYMMDD') AS expl_dat,
    opr_wrkctr,
    material,
    order_quan,
    po_unit,
    wbs_elemt
FROM sap_bi.bic_azpp_o0200_int
LIMIT 100;

Unnamed: 0,plant,finishdate,workcenter,coorder,priority,bic_zcanum,bic_zharbid,bic_ztypkz,pp_capid,bic_kruesoll,basicstart,schedreldt,schedfindt,schedstart,me_order,plan_type,s_ord_item,operation_text,oper_qty,oper_unit,duration,dur_unit,conf_scrap,confrm_qty,comp_code,equipment,funct_loc,objnr,actstartdt,quantity,unit,oi_matnrr,tlist_use,plgrp,mrp_contrl,coord_type,ordcateg,cpr_relon,costcenter,p_plant,ord_typ,sales_unit,expl_dat,opr_wrkctr,material,order_quan,po_unit,wbs_elemt
0,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955560.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-07-28,3090,P4000075177,1.0,EA,
1,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955560.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-07-28,3090,P4000075177,1.0,EA,
2,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955560.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-07-28,3090,P4000075177,1.0,EA,
3,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1100,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
4,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1100,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
5,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1100,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
6,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
7,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
8,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
9,SFBN,,3090,,,512,10003859,2,10003292,0.5,,,,,152955561.0,N,0,1800,1.0,EA,0.0,,0.0,0.0,GSEA,,,OV000160091400000010,,0.0,,,,,421,PR,0,,,SFBN,PR,EA,2016-06-24,3090,P4000075177,1.0,EA,
