In [16]:
%run '00_database_connectivity_setup.ipynb'

# Strategy
## 1. Collect features:
    1.1 Service Orders SAPSR3./BIC/AZCS_O0100 (not working now)
    1.2 Equipment sap_bi.bic_azequip00_int
    1.3 Material smithd2.material_features
    1.4 BOM smithd2.material_bom_exploded_enterprise
## 2. Cluster relevant materials
    2.1 Create table of materials on service orders
    2.2 Create distance metric
    2.3 Cluster
## 3. Create TARGET: component pctReplaced feature (Qty replaced / Qty serviced)
    3.1 Get Qty of each top level material serviced (with something replaced)
    3.2 Get Qty of each component exposed to service
    3.3 Get Qty of each component replaced
    3.4 Generate Metric
## 4. Create master table of features and Target

## 2. Cluster relevant materials
### 2.1 Create table of materials to be clustered

In [3]:
%%execsql
--get list of materials to be clustered
DROP TABLE if EXISTS nate.aro_components_HOU;
CREATE TABLE nate.aro_components_HOU AS 
SELECT distinct b.material
    from sap.service_order_int a
    inner join data_science.ordercomponents_int b
    on a.order_txt=b.ordr
    where plant in ('EWHG','HCS0', 'HCS1')
DISTRIBUTED BY(material);

In [4]:
%%showsql
SELECT COUNT(*) from nate.aro_components_HOU;

Unnamed: 0,count
0,9756


In [5]:
%%execsql
DROP TABLE if EXISTS nate.aro_materials_HOU;
CREATE TABLE nate.aro_materials_HOU AS 
 SELECT DISTINCT a.material 
    from sap.service_order_int a
    inner join data_science.ordercomponents_int b
    on a.order_txt=b.ordr   
    where plant in ('EWHG','HCS0', 'HCS1')
DISTRIBUTED BY (material);

In [6]:
%%showsql

select count(*) from nate.aro_materials_HOU;

Unnamed: 0,count
0,2370


### 2.2 Create Distance metric

In [7]:
%%execsql
--create distance metrics
DROP TABLE IF EXISTS nate.material_features_distance_components_HOU;
set statement_timeout = 0; 
CREATE TABLE nate.material_features_distance_components_HOU as
(    
    SELECT a.material as material, b.material as similar_material,     
     CASE WHEN a.material=b.material then 0
    --this case whenspeeds it up by limiting heavy math to at least one key field match
    WHEN
       ( a.prod_line = b.prod_line or a.model=b.model or  a.commodity_code=b.commodity_code
            or a.prod_family=b.prod_family or a.material=b.based_on or a.based_on=a.material or a.desc1=b.desc1) 
     THEN            
       ( case when a.weight>0.1 and b.weight>0.1 then 0.3 * (abs(a.weight-b.weight)/GREATEST(a.weight,b.weight)) else 0.1 end +    
        CASE WHEN a.model=b.model then 0 else 0.3 end +    
        case when a.material=b.based_on or a.based_on=a.material then 0 else 0.3 end +        
        (1.0 - smithd2.jaccard_index(a.documents, b.documents)) * .3 +   
        (1.0 - smithd2.jaccard_index(a.components, b.components)) * .4 + 
        case when char_length(a.desc1)>0 and char_length(b.desc1)>0 
            then 0.3 * pdltools.demerau_levenshtein_distance( a.desc1, b.desc1 )
                 / GREATEST(char_length(a.desc1),char_length(b.desc1))  
            else 0 end +    
        case when char_length(a.desc2)>0 and char_length(b.desc2)>0 
            then 0.2 * pdltools.demerau_levenshtein_distance( a.desc2, b.desc2 )
                 / GREATEST(char_length(a.desc2),char_length(b.desc2))  
            else 0 end + 
        case when char_length(a.desc3)>0 and char_length(b.desc3)>0 
            then 0.2 * pdltools.demerau_levenshtein_distance( a.desc3, b.desc3 )
                 / GREATEST(char_length(a.desc3),char_length(b.desc3))  
           else 0 end +
        case when char_length(a.desc4)>0 and char_length(b.desc4)>0 
           then 0.2 * pdltools.demerau_levenshtein_distance( a.desc4, b.desc4 )
                 / GREATEST(char_length(a.desc4),char_length(b.desc4))  
            else 0 end + 
        case when a.uom=b.uom then 0 else 0.3 end +
        case when a.prod_family=b.prod_family then 0 else 0.1 end +
        case when a.prod_line=b.prod_line then 0 else 0.1 end +
        (1.0 - smithd2.jaccard_index(a.matlspecs, b.matlspecs)) * .2 +
        (1.0 - smithd2.jaccard_index(a.weldspecs, b.weldspecs)) * .2 +
        case when a.serialization=b.serialization 
            or a.serialization is null and b.serialization is null then 0 
            when a.serialization is not null and b.serialization is not null         
            then 0.05 else 0.2 end +     
        (1.0 - smithd2.jaccard_index(a.qspecs, b.qspecs)) * .2 +
        (1.0 - smithd2.jaccard_index(a.qmsects, b.qmsects)) * .2 +       
        (1.0 - smithd2.jaccard_index(a.coatingspecs, b.coatingspecs)) * .2 +           
        case when a.material_type=b.material_type then 0 else 0.2 end +    
        case when a.commodity_code=b.commodity_code then 0 else 0.1 end) / 4.2
      ELSE 1.0 END
        as score
    FROM smithd2.material_features a
    INNER JOIN nate.aro_components_HOU c
    ON a.material = c.material
    CROSS JOIN smithd2.material_features b
    INNER JOIN  nate.aro_components_HOU d
    ON b.material = d.material
    where 
        a.material>=b.material  
)
DISTRIBUTED BY (material);

In [9]:
%%execsql
--create distance metrics
DROP TABLE IF EXISTS nate.material_features_distance_materials_HOU;
set statement_timeout = 0; 
CREATE TABLE nate.material_features_distance_materials_HOU as
(    
    SELECT a.material as material, b.material as similar_material,     
     CASE WHEN a.material=b.material then 0
    --this case whenspeeds it up by limiting heavy math to at least one key field match
    WHEN
       ( a.prod_line = b.prod_line or a.model=b.model or  a.commodity_code=b.commodity_code
            or a.prod_family=b.prod_family or a.material=b.based_on or a.based_on=a.material or a.desc1=b.desc1) 
     THEN            
       ( case when a.weight>0.1 and b.weight>0.1 then 0.3 * (abs(a.weight-b.weight)/GREATEST(a.weight,b.weight)) else 0.1 end +    
        CASE WHEN a.model=b.model then 0 else 0.3 end +    
        case when a.material=b.based_on or a.based_on=a.material then 0 else 0.3 end +        
        (1.0 - smithd2.jaccard_index(a.documents, b.documents)) * .3 +   
        (1.0 - smithd2.jaccard_index(a.components, b.components)) * .4 + 
        case when char_length(a.desc1)>0 and char_length(b.desc1)>0 
            then 0.3 * pdltools.demerau_levenshtein_distance( a.desc1, b.desc1 )
                 / GREATEST(char_length(a.desc1),char_length(b.desc1))  
            else 0 end +    
        case when char_length(a.desc2)>0 and char_length(b.desc2)>0 
            then 0.2 * pdltools.demerau_levenshtein_distance( a.desc2, b.desc2 )
                 / GREATEST(char_length(a.desc2),char_length(b.desc2))  
            else 0 end + 
        case when char_length(a.desc3)>0 and char_length(b.desc3)>0 
            then 0.2 * pdltools.demerau_levenshtein_distance( a.desc3, b.desc3 )
                 / GREATEST(char_length(a.desc3),char_length(b.desc3))  
           else 0 end +
        case when char_length(a.desc4)>0 and char_length(b.desc4)>0 
           then 0.2 * pdltools.demerau_levenshtein_distance( a.desc4, b.desc4 )
                 / GREATEST(char_length(a.desc4),char_length(b.desc4))  
            else 0 end + 
        case when a.uom=b.uom then 0 else 0.3 end +
        case when a.prod_family=b.prod_family then 0 else 0.1 end +
        case when a.prod_line=b.prod_line then 0 else 0.1 end +
        (1.0 - smithd2.jaccard_index(a.matlspecs, b.matlspecs)) * .2 +
        (1.0 - smithd2.jaccard_index(a.weldspecs, b.weldspecs)) * .2 +
        case when a.serialization=b.serialization 
            or a.serialization is null and b.serialization is null then 0 
            when a.serialization is not null and b.serialization is not null         
            then 0.05 else 0.2 end +     
        (1.0 - smithd2.jaccard_index(a.qspecs, b.qspecs)) * .2 +
        (1.0 - smithd2.jaccard_index(a.qmsects, b.qmsects)) * .2 +       
        (1.0 - smithd2.jaccard_index(a.coatingspecs, b.coatingspecs)) * .2 +           
        case when a.material_type=b.material_type then 0 else 0.2 end +    
        case when a.commodity_code=b.commodity_code then 0 else 0.1 end) / 4.2
      ELSE 1.0 END
        as score
    FROM smithd2.material_features a
    INNER JOIN nate.aro_materials_HOU c
    ON a.material = c.material
    CROSS JOIN smithd2.material_features b
    INNER JOIN  nate.aro_materials_HOU d
    ON b.material = d.material
    where 
        a.material>=b.material  
)
DISTRIBUTED BY (material);

In [8]:
%%showsql

select count(*) from nate.material_features_distance_components_HOU;

Unnamed: 0,count
0,29510403


In [10]:
%%showsql

select count(*) from nate.material_features_distance_materials_HOU;

Unnamed: 0,count
0,1871145


### 2.3 Create Clusters

In [12]:
# actually large, ran with 0.75
sql = """
drop table if exists nate.hclust_components75_HOU;
create table nate.hclust_components75_HOU
as
(
    select (result).material, (result).cluster_number
    from
    (
        select nate.run_hclust(key, 0.75) as result 
        from 
        (
            select
                nate.stack_rows( ARRAY['material', 'similar_material', 'score'],
                    material, 
                    similar_material, 
                    score) as key
            from nate.material_features_distance_components_HOU
            where  material >= similar_material              
        ) q1
    ) q2
) distributed by (material);
"""
    
psql.execute(sql, conn)
conn.commit()

In [13]:
# actually large, ran with 0.75
sql = """
drop table if exists nate.hclust_components85_HOU;
create table nate.hclust_components85_HOU
as
(
    select (result).material, (result).cluster_number
    from
    (
        select nate.run_hclust(key, 0.85) as result 
        from 
        (
            select
                nate.stack_rows( ARRAY['material', 'similar_material', 'score'],
                    material, 
                    similar_material, 
                    score) as key
            from nate.material_features_distance_components_HOU
            where  material >= similar_material              
        ) q1
    ) q2
) distributed by (material);
"""
    
psql.execute(sql, conn)
conn.commit()

In [14]:
# actually large, ran with 0.75
sql = """
drop table if exists nate.hclust_materials75_HOU;
create table nate.hclust_materials75_HOU
as
(
    select (result).material, (result).cluster_number
    from
    (
        select nate.run_hclust(key, 0.75) as result 
        from 
        (
            select
                nate.stack_rows( ARRAY['material', 'similar_material', 'score'],
                    material, 
                    similar_material, 
                    score) as key
            from nate.material_features_distance_materials_HOU
            where  material >= similar_material              
        ) q1
    ) q2
) distributed by (material);
"""
    
psql.execute(sql, conn)
conn.commit()

In [15]:
# actually large, ran with 0.85
sql = """
drop table if exists nate.hclust_materials85_HOU;
create table nate.hclust_materials85_HOu
as
(
    select (result).material, (result).cluster_number
    from
    (
        select nate.run_hclust(key, 0.85) as result 
        from 
        (
            select
                nate.stack_rows( ARRAY['material', 'similar_material', 'score'],
                    material, 
                    similar_material, 
                    score) as key
            from nate.material_features_distance_materials_HOU
            where  material >= similar_material              
        ) q1
    ) q2
) distributed by (material);
"""
    
psql.execute(sql, conn)
conn.commit()

In [17]:
%%showsql
SELECT max(cluster_number) from nate.hclust_components75_HOU
limit 5;

Unnamed: 0,max
0,586


In [18]:
%%showsql
SELECT max(cluster_number) from nate.hclust_components85_HOU
limit 5;

Unnamed: 0,max
0,111


In [19]:
%%showsql
SELECT max(cluster_number) from nate.hclust_materials75_HOU
limit 5;

Unnamed: 0,max
0,250


In [20]:
%%showsql
SELECT max(cluster_number) from nate.hclust_materials85_HOU
limit 5;

Unnamed: 0,max
0,48


## 3. Create TARGET: component pctReplaced feature (Qty replaced / Qty serviced)

In [21]:
%%execsql

--3.1 Get Qty of each top level material serviced (with something replaced)
DROP TABLE IF EXISTS nate.serviced_materials_HOU;
CREATE TABLE nate.serviced_materials_HOU AS 
(
    SELECT a.material, plant, count(distinct order_txt) as order_qty
    FROM sap.service_order_int a
    inner join data_science.ordercomponents_int b
    on a.order_txt = b.ordr
    where plant in ('EWHG','HCS0', 'HCS1')
    group by 1, 2
) DISTRIBUTED BY (material);

In [22]:
%%execsql
--3.2 Get Qty of each component exposed to service
DROP TABLE IF EXISTS nate.serviced_components_HOU;
CREATE TABLE nate.serviced_components_HOU AS 
(
    select c.material, c.plant, d.component, d.unit as uom, sum(c.order_qty*d.quantity) as qty_serviced
    from nate.serviced_materials c
    inner join smithd2.material_bom_exploded_enterprise d
    on c.material=d.material
    where plant in ('EWHG','HCS0', 'HCS1')
    group by 1, 2, 3, 4
) DISTRIBUTED BY (component);


In [23]:
%%execsql
--3.3 Get Qty of each component replaced
DROP TABLE IF EXISTS nate.replaced_components_HOU;
CREATE TABLE nate.replaced_components_HOU AS 
( 
    SELECT h.material, plant, g.material as component, g.uom, sum(g.qtywithdrawn) as qty_replaced
    from data_science.ordercomponents_int g
    inner join sap.service_order_int h 
    on g.ordr = h.order_txt
    where plant in ('EWHG','HCS0', 'HCS1')
    group by 1,2,3,4
) DISTRIBUTED BY (component);


In [25]:
%%execsql
--3.4 Generate Metric
DROP TABLE IF EXISTS nate.components_pct_replaced_HOU;
CREATE TABLE nate.components_pct_replaced_HOU AS 
(
    SELECT e.material, e.plant, e.component, e.uom, 
            case when f.qty_replaced is null then 0 else f.qty_replaced end as qty_replaced,
            e.qty_serviced,
            case when f.qty_replaced is null or e.qty_serviced=0 then 0 
                else f.qty_replaced / e.qty_serviced end as pctReplaced 
    from  nate.serviced_components e
    left outer join nate.replaced_components f
    on e.material=f.material and e.plant=f.plant and e.component=f.component and e.uom=f.uom
    where e.plant in ('EWHG','HCS0', 'HCS1')
) DISTRIBUTED BY (component);

In [26]:
%%showsql
select * from  nate.components_pct_replaced_HOU
limit 100;

Unnamed: 0,material,plant,component,uom,qty_replaced,qty_serviced,pctreplaced
0,P1000003515,EWHG,100013686,EA,0.0,16.0,0.0
1,P1000003515,EWHG,100040260,EA,0.0,14.0,0.0
2,P1000003515,EWHG,7104962,EA,0.0,1.0,0.0
3,P1000003515,EWHG,P1000056621,EA,0.0,1.0,0.0
4,P1000003515,EWHG,P180172,EA,0.0,1.0,0.0
5,P1000003515,EWHG,P180712,EA,0.0,2.0,0.0
6,P1000008020,HCS0,100013686,EA,0.0,22.0,0.0
7,P1000008020,HCS0,100040260,EA,0.0,14.0,0.0
8,P1000008020,HCS0,P136650-1039,EA,44.0,10.0,4.4
9,P1000012632,EWHG,48-081-998,EA,0.0,1.0,0.0


## 4. Create master table of features and Target

In [29]:
%%execsql
DROP TABLE IF EXISTS nate.component_features_HOU;
CREATE TABLE nate.component_features_HOU
AS 
(
    SELECT a.material, a,plant, a.component, a.uom, a.qty_replaced, a.qty_serviced, a.pctreplaced,   
    
    b.weight as m_weight,
    b.material_type as m_material_type,  
    b.material_group as m_material_group,    
    b.surface_matl as m_surface_matl, 
    b.subsea_matl as m_subsea_matl,     
    (b.material_type is not null) as m_has_materialtype,
    (b.coatingspecs is not null) as m_has_coatings,
    (b.documents is not null) as m_has_documents, 
    (b.matlspecs is not null) as m_has_matlspecs,
    (b.weldspecs is not null) as m_has_weldspecs, 
    (b.qspecs is not null) as m_has_qspecs,
    
    b.weight as c_weight,
    b.material_type as c_material_type,  
    b.material_group as c_material_group,    
    b.surface_matl as c_surface_matl, 
    b.subsea_matl as c_subsea_matl,     
    (b.material_type is not null) as c_has_materialtype,
    (b.coatingspecs is not null) as c_has_coatings,
    (b.documents is not null) as c_has_documents, 
    (b.matlspecs is not null) as c_has_matlspecs,
    (b.weldspecs is not null) as c_has_weldspecs, 
    (b.qspecs is not null) as c_has_qspecs,
    
    d.cluster_number::text as m_clust75,
    e.cluster_number::text as m_clust85,
    f.cluster_number::text as c_clust75,
    g.cluster_number::text as c_clust85
   
    FROM  nate.components_pct_replaced_HOU a
    INNER JOIN smithd2.material_features b  ON a.material = b.material
    INNER JOIN smithd2.material_features c  ON a.component = c.material  
    INNER join nate.hclust_materials75_HOU d on a.component = d.material
    INNER join nate.hclust_materials85_HOU e on a.component = e.material
    INNER join nate.hclust_components75_HOU f on a.component = f.material
    INNER join nate.hclust_components85_HOU g on a.component = g.material
) DISTRIBUTED BY (component);


In [31]:
%%showsql 
SELECT * FROM  nate.component_features_HOU
limit 10;

Unnamed: 0,material,a,plant,component,uom,qty_replaced,qty_serviced,pctreplaced,m_weight,m_material_type,m_material_group,m_surface_matl,m_subsea_matl,m_has_materialtype,m_has_coatings,m_has_documents,m_has_matlspecs,m_has_weldspecs,m_has_qspecs,c_weight,c_material_type,c_material_group,c_surface_matl,c_subsea_matl,c_has_materialtype,c_has_coatings,c_has_documents,c_has_matlspecs,c_has_weldspecs,c_has_qspecs,m_clust75,m_clust85,c_clust75,c_clust85
0,P1000075683,"(P1000075683,HCS0,P142469,EA,0,1.00000,0)",HCS0,P142469,EA,0.0,1.0,0.000000,4365.000,HALB,A-T03-RUN,False,True,True,False,True,False,False,False,4365.000,HALB,A-T03-RUN,False,True,True,False,True,False,False,False,154,31,389,70
1,P1000053749,"(P1000053749,HCS0,P1000022849,EA,21.0000,60.00000,0.35000000000000000000)",HCS0,P1000022849,EA,21.0,60.0,0.350000,4900.000,HALB,F-S01-MMW,True,True,True,False,True,False,False,False,4900.000,HALB,F-S01-MMW,True,True,True,False,True,False,False,False,219,41,324,59
2,P170691,"(P170691,HCS1,P1000038613,EA,0,3.000000000000000,0)",HCS1,P1000038613,EA,0.0,3.0,0.000000,102000.000,HALB,9999,True,True,True,False,False,False,False,False,102000.000,HALB,9999,True,True,True,False,False,False,False,False,152,31,42,15
3,P170691,"(P170691,HCS0,P1000038613,EA,3.0000,15.000000000000000,0.20000000000000000000)",HCS0,P1000038613,EA,3.0,15.0,0.200000,102000.000,HALB,9999,True,True,True,False,False,False,False,False,102000.000,HALB,9999,True,True,True,False,False,False,False,False,152,31,42,15
4,P170691,"(P170691,EWHG,P1000038613,EA,3.0000,12.000000000000000,0.25000000000000000000)",EWHG,P1000038613,EA,3.0,12.0,0.250000,102000.000,HALB,9999,True,True,True,False,False,False,False,False,102000.000,HALB,9999,True,True,True,False,False,False,False,False,152,31,42,15
5,P150323,"(P150323,HCS1,P1000038613,EA,0,9.000000000000000,0)",HCS1,P1000038613,EA,0.0,9.0,0.000000,101000.000,HALB,9999,True,True,True,False,True,False,False,False,101000.000,HALB,9999,True,True,True,False,True,False,False,False,152,31,42,15
6,P150323,"(P150323,HCS0,P1000038613,EA,0,21.000000000000000,0)",HCS0,P1000038613,EA,0.0,21.0,0.000000,101000.000,HALB,9999,True,True,True,False,True,False,False,False,101000.000,HALB,9999,True,True,True,False,True,False,False,False,152,31,42,15
7,P1000007820,"(P1000007820,HCS1,P1000038613,EA,0,30.000000000000000,0)",HCS1,P1000038613,EA,0.0,30.0,0.000000,102000.000,HALB,99,True,True,True,False,False,False,False,False,102000.000,HALB,99,True,True,True,False,False,False,False,False,152,31,42,15
8,P1000007820,"(P1000007820,HCS0,P1000038613,EA,3.0000,24.000000000000000,0.12500000000000000000)",HCS0,P1000038613,EA,3.0,24.0,0.125000,102000.000,HALB,99,True,True,True,False,False,False,False,False,102000.000,HALB,99,True,True,True,False,False,False,False,False,152,31,42,15
9,P157045,"(P157045,HCS1,P154440,EA,0,3.000000000000000,0)",HCS1,P154440,EA,0.0,3.0,0.000000,80900.000,HALB,A-S15-TRS,True,True,True,False,True,False,False,False,80900.000,HALB,A-S15-TRS,True,True,True,False,True,False,False,False,90,21,422,76


# SCRATCH

In [35]:
%%showsql
select * 
from sap_bi.bic_azcs_o0100_int
where trim(leading '0' from cs_order)='1125260'
limit 10

Unnamed: 0,cs_order,recordmode,coord_type,assembly,ord_proccd,ord_procwd,unit_day,equipment,calday,pmacttype,pmplangrp,planplant,customer,storno,division,funct_loc,salesorg,distr_chan,order_tot,ord_open,ord_immed,ord_intime,ord_unplnd,bic_zabckz,spl_refdt,bic_zreftime,bic_zaenam,bic_zaezeit,bic_zakknz,bic_zsystatus,aedat,bic_zarmsetup,bic_zuserstat,bic_zaufpl,bic_zawerk,plantsectn,comp_code,bic_zcallsign,bic_zchg_by,bic_zcrt_by,cpr_create,bic_zcrtdat,gm_ussta,gm_grntp,bic_zextid,bic_zfligtim,bic_zflgdurut,bic_zforcecnt,bic_zactrel,bic_zreledt,...,bic_zphase,bic_zisflight,me_order,bic_zord_itms,co_area,costcenter,bic_zcryptost,bic_zlanddat,bic_zlandloc,bic_zlandtmzn,bic_zlattkoff,bic_zlanchpad,bic_zlogsys,bic_zmission,bic_zwsid,bic_zmsgrp,bic_zperson,bic_zopercnt,cpr_zuguid,cpr_cobjec,bic_zotype,bic_zplvar,bic_zorplind,priority,bic_zqmnum,revision,bic_zstartdt,bic_zstime,pp_locat,maintplant,cpr_cet,bic_ztgloc,bic_ztmovertg,bic_zsaleoff,bic_zsalesgrp,bic_zmainitm,bic_zmainplan,ord_closed,ord_plnd,bic_zernam,bic_zzstatus,bic_ztimzone,bic_zworkcntr,bic_zoperid,bic_zdesc,bic_zopobjtyp,bic_zordrnrun,bic_zchgdate,bic_zcredate,bic_zobjnr
0,1125260,,ZS03,,5.0,5.0,10,,20110314,6,200,MYLB,47533,,1,,MYSS,1,1,0,0,0,1,,20110314,0,THONGYQ,21416,,CLSD CNF CCOK MANC NMAT PRC,20110314,,,1001043044,MYLB,,EWMC,,,,0,33512,,,,0,0,0,20110314,20110318,...,6,,40022204,10,KOS1,,,0,,0,0,,,,,,0,0,,0,A,,,3,,,0,0,,,CST,,0,,,,,1,0,OTHMANSS,,,9929010,,,,0,20120215,0,OR000001125260


In [None]:
%%execsql
-- this table has 1 row for each component replaced on a service order, plus other features

DROP TABLE IF EXISTS nate.components_pct_replaced_features;
CREATE TABLE nate.components_pct_replaced_features AS 
    (
    SELECT a.ordr, 
        case when asong.is_numeric(c.material) then trim(leading '0' from c.material) else c.material end as material
        , e.txtmd as material_desc, a.material AS component, d.txtmd as component_desc, a.qtywithdrawn, a.uom, a.plant
    FROM data_science.ordercomponents_int a 
    JOIN sap_bi.bic_azcs_o0100_int b
    ON a.ordr = trim(leading '0' from b.cs_order)
    JOIN sap_bi.bic_azequip00_int c
    ON c.equipment = b.equipment
    JOIN sap_bi.bi0_tmaterial_int d
    ON d.material = a.material
    JOIN sap_bi.bi0_tmaterial_int e
    ON e.material = c.material
    WHERE a.qtywithdrawn > 0 and a.material is not null and char_length(a.material) > 0 and d.langu = 'E' and e.langu = 'E'
    )
DISTRIBUTED BY (ordr);

In [60]:
%%execsql
DROP TABLE IF EXISTS nate.component_features;
CREATE TABLE nate.component_features 
AS (SELECT a.component, a.orders_replaced, a.orders_serviced, a.pctreplaced,    
    
    b.weight as material_weight,
    b.material_type as material_material_type,  
    b.material_group,
    b.surface_matl as material_surface_matl, 
    b.subsea_matl as material_subsea_matl, 
    
    (b.material_type is not null) as has_materialtype,
    (b.coatingspecs is not null) as has_coatings,
    (b.documents is not null) as has_documents, 
    (b.matlspecs is not null) as has_matlspecs,
    (b.weldspecs is not null) as has_weldspecs, 
    (b.qspecs is not null) as has_qspecs,    
    c.*
    FROM  nate.replacedvsserviced a
    INNER JOIN smithd2.material_features b  ON a.component = b.material 
    INNER join nate.cluster_master c on a.component=c.material
    )

In [None]:
%%execsql
DROP TABLE IF EXISTS nate.full_features;
CREATE TABLE nate.full_features 
AS (SELECT a.ordr, a.material, a.material_desc, a.component, 
    a.component_desc, a.uom, a.plant, b.orders_replaced, b.orders_serviced, b.pctreplaced,
    c.ord_proccd as leadtimecalday, c.ord_procwd as leadtimeworkday, c.calday, 
    c.pmplangrp, c.pmacttype, c.division, c.customer, c.funct_loc, c.salesorg, 
    c.spl_refdt as refdat, c.bic_zabckz as ABCinc, c.ord_intime, c.aedat as changedon, 
    c.bic_zaenam as changedby, c.bic_zactfinsh as actfinish, c.actstartdt as actstart,
    c.bic_zactfinsh, c.bic_zreledt, c.bic_zakknz as category, c.bic_zawerk, 
    c.bic_zcrtdat, c.bic_zsystatus, c.workcenter, c.bus_area, c.bic_zphase, c.costcenter, c.priority, 
    c.bic_zernam as createdby, d.coatingspecs as material_coatingspecs, d.documents as material_documents, 
    d.matlspecs as material_matlspecs, d.weldspecs as material_weldspecs, 
    d.qspecs as material_qspecs, d.weight as material_weight, d.model as material_model, 
    d.material_type as material_material_type, 
    d.material_group as material_material_group, d.surface_matl as material_surface_matl, 
    d.subsea_matl as material_subsea_matl, e.coatingspecs as component_coatingspecs, 
    e.documents as component_documents, e.matlspecs as component_matlspecs, e.weldspecs 
    as component_weldspecs, e.qspecs as component_qspecs, e.weight as component_weight, 
    e.model as component_model, e.material_type as component_material_type, e.material_group as component_material_group, 
    e.surface_matl as component_surface_matl,e.subsea_matl as component_subsea_matl
FROM nate.simple_features a
JOIN nate.replacedvsserviced b
ON a.material = b.component
JOIN sap_bi.bic_azcs_o0100_int c
ON a.ordr = trim(leading '0' from c.cs_order)
JOIN smithd2.material_features d
ON a.material = d.material 
JOIN smithd2.material_features e
ON a.component = e.material)

In [None]:
%%execsql
DROP TABLE IF EXISTS nate.prediction_input;
CREATE TABLE nate.prediction_input AS
SELECT component, material, orders_replaced, orders_serviced, pctreplaced, leadtimecalday, 
my_to_date(actstart, 'YYYYMMDD') as actstart, ABCinc, bic_zphase::int, (material_coatingspecs is not null) as matl_has_coatings,
(material_documents is not null) as matl_has_documents, (material_matlspecs is not null) 
as matl_has_matlspecs,
(material_weldspecs is not null) as matl_has_weldspecs, (material_qspecs is not null) 
as matl_has_qspecs, (component_coatingspecs is not null) as has_coatings,
(component_documents is not null) as has_documents, (component_matlspecs is not null) 
as has_matlspecs,
(component_weldspecs is not null) as has_weldspecs, (component_qspecs is not null) 
as has_qspecs, 
material_weight, material_surface_matl, material_subsea_matl, component_weight, component_surface_matl, 
component_subsea_matl,
b.*
FROM nate.full_features a
inner join nate.cluster_master b on a.component=b.material