# Purpose


This notebook demonstrates the data pipeline from raw tables to analytical datasets. At the end of this activity, train & test data sets are created from raw data.



## Imports

In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
pd.options.mode.use_inf_as_na = True

from datetime import datetime
from dateutil.relativedelta import relativedelta

In [43]:
import warnings

warnings.filterwarnings('ignore', message="The sklearn.metrics.classification module", category=FutureWarning)
warnings.filterwarnings('ignore', message=".*title_format is deprecated. Please use title instead.*")
warnings.filterwarnings('ignore', message="optional dependency `torch` is not available. - skipping import of NN models.")

In [44]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment
)
import ta_lib.core.api as dataset
import ta_lib.eda.api as ta_analysis
import ta_lib.reports.api as reports

os.environ['TA_DEBUG'] = "False"
os.environ['TA_ALLOW_EXCEPTIONS'] = "True"

# Initialization
initialize_environment(debug=False, hide_warnings=True)

## Utility functions

In [45]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
print(dataset.list_datasets(context))

['/raw/product', '/raw/opportunity', '/cleaned/target', '/cleaned/product', '/cleaned/opportunity', '/processed/merged', '/train/merged/features', '/train/merged/target', '/test/merged/features', '/test/merged/target', '/score/merged/output']


## Load, Clean, and Engineer data

In [46]:
from statistics import mode
# Loading raw datasets in a loop
data = dict()

for i in dataset.list_datasets(context):
    if '/raw/' in i:
        dataset_name = i.replace('/raw/','')
        key_ = dataset_name+'_df'
        data[key_] = dataset.load_dataset(context,i)
        
        # Standardize column names
        data[key_].columns = string_cleaning(data[key_].columns,lower=True).str.replace(' ','_')
        
        # Changes
        if key_ == 'opportunity_df':
            # Fill na values with default values
            data[key_]['opportunity_id'] = data[key_]['opportunity_id'].apply(lambda x: 'opp_' + str(x))
            data[key_]['customer_name'] = data[key_]['customer_name'].apply(lambda x: 'cus_' + x.split(' ')[1])
            data[key_]['transition_from_timestamp'] = data[key_]['transition_from_timestamp'].fillna(data[key_]['transition_to_timestamp']).astype('int64').copy()
            data[key_]['transition_from_stage_name'] = data[key_]['transition_from_stage_name'].fillna('Stage 0').astype('object').copy()
            data[key_]['opportunity_type'] = data[key_]['opportunity_type'].fillna(data[key_]['opportunity_type'].mode()[0]).copy()
            
            # Fill na values in opportunity_status
            temp_df = data[key_].groupby(['transition_to_stage'])['opportunity_status'].apply(lambda x: mode(list(x))).to_dict()
            data[key_]['opportunity_status'] = data[key_]['transition_to_stage'].map(temp_df)
            
            # Data type change for dates
            data[key_]['decision_date'] = pd.to_datetime(data[key_]['decision_date'], unit='D', origin='1899-12-30')
            data[key_]['creation_date'] = pd.to_datetime(data[key_]['creation_date'], unit='D', origin='1899-12-30')

            data[key_]['transition_from_datetime'] = pd.to_datetime(data[key_]['transition_from_timestamp'], format='%Y%m%d%H%M%S')
            data[key_]['transition_to_datetime'] = pd.to_datetime(data[key_]['transition_to_timestamp'], format='%Y%m%d%H%M%S')
            
            # Feature Addition
            stage_level = {'Stage 6': 1, 'Stage 5': 1, 'Stage 3': 2, 'Stage 2': 3, 'Stage 4': 4, 'Stage 9': 4, 'Stage 1': -4, 'Stage 8': -4, 'Stage 7': 0}
            data[key_]['stage_level'] = data[key_]['transition_to_stage'].map(stage_level)
            data[key_]['stage_level'] = data[key_]['stage_level']/data[key_]['stage_level'].max()
            
            data[key_]['transition_time_days'] = ((data[key_]['transition_to_datetime'] - data[key_]['transition_from_datetime']).dt.total_seconds()/(3600*24)).round(2)
            data[key_]['decision_time_days'] = ((data[key_]['decision_date'] - data[key_]['creation_date']).dt.total_seconds()/(3600*24)).round(2)
            data[key_]['early_stages'] = data[key_]['transition_from_stage_name'].apply(lambda x: x in ['Stage 0', 'Stage 5', 'Stage 6'])
            
            # Replacing Geography NA values to 1.
            data[key_]['geography'].replace('Geo NA','Geo 1', inplace=True)
                        
            # Drop columns with missing values more than 50%
            data[key_].drop(['risk_status'], axis=1, inplace=True)
            
            # Convertion Label
            data[key_]['did_convert'] = data[key_]['opportunity_status'] == 'Closed/Won'

        elif key_ == 'product_df':
            # Dropping NA in Product Status Column
            data[key_].dropna(inplace=True)
            
            data[key_]['opportunity_id'] = data[key_]['opportunity_id'].apply(lambda x: 'opp_' + str(x))
            data[key_]['product_id'] = data[key_]['product_id'].apply(lambda x: 'pro_' + str(x))
            data[key_]['decision_date'] = pd.to_datetime(data[key_]['decision_date'], unit='D', origin='1899-12-30')
            data[key_]['snapshot_datetime'] = pd.to_datetime(data[key_]['snapshot_time'], format='%Y%m%d%H%M%S')
            
            # Drop columns with no value
            data[key_].drop(['product_segment_name'], axis=1, inplace=True)
                        
            # Convertion Label
            data[key_]['target'] = data[key_]['product_status'].apply(lambda x: x in ['Win Approved', 'Win Submitted'])*1         
        
        # Drop duplicates
        data[key_].drop_duplicates(inplace=True)
        
        # Save processed data
        dataset.save_dataset(context, data[key_], 'cleaned/'+ dataset_name)

### Merge Opportunity with Product

In [47]:
merge = data['opportunity_df'].merge(data['product_df'], how='inner', 
                                     left_on=['opportunity_id','transition_to_timestamp'],
                                     right_on=['opportunity_id','snapshot_time'])

In [48]:
merge.shape

(97901, 30)

In [49]:
# Remove Duplicate Values from specified features
duplicated_features = list(set(merge.columns) - {'transition_to_timestamp', 'transition_from_timestamp', 'transition_to_datetime','transition_time_days','transition_from_datetime','snapshot_datetime','snapshot_time'})
merge_trimmed = merge.drop_duplicates(duplicated_features).copy()

### Encoding Features with relavant Target Value.

In [50]:
# Target Encoding the Categorical Features
cat_columns = list(set(merge_trimmed.select_dtypes(include=['object','bool']).columns) - set(['customer_name','opportunity_id','oppo_pro_tr_time_geo_id']))

merge_target_encoded = merge_trimmed.copy()

for col in cat_columns:
    temp_map = merge_target_encoded.groupby([col])['target'].mean().to_dict()
    merge_target_encoded[col] = merge_target_encoded[col].map(temp_map)

In [51]:
summaries = [ta_analysis.get_variable_summary(merge_target_encoded).assign(Null_Values=pd.Series(merge_target_encoded.isna().sum()))]
display_as_tabs([(('{0} - {1} rows'.format('merge', str(merge_target_encoded.shape[0]))), summaries[0])])

elementwise comparison failed; this will raise an error in the future.


## Assignment Questions

In [52]:
# Statistical Significance
alpha = 0.05

#### 1. Is time taken between stage transitions significantly different across geographies?
* `h0` : Geographies have __same__ population mean for stage transition time.
* `h1` : Geographies have __different__ population mean for stage transition time.

In [53]:
print(merge.groupby(['geography'])['transition_time_days'].mean().round(2))
ta_analysis.get_bivariate_plots(df=merge, x_cols='geography', y_cols='transition_time_days')

geography
Geo 1    4.53
Geo 2    5.12
Geo 3    5.13
Geo 4    5.18
Geo 5    5.18
Name: transition_time_days, dtype: float64


In [54]:
cont_tab = pd.crosstab(merge['geography'], merge['transition_time_days'])
stat, p, dof, expected = stats.chi2_contingency(cont_tab)

if p < alpha:
    print('Reject Null Hypothesis')
else:
    print('Failed to Reject Null Hypothesis')

print('p-value:', round(p, 3))

Reject Null Hypothesis
p-value: 0.0


* At __5% significance__, We can __reject the null hypothesis__, that means, there is a significant difference in transition time for different geographies.

#### 2. Are Losses more likely to happen in early stages i.e., Stage 5, 6
* `h0`: Mean of Conversion to Win, is __more__ in Early Stages than in Later Stages
* `h1`: Mean of Conversion to Win, is __less__ in Early Stages than in Later Stages

In [55]:
print(merge.groupby(['early_stages'])['did_convert'].mean())
ta_analysis.get_bivariate_plots(df=merge, x_cols='did_convert', y_cols='early_stages')

early_stages
False    0.753795
True     0.043778
Name: did_convert, dtype: float64


In [56]:
# from scipy.stats.contingency import crosstab
cont_tab = pd.crosstab(merge['did_convert'], merge['early_stages'])
stat, p, dof, expected = stats.chi2_contingency(cont_tab)

if p < alpha:
    print('Reject Null Hypothesis')
else:
    print('Failed to Reject Null Hypothesis')

print('p-value:', round(p, 3))

Reject Null Hypothesis
p-value: 0.0


* At __5% significance__, We can __reject the null hypothesis__, that means, there are more losses in early stages than in later stages.

#### 3. Does the proportion of Won lines in an Opportunity significantly different across Product Categories?
* `h0`: Mean of Conversion to Win, is __same__ across all product categories (Unrelated)
* `h1`: Mean of Conversion to Win, is __different__ across all product categories (Related)

In [57]:
print(merge.groupby(['core_product_segment'])['did_convert'].mean())
ta_analysis.get_bivariate_plots(df=merge, x_cols='core_product_segment', y_cols='did_convert')

core_product_segment
Core Prd Seg 2    0.000000
Core Prd Seg 3    0.638216
Core Prd Seg 4    0.485122
Core Prd Seg 5    0.170297
Core Prd Seg 6    0.338983
Core Prd Seg 7    0.126761
Core Prd Seg 8    0.244330
Core Prd Seg 9    0.000000
Name: did_convert, dtype: float64


In [58]:
cont_tab = pd.crosstab(merge['core_product_segment'], merge['did_convert'])
stat, p, dof, expected = stats.chi2_contingency(cont_tab)

if p < alpha:
    print('Reject Null Hypothesis')
else:
    print('Failed to Reject Null Hypothesis')

print('p-value:', round(p, 3))

Reject Null Hypothesis
p-value: 0.0


* At __5% significance__, We can __reject the null hypothesis__, that means, proportion of won is different for different product categories.

## Split Data into Training and Testing Datasets

In [59]:
drop_cols = list(merge_trimmed.select_dtypes(include=['datetime64']).columns) + list(['product_status','opportunity_id','customer_name','transition_to_timestamp','transition_from_timestamp','snapshot_time', 'target'])

train_X, test_X, train_y, test_y = train_test_split(
    merge_target_encoded.drop(drop_cols, axis=1), 
    merge_target_encoded['target'], 
    test_size=0.2, 
    random_state=0
)

train_X = train_X.reset_index()
train_y = train_y.reset_index()
test_X = test_X.reset_index()
test_y = test_y.reset_index()

print(f"Saving training datasets")
dataset.save_dataset(context, train_X, 'train/merged/features')
dataset.save_dataset(context, train_y, 'train/merged/target')

print(f"Saving test datasets")
dataset.save_dataset(context, test_X, 'test/merged/features')
dataset.save_dataset(context, test_y, 'test/merged/target')

Saving training datasets
Saving test datasets
