# Pre-Process TRAIN/TEST Datasets

## 1. TRAIN/TEST Datasets

In [1]:
%%time

### 0. Import Required Packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

### Import Custom Transformers
import PreProcessing_Custom_Transformers_v2 as PP
import FeatureEngineering_Custom_Transformers as FE
import FeatureCreation_Custom_Transformers as FC

# Use the Updated Attribute/Imputation Dictionaries!!!
%run 'data_new/attribute_dictionary.py'
%run 'data_new/imputation_dictionary.py'

# Silence all warnings (e.g. DataConversionWarning); note this hides every warning category
import warnings
warnings.simplefilter('ignore')

CPU times: user 1.25 s, sys: 1.21 s, total: 2.46 s
Wall time: 22.8 s


### Fios

In [2]:
%%time

# ---------------------------------------------------------------------------
# Fios: load -> split X/y -> default imputation -> save for feature engineering
# ---------------------------------------------------------------------------

# Load each split, key it by 'chc_id', and order the rows by that key.
df_train = (
    pd.read_pickle('data_new/CLUSTER_df_train_Fios Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)
df_test = (
    pd.read_pickle('data_new/CLUSTER_df_test_Fios Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)

# 'status' is set aside as y (it is what fit() receives below);
# every remaining column becomes part of X.
train_y = df_train['status']
train_X = df_train.drop(columns='status').copy()

test_y = df_test['status']
test_X = df_test.drop(columns='status').copy()

# The raw frames are no longer needed once X and y exist.
del df_train, df_test


### Pre-Processing
# Build the default imputer from the attribute / imputation dictionaries
# (presumably defined by the %run'd scripts in the first cell).
Pipe_PP = PP.Use_DefaultImputer(
    default_imputers=attribute_imputer_dict,
    default_dtypes=attribute_dict,
)

# Fit on TRAIN only; TEST is transformed with the fitted object.
Pipe_PP.fit(train_X, train_y)

train_X_PP = Pipe_PP.transform(train_X)
test_X_PP = Pipe_PP.transform(test_X)

# Report feature dimensions before and after the transformation.
print('\n' + '*' * 50
      + '\nBefore vs After Transformation\n'
      + '*' * 50)
print('TRAIN: Before Transformation:', train_X.shape, sep='')
print('TRAIN: After Transformation: ', train_X_PP.shape, sep='')
print('TEST:  After Transformation: ', test_X_PP.shape, sep='')

# Re-attach the target to the pre-processed features, aligning on the index.
df_train_PP = pd.merge(
    train_y.to_frame(), train_X_PP,
    how='inner', left_index=True, right_index=True,
)
df_test_PP = pd.merge(
    test_y.to_frame(), test_X_PP,
    how='inner', left_index=True, right_index=True,
)

# Persist the pre-processed datasets for the feature-engineering step.
df_train_PP.to_pickle('data_new/CLUSTER_df_train_Fios_PP.pkl')
df_test_PP.to_pickle('data_new/CLUSTER_df_test_Fios_PP.pkl')

print(df_train_PP.shape)
print(df_test_PP.shape)

# Drop the merged frames now that they are on disk.
del df_train_PP, df_test_PP

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(687967, 1022)
TRAIN: After Transformation: (687967, 1022)
TEST:  After Transformation: (678079, 1022)
(687967, 1023)
(678079, 1023)


### Fios ONT

In [3]:
%%time

# ---------------------------------------------------------------------------
# Fios ONT: load -> split X/y -> default imputation -> save for feature engineering
# ---------------------------------------------------------------------------

# Load each split, key it by 'chc_id', and order the rows by that key.
df_train = (
    pd.read_pickle('data_new/CLUSTER_df_train_Fios ONT Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)
df_test = (
    pd.read_pickle('data_new/CLUSTER_df_test_Fios ONT Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)

# 'status' is set aside as y (it is what fit() receives below);
# every remaining column becomes part of X.
train_y = df_train['status']
train_X = df_train.drop(columns='status').copy()

test_y = df_test['status']
test_X = df_test.drop(columns='status').copy()

# The raw frames are no longer needed once X and y exist.
del df_train, df_test


### Pre-Processing
# Build the default imputer from the attribute / imputation dictionaries
# (presumably defined by the %run'd scripts in the first cell).
Pipe_PP = PP.Use_DefaultImputer(
    default_imputers=attribute_imputer_dict,
    default_dtypes=attribute_dict,
)

# Fit on TRAIN only; TEST is transformed with the fitted object.
Pipe_PP.fit(train_X, train_y)

train_X_PP = Pipe_PP.transform(train_X)
test_X_PP = Pipe_PP.transform(test_X)

# Report feature dimensions before and after the transformation.
print('\n' + '*' * 50
      + '\nBefore vs After Transformation\n'
      + '*' * 50)
print('TRAIN: Before Transformation:', train_X.shape, sep='')
print('TRAIN: After Transformation: ', train_X_PP.shape, sep='')
print('TEST:  After Transformation: ', test_X_PP.shape, sep='')

# Re-attach the target to the pre-processed features, aligning on the index.
df_train_PP = pd.merge(
    train_y.to_frame(), train_X_PP,
    how='inner', left_index=True, right_index=True,
)
df_test_PP = pd.merge(
    test_y.to_frame(), test_X_PP,
    how='inner', left_index=True, right_index=True,
)

# Persist the pre-processed datasets for the feature-engineering step.
df_train_PP.to_pickle('data_new/CLUSTER_df_train_FiosONT_PP.pkl')
df_test_PP.to_pickle('data_new/CLUSTER_df_test_FiosONT_PP.pkl')

print(df_train_PP.shape)
print(df_test_PP.shape)

# Drop the merged frames now that they are on disk.
del df_train_PP, df_test_PP

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(178486, 1022)
TRAIN: After Transformation: (178486, 1022)
TEST:  After Transformation: (185586, 1022)
(178486, 1023)
(185586, 1023)


### Pre-Fios

In [4]:
%%time

# ---------------------------------------------------------------------------
# Pre-Fios: load -> split X/y -> default imputation -> save for feature engineering
# ---------------------------------------------------------------------------

# Load each split, key it by 'chc_id', and order the rows by that key.
df_train = (
    pd.read_pickle('data_new/CLUSTER_df_train_Pre-Fios Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)
df_test = (
    pd.read_pickle('data_new/CLUSTER_df_test_Pre-Fios Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)

# 'status' is set aside as y (it is what fit() receives below);
# every remaining column becomes part of X.
train_y = df_train['status']
train_X = df_train.drop(columns='status').copy()

test_y = df_test['status']
test_X = df_test.drop(columns='status').copy()

# The raw frames are no longer needed once X and y exist.
del df_train, df_test


### Pre-Processing
# Build the default imputer from the attribute / imputation dictionaries
# (presumably defined by the %run'd scripts in the first cell).
Pipe_PP = PP.Use_DefaultImputer(
    default_imputers=attribute_imputer_dict,
    default_dtypes=attribute_dict,
)

# Fit on TRAIN only; TEST is transformed with the fitted object.
Pipe_PP.fit(train_X, train_y)

train_X_PP = Pipe_PP.transform(train_X)
test_X_PP = Pipe_PP.transform(test_X)

# Report feature dimensions before and after the transformation.
print('\n' + '*' * 50
      + '\nBefore vs After Transformation\n'
      + '*' * 50)
print('TRAIN: Before Transformation:', train_X.shape, sep='')
print('TRAIN: After Transformation: ', train_X_PP.shape, sep='')
print('TEST:  After Transformation: ', test_X_PP.shape, sep='')

# Re-attach the target to the pre-processed features, aligning on the index.
df_train_PP = pd.merge(
    train_y.to_frame(), train_X_PP,
    how='inner', left_index=True, right_index=True,
)
df_test_PP = pd.merge(
    test_y.to_frame(), test_X_PP,
    how='inner', left_index=True, right_index=True,
)

# Persist the pre-processed datasets for the feature-engineering step.
df_train_PP.to_pickle('data_new/CLUSTER_df_train_PreFios_PP.pkl')
df_test_PP.to_pickle('data_new/CLUSTER_df_test_PreFios_PP.pkl')

print(df_train_PP.shape)
print(df_test_PP.shape)

# Drop the merged frames now that they are on disk.
del df_train_PP, df_test_PP

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(182762, 1022)
TRAIN: After Transformation: (182762, 1022)
TEST:  After Transformation: (180628, 1022)
(182762, 1023)
(180628, 1023)


### Non-Competitive Area

In [5]:
%%time

# ---------------------------------------------------------------------------
# Non-Competitive Area: load -> split X/y -> default imputation -> save
# ---------------------------------------------------------------------------

# Load each split, key it by 'chc_id', and order the rows by that key.
df_train = (
    pd.read_pickle('data_new/CLUSTER_df_train_Non-Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)
df_test = (
    pd.read_pickle('data_new/CLUSTER_df_test_Non-Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)

# 'status' is set aside as y (it is what fit() receives below);
# every remaining column becomes part of X.
train_y = df_train['status']
train_X = df_train.drop(columns='status').copy()

test_y = df_test['status']
test_X = df_test.drop(columns='status').copy()

# The raw frames are no longer needed once X and y exist.
del df_train, df_test


### Pre-Processing
# Build the default imputer from the attribute / imputation dictionaries
# (presumably defined by the %run'd scripts in the first cell).
Pipe_PP = PP.Use_DefaultImputer(
    default_imputers=attribute_imputer_dict,
    default_dtypes=attribute_dict,
)

# Fit on TRAIN only; TEST is transformed with the fitted object.
Pipe_PP.fit(train_X, train_y)

train_X_PP = Pipe_PP.transform(train_X)
test_X_PP = Pipe_PP.transform(test_X)

# Report feature dimensions before and after the transformation.
print('\n' + '*' * 50
      + '\nBefore vs After Transformation\n'
      + '*' * 50)
print('TRAIN: Before Transformation:', train_X.shape, sep='')
print('TRAIN: After Transformation: ', train_X_PP.shape, sep='')
print('TEST:  After Transformation: ', test_X_PP.shape, sep='')

# Re-attach the target to the pre-processed features, aligning on the index.
df_train_PP = pd.merge(
    train_y.to_frame(), train_X_PP,
    how='inner', left_index=True, right_index=True,
)
df_test_PP = pd.merge(
    test_y.to_frame(), test_X_PP,
    how='inner', left_index=True, right_index=True,
)

# Persist the pre-processed datasets for the feature-engineering step.
df_train_PP.to_pickle('data_new/CLUSTER_df_train_Non_PP.pkl')
df_test_PP.to_pickle('data_new/CLUSTER_df_test_Non_PP.pkl')

print(df_train_PP.shape)
print(df_test_PP.shape)

# Drop the merged frames now that they are on disk.
del df_train_PP, df_test_PP

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(477817, 1022)
TRAIN: After Transformation: (477817, 1022)
TEST:  After Transformation: (477088, 1022)
(477817, 1023)
(477088, 1023)


### U-verse

In [6]:
%%time

# ---------------------------------------------------------------------------
# U-verse: load -> split X/y -> default imputation -> save for feature engineering
# ---------------------------------------------------------------------------

# Load each split, key it by 'chc_id', and order the rows by that key.
df_train = (
    pd.read_pickle('data_new/CLUSTER_df_train_U-verse Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)
df_test = (
    pd.read_pickle('data_new/CLUSTER_df_test_U-verse Competitive Area.pkl')
    .set_index('chc_id')
    .sort_index()
)

# 'status' is set aside as y (it is what fit() receives below);
# every remaining column becomes part of X.
train_y = df_train['status']
train_X = df_train.drop(columns='status').copy()

test_y = df_test['status']
test_X = df_test.drop(columns='status').copy()

# The raw frames are no longer needed once X and y exist.
del df_train, df_test


### Pre-Processing
# Build the default imputer from the attribute / imputation dictionaries
# (presumably defined by the %run'd scripts in the first cell).
Pipe_PP = PP.Use_DefaultImputer(
    default_imputers=attribute_imputer_dict,
    default_dtypes=attribute_dict,
)

# Fit on TRAIN only; TEST is transformed with the fitted object.
Pipe_PP.fit(train_X, train_y)

train_X_PP = Pipe_PP.transform(train_X)
test_X_PP = Pipe_PP.transform(test_X)

# Report feature dimensions before and after the transformation.
print('\n' + '*' * 50
      + '\nBefore vs After Transformation\n'
      + '*' * 50)
print('TRAIN: Before Transformation:', train_X.shape, sep='')
print('TRAIN: After Transformation: ', train_X_PP.shape, sep='')
print('TEST:  After Transformation: ', test_X_PP.shape, sep='')

# Re-attach the target to the pre-processed features, aligning on the index.
df_train_PP = pd.merge(
    train_y.to_frame(), train_X_PP,
    how='inner', left_index=True, right_index=True,
)
df_test_PP = pd.merge(
    test_y.to_frame(), test_X_PP,
    how='inner', left_index=True, right_index=True,
)

# Persist the pre-processed datasets for the feature-engineering step.
df_train_PP.to_pickle('data_new/CLUSTER_df_train_Uverse_PP.pkl')
df_test_PP.to_pickle('data_new/CLUSTER_df_test_Uverse_PP.pkl')

print(df_train_PP.shape)
print(df_test_PP.shape)

# Drop the merged frames now that they are on disk.
del df_train_PP, df_test_PP

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(135316, 1022)
TRAIN: After Transformation: (135316, 1022)
TEST:  After Transformation: (134823, 1022)
(135316, 1023)
(134823, 1023)
