In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from scripts.sam_value_counts import sam_dataframe_cols_value_count_analysis, sam_dataframe_markup_value_counts
from scripts.sam_confusion_matrix import sam_plot_confusion_matrix, sam_confusion_maxtrix
from scripts.sam_variance_check import get_low_variance_columns
from scripts.tools import check_metric, data_transformations, df_check_stats, game

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.decomposition import PCA

np.set_printoptions(precision=5)
np.random.seed(69572)
plt.style.use('ggplot')
sns.set(color_codes=True)

%matplotlib inline

In [2]:
import datetime

from scripts import sam_custom_labeler
from collections import defaultdict

CUST_CATEGORY_LABELER = sam_custom_labeler.CUST_CATEGORY_LABELER

def sam_pickle_save(df_x, df_y, df_test_x, prefix="tmp/Iteration1_"):
    print('SAVE PREFIX USED: ', prefix)
    pickle.dump(df_x, open(prefix + 'df_x.pkl', 'wb'))
    pickle.dump(df_y, open(prefix + 'df_y.pkl', 'wb'))
    pickle.dump(df_test_x, open(prefix + 'df_test_x.pkl', 'wb'))
    return

def sam_pickle_load(prefix='tmp/Iteration1_'):
    print('LOAD PREFIX USED: ', prefix)
    df_x = pickle.load(open(prefix + 'df_x.pkl', 'rb'))
    df_y = pickle.load(open(prefix + 'df_y.pkl', 'rb'))
    df_test_x = pickle.load(open(prefix + 'df_test_x.pkl', 'rb'))
    return df_x, df_y, df_test_x

In [4]:
crazy_list = dir()

In [20]:
for each in dir():
    if each not in crazy_list:
        del each

# Data Transformation Iteration1

Normal Transformations
* Date
* Bool
* Longi, Lati missing values
* Longi, Lati precision check

In [21]:
# data collection
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')

# df_check_stats(RAW_X, RAW_y, RAW_TEST_X)

strptime = datetime.datetime.strptime

DATE_FORMAT = "%Y-%m-%d"
REFERENCE_DATE_POINT = strptime('2000-01-01', DATE_FORMAT)

if RAW_X.date_recorded.dtype == 'O':

    # convert it to datetime format
    f = lambda x: strptime(str(x), DATE_FORMAT)
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)

    # week day
    f = lambda x: x.weekday()
    RAW_X['date_recorded_weekday'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_weekday'] = RAW_TEST_X.date_recorded.apply(f)

    # date
    f = lambda x: x.day
    RAW_X['date_recorded_date'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_date'] = RAW_TEST_X.date_recorded.apply(f)

    # month
    f = lambda x: x.month
    RAW_X['date_recorded_month'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_month'] = RAW_TEST_X.date_recorded.apply(f)

    # year
    f = lambda x: x.year
    RAW_X['date_recorded_year'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_year'] = RAW_TEST_X.date_recorded.apply(f)

    # total days
    f = lambda x: (x - REFERENCE_DATE_POINT).days
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)
    

# Longitude & Latitude -- zero values fix

# Filling Missing
if not RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'].empty:
    tmp = np.mean(RAW_X[u'latitude'][RAW_X.latitude < -1.0].values)
    RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'] = tmp
    RAW_TEST_X.loc[RAW_TEST_X.latitude >= -1.0, u'latitude'] = tmp


# Filling Missing
if not RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'].empty:
    tmp = np.mean(RAW_X[u'longitude'][RAW_X[u'longitude'] > 1.0].values)
    RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'] = tmp
    RAW_TEST_X.loc[RAW_TEST_X[u'longitude'] <= 1.0, u'longitude'] = tmp

    
# Reducing geo location precision to 11 meters
LONG_LAT_PRECISION = 0.00001
fns_lola =lambda x: (x//LONG_LAT_PRECISION) * LONG_LAT_PRECISION

# Reducing Precision of Lat.
RAW_X.longitude = RAW_X.longitude.map(fns_lola)
RAW_X.latitude = RAW_X.latitude.map(fns_lola)
RAW_TEST_X.longitude = RAW_TEST_X.longitude.map(fns_lola)
RAW_TEST_X.latitude = RAW_TEST_X.latitude.map(fns_lola)







#
#   Note Below transformation are to make sure Label Encoding works fine
#
#


# bool columns
tmp = ['public_meeting', 'permit']
RAW_X[tmp] = RAW_X[tmp].fillna(False)
RAW_TEST_X[tmp] = RAW_TEST_X[tmp].fillna(False)

# object columns list
obj_cols = RAW_X.dtypes[RAW_X.dtypes == 'O'].index.tolist()

# object columns
RAW_X[obj_cols] = RAW_X[obj_cols].fillna('Other')
RAW_TEST_X[obj_cols] = RAW_TEST_X[obj_cols].fillna('Other')


sam_pickle_save(RAW_X, RAW_y, RAW_TEST_X, prefix="tmp/Iteration2_dt1_")
df_check_stats(X, y, TEST_X)

In [32]:
RAW_X, RAW_y, RAW_TEST_X = sam_pickle_load(prefix="tmp/Iteration2_dt1_")
df_check_stats(RAW_X, RAW_y, RAW_TEST_X)

LOAD PREFIX USED:  tmp/Iteration2_dt1_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Data Frame Shape: (59400, 1) TotColumns: 1 ObjectCols: 0
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0


In [33]:
# Just assining new names to transformed dataframe pointers
X, y, TEST_X = data_transformations(RAW_X, RAW_y, RAW_TEST_X)

# benchmark
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42, stratify=y)
clf = game(X_train, X_test, y_train, y_test, algo='rf', )

------------------------------------------------
AC Score: 0.984399551066 F1 Score: 0.984469065664
------------------------------------------------
AC Score: 0.800269360269 F1 Score: 0.806787932206


In [34]:
del X, y, TEST_X

# Data Transformation Iteration1

Normal Transformations
* Date
* Bool
* Longi, Lati missing values
* Longi, Lati precision check


# Data Transformation Iteration2

* Custom Labler


In [73]:
for each in dir():
    if each not in crazy_list:
        del each

In [74]:
RAW_X, RAW_y, RAW_TEST_X = sam_pickle_load(prefix="tmp/Iteration2_dt1_")
df_check_stats(RAW_X, RAW_y, RAW_TEST_X)

LOAD PREFIX USED:  tmp/Iteration2_dt1_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Data Frame Shape: (59400, 1) TotColumns: 1 ObjectCols: 0
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0


In [75]:
def text_transformation(name):
    """Cleanup basic text issue in name(input).
    
    Removes text capitalisation, case, space and other non text ascii charecters
        except space.
    """
    if name:
        name = str(name).lower().strip()
        name = ''.join([i if 96 < ord(i) < 128 else ' ' for i in name])
        if 'and' in name:
            name = name.replace('and', ' ')

        # clear double space
        while '  ' in name:
            name = name.replace('  ', ' ')
        return name.strip()
    return ''

for col in obj_cols:
    a, b, c = (col, len(RAW_X[col].value_counts()), len(RAW_TEST_X[col].value_counts()))
    RAW_X[col] = RAW_X[col].apply(text_transformation)
    RAW_TEST_X[col] = RAW_TEST_X[col].apply(text_transformation)
    d, e = len(RAW_X[col].value_counts()), len(RAW_TEST_X[col].value_counts())
    print(a, b - d, c - e)

funder 17 8
installer 279 145
wpt_name 683 144
basin 0 0
subvillage 112 37
region 0 0
lga 0 0
ward 0 0
recorded_by 0 0
scheme_management 0 0
scheme_name 212 97
extraction_type 0 0
extraction_type_group 0 0
extraction_type_class 0 0
management 0 0
management_group 0 0
payment 0 0
payment_type 0 0
water_quality 0 0
quality_group 0 0
quantity 0 0
quantity_group 0 0
source 0 0
source_type 0 0
source_class 0 0
waterpoint_type 0 0
waterpoint_type_group 0 0


In [77]:
# Just assining new names to transformed dataframe pointers
X, y, TEST_X = data_transformations(RAW_X, RAW_y, RAW_TEST_X)

# benchmark
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42, stratify=y)
clf = game(X_train, X_test, y_train, y_test, algo='rf', )

------------------------------------------------
AC Score: 0.984264870932 F1 Score: 0.984335586239
------------------------------------------------
AC Score: 0.802087542088 F1 Score: 0.808550466484


### Custom Labler

In [76]:
##################################
######### IMPLEMENT ##############
#################################

custom_labler = defaultdict(CUST_CATEGORY_LABELER)
tmp = { 'funder': 97,
  'installer': 97,
  'wpt_name': 80,
  'subvillage': 80,
  'ward': 80,
  'scheme_name': 85
  }

for col, limit  in tmp.items():
    print('--------------------------------------------------------', col)
    labler = custom_labler[col]
    labler.DATA_COVERAGE_LIMIT = limit
    labler.fit(RAW_X[col])
    tmp = labler.fit_transform(RAW_X[col])
    print(len(RAW_X[col].value_counts()), len(tmp.value_counts()))
    RAW_X[col] = tmp
    tmp = labler.etransform(RAW_TEST_X[col])
    print(len(RAW_TEST_X[col].value_counts()), len(tmp.value_counts()))

print(RAW_X.shape, RAW_TEST_X.shape, all(RAW_X.columns == RAW_TEST_X.columns))

-------------------------------------------------------- funder
97 percentage of DATA coverage mean, 593 (in number) groups
1881 594
97 percentage of DATA coverage mean, 593 (in number) groups
973 535
-------------------------------------------------------- ward
80 percentage of DATA coverage mean, 998 (in number) groups
2092 999
80 percentage of DATA coverage mean, 998 (in number) groups
1959 993
-------------------------------------------------------- scheme_name
85 percentage of DATA coverage mean, 524 (in number) groups
2485 525
85 percentage of DATA coverage mean, 524 (in number) groups
1693 518
-------------------------------------------------------- installer
97 percentage of DATA coverage mean, 600 (in number) groups
1867 601
97 percentage of DATA coverage mean, 600 (in number) groups
947 521
-------------------------------------------------------- subvillage
80 percentage of DATA coverage mean, 8568 (in number) groups
19176 8569
80 percentage of DATA coverage mean, 8568 (in nu

In [78]:
sam_pickle_save(RAW_X, RAW_y, RAW_TEST_X, prefix="tmp/Iteration2_dt2_")
df_check_stats(X, y, TEST_X)

SAVE PREFIX USED:  tmp/Iteration2_dt2_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Numpy Array Size: 59400
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0


In [58]:
# Just assining new names to transformed dataframe pointers
X, y, TEST_X = data_transformations(RAW_X, RAW_y, RAW_TEST_X)

# benchmark
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42, stratify=y)
clf = game(X_train, X_test, y_train, y_test, algo='rf', )

------------------------------------------------
AC Score: 0.985274971942 F1 Score: 0.985335201609
------------------------------------------------
AC Score: 0.797104377104 F1 Score: 0.803445444177


In [79]:
sam_pickle_save(X, y, TEST_X, prefix="tmp/Iteration2_final_")
df_check_stats(X, y, TEST_X)

SAVE PREFIX USED:  tmp/Iteration2_final_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Numpy Array Size: 59400
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0


In [81]:
X, y, TEST_X = sam_pickle_load(prefix="tmp/Iteration2_final_")
df_check_stats(X, y, TEST_X)

LOAD PREFIX USED:  tmp/Iteration2_final_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Numpy Array Size: 59400
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0
