## Import useful stuff and define ancillary functions

In [38]:
%pylab inline
%load_ext autoreload
%autoreload 2

from __future__ import division

from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from functools import partial
import inspect
import json
import os
import re
import sys
import cPickle as pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def analyze_str_columns(cols, df, only_percent=False):
    """Print how the values of each column split across the two target classes.

    For every column in ``cols`` a table is built with one column per
    distinct value (labelled ``<column>_<value>``) and one row per target
    class (0 and 1). All missing cells of a column are counted together
    under the label of the null value found in it.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns of ``df`` to analyze.
    df : pandas.DataFrame
        Frame holding every column in ``cols`` plus a ``'target'`` column.
        Only rows whose target equals 0 or 1 are counted.
    only_percent : bool, optional
        When True the absolute counts are skipped and only the percentage
        table is printed.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print('Total samples: %s' % len(df))

    # The target masks do not depend on the analyzed column; compute them once.
    is_negative = df['target'] == 0
    is_positive = df['target'] == 1

    for c in cols:
        print('##############################')
        values = df[c]
        str_0 = []
        str_1 = []
        col_names = []
        for u in values.unique():
            # Test for nullness directly. Checking `type(u) == str` treated
            # every non-str value (numbers, Python 2 unicode) as missing and
            # reported the null count under that value's label.
            if pd.isnull(u):
                col_mask = values.isnull()
            else:
                col_mask = values == u
            str_0.append(int((col_mask & is_negative).sum()))
            str_1.append(int((col_mask & is_positive).sum()))

            col_names.append('%s_%s' % (c, u))
        VAR_df_counts = pd.DataFrame([str_0, str_1],
                                     columns=col_names,
                                     index=pd.Index([0, 1], name='target'))
        if not only_percent:
            print("------Counts-------")
            print(VAR_df_counts)
        print("----Percentages----")
        # Share of each target class within every value, in percent.
        print(VAR_df_counts / VAR_df_counts.sum() * 100)



Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


`%matplotlib` prevents importing * from pylab and numpy


# Load train data
Using pandas' `read_csv` with the default options, except that the `ID` column is used as the index.

In [2]:
# The data files have the same names on every machine; only the directory
# that holds them differs between the Windows and the Linux setup.
if os.name == 'nt':
    DATA_DIR = 'D:\\'
else:
    DATA_DIR = '/media/mtambos/speedy'

TRAIN_PATH = os.path.join(DATA_DIR, 'train.csv')
PTRAIN_PATH = os.path.join(DATA_DIR, 'train_preprocessed_float_string.csv')
TEST_PATH = os.path.join(DATA_DIR, 'test.csv')
GOOGNEWS_PATH = os.path.join(DATA_DIR, 'GoogleNews-vectors-negative300.bin.gz')
VOCAB_PATH = os.path.join(DATA_DIR, 'big.txt')

# NOTE: the frame is read from PTRAIN_PATH (the preprocessed file), not from
# TRAIN_PATH. The 'ID' column becomes the index.
df = pd.read_csv(PTRAIN_PATH, index_col="ID")

  data = self._reader.read(nrows)


### Define columns

In [3]:
# Names of the columns that are treated as text (string) columns.
str_cols = [u'VAR_0001', u'VAR_0005', u'VAR_0044',
            u'VAR_0200', u'VAR_0202', u'VAR_0214',
            u'VAR_0216', u'VAR_0222', u'VAR_0237',
            u'VAR_0274', u'VAR_0283', u'VAR_0305',
            u'VAR_0325', u'VAR_0342', u'VAR_0352',
            u'VAR_0353', u'VAR_0354', u'VAR_0404',
            u'VAR_0466', u'VAR_0467', u'VAR_0493',
            u'VAR_1934']
# Keep only the columns that exist in the loaded frame and have object dtype.
# The builtin `object` is compared against instead of `np.object`: the latter
# was a plain alias of it and was removed in NumPy 1.24, where it raises an
# AttributeError that the handler below would not catch.
try:
    str_cols = [c for c in str_cols if c in df.columns and df[c].dtype == object]
except NameError:
    # `df` is not defined yet: keep the full, unfiltered list.
    pass

# See if the classes are skewed

In [5]:
# Count the rows of each target class and report their share of the data set.
# The percentages rely on true division (`from __future__ import division`
# under Python 2).
targets = df['target']
total_count = len(df)
neg_samples_count = len(targets[targets == 0])
pos_samples_count = len(targets[targets == 1])
neg_percent = neg_samples_count / total_count * 100
pos_percent = pos_samples_count / total_count * 100
print('%s negative samples; %.2f%% of total' % (neg_samples_count, neg_percent))
print('%s positive samples; %.2f%% of total' % (pos_samples_count, pos_percent))

111458 negative samples; 76.75% of total
33773 positive samples; 23.25% of total


## Cast string columns as string and make 'null' data uniform (instead of nan, -1, [], etc.)

In [4]:
def filter_str(str_cell):
    """Normalize one cell to a clean lowercase string, or None if it is 'null'.

    The cell is converted with ``str``, every run of non-word characters
    and underscores is collapsed into a single space, and the result is
    stripped and lowercased.

    Returns None when the normalized text is one of the null markers,
    otherwise the normalized text. Because the minus sign is removed by the
    normalization, ``'-1'`` becomes ``'1'``; a literal ``'1'`` is therefore
    treated as null as well.
    """
    null_markers = ('1', '-1', '[]', 'nan', '')
    normalized = re.sub(r'[\W_]+', ' ', str(str_cell)).strip().lower()
    return None if normalized in null_markers else normalized

# Cast every text column to string and normalize each cell with filter_str.
# The builtin `str` is used instead of `np.str`: the latter was only an alias
# of it (deprecated in NumPy 1.20, removed in 1.24).
df[str_cols] = df[str_cols].astype(str).applymap(filter_str)
df[str_cols]

Unnamed: 0_level_0,VAR_0001,VAR_0005,VAR_0044,VAR_0200,VAR_0202,VAR_0214,VAR_0216,VAR_0222,VAR_0237,VAR_0274,...,VAR_0325,VAR_0342,VAR_0352,VAR_0353,VAR_0354,VAR_0404,VAR_0466,VAR_0467,VAR_0493,VAR_1934
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,h,c,,ft lauderdale,batchinquiry,,ds,c6,fl,fl,...,,cf,o,u,o,chief executive officer,,,community association manager,iaps
4,h,b,,santee,batchinquiry,,ds,c6,ca,mi,...,h,ec,o,r,r,,i,discharged,,iaps
5,h,c,,reedsville,batchinquiry,,ds,c6,wv,wv,...,r,uu,r,r,,,,,,iaps
7,h,c,,liberty,batchinquiry,,ds,c6,tx,tx,...,h,,r,r,,,,,,rcc
8,r,n,,frankfort,batchinquiry,,ds,c6,il,il,...,s,,r,u,o,,,,,branch
14,r,c,,spring,batchinquiry,,ds,c6,tx,me,...,h,fe,u,r,r,,,,,iaps
16,h,c,,gresham,batchinquiry,,ds,c6,or,ca,...,s,dc,o,o,o,,,,,iaps
20,r,b,,warner robins,batchinquiry,,ds,c6,ga,sc,...,s,ff,u,o,r,,,,,iaps
21,r,n,,san antonio,batchinquiry,,ds,c6,tx,tx,...,s,,r,u,u,,,,,mobile
22,r,n,,norristown,batchinquiry,,ds,c6,pa,pa,...,s,ee,u,u,u,,,,,branch


# Vectorize String and Datetime columns

## String columns

### See how many different values the string columns have

In [35]:
str_desc = df[str_cols].describe()
# Order the columns by how varied they are. For text columns describe()
# reports a 'unique' row and no 'std' row, so sorting by 'std' raised a
# KeyError before the columns were label-encoded; numeric columns have
# 'std' but no 'unique'. Use whichever row is available.
sort_row = 'unique' if 'unique' in str_desc.index else 'std'
str_desc = pd.DataFrame(str_desc, columns=sorted(str_desc.columns, key=lambda c: str_desc.loc[sort_row, c]))
str_desc

Unnamed: 0,VAR_0214,VAR_0005,VAR_0467,VAR_0353,VAR_0352,VAR_0001,VAR_1934,VAR_0354,VAR_0283,VAR_0305,VAR_0325,VAR_0237,VAR_0342,VAR_0274,VAR_0493,VAR_0404,VAR_0200
count,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0,145231.0
mean,0.000537,0.662001,0.32188,2.182723,2.008228,1.17258,1.125985,2.015279,4.273241,5.047738,5.362078,23.839256,26.243385,27.826828,20.664059,75.749227,5948.37755
std,0.066898,0.755864,0.756309,0.918042,0.928806,0.983218,1.096356,1.123611,1.432471,1.739619,2.385425,13.32285,15.853942,16.728497,87.876039,278.754038,3487.11275
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,5.0,5.0,3.0,11.0,17.0,12.0,0.0,0.0,2804.0
50%,0.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,5.0,6.0,7.0,26.0,32.0,28.0,0.0,0.0,5894.0
75%,0.0,1.0,0.0,3.0,3.0,2.0,2.0,3.0,5.0,6.0,7.0,36.0,40.0,41.0,0.0,0.0,8983.0
max,12.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,6.0,7.0,8.0,45.0,49.0,56.0,602.0,1787.0,12275.0


Column VAR_0044 does not contain a single non-null value, so drop it.

In [25]:
# Remove the column from the frame and from the list of string columns.
# Both steps tolerate the column being gone already, so the cell can be
# re-run (or run on a frame that no longer has the column) without raising.
df.drop('VAR_0044', axis=1, inplace=True, errors='ignore')
if 'VAR_0044' in str_cols:
    str_cols.remove('VAR_0044')

### Columns VAR_0202, VAR_0216, VAR_0222 and VAR_0466 have only one value. Check if there's some correlation between the values and the target.

#### Replace their string values for 1 if there was something in the cell, or 0 if there wasn't.

In [26]:
# Print, for each of the four columns, the counts and percentages of every
# value (including missing) split by target class.
analyze_str_columns(['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466'], df)

Total samples: 145231
##############################
------Counts-------
        VAR_0202_batchinquiry  VAR_0202_None
target                                      
0                      111415             43
1                       33760             13
----Percentages----
        VAR_0202_batchinquiry  VAR_0202_None
target                                      
0                   76.745307      76.785714
1                   23.254693      23.214286
##############################
------Counts-------
        VAR_0216_ds  VAR_0216_None
target                            
0            111415             43
1             33760             13
----Percentages----
        VAR_0216_ds  VAR_0216_None
target                            
0         76.745307      76.785714
1         23.254693      23.214286
##############################
------Counts-------
        VAR_0222_c6  VAR_0222_None
target                            
0            111415             43
1             33760             13
----P

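In each of these columns the target is split almost identically whether the value is present or missing (about 76.7% negative and 23.3% positive in both cases), so the columns carry no information about the target and are dropped.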

In [27]:
# Remove the uninformative columns from the frame and from the list of
# string columns. Columns that are already absent are skipped, so the cell
# can be re-run without raising.
cols = ['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']
df.drop(cols, axis=1, inplace=True, errors='ignore')
for c in cols:
    if c in str_cols:
        str_cols.remove(c)
# `cols` was only a scratch name for this cell.
del cols

### Encode the labels of the rest of the columns

In [33]:
# Replace every remaining string column by integer codes.
# One fitted encoder is kept per column so that the codes can be mapped back
# to the original strings later (encoder.inverse_transform).
label_encoders = {}
for col in str_cols:
    encoder = LabelEncoder()
    # Missing cells are encoded as the empty string. A column mixing None
    # with strings cannot be sorted by LabelEncoder under Python 3, and ''
    # sorts before every other string, so missing values get code 0.
    # filter_str never returns '', so this does not collide with real values.
    df[col] = encoder.fit_transform(df[col].fillna(''))
    label_encoders[col] = encoder

# Save preprocessed data to another csv file

In [36]:
# Write the processed frame to PTRAIN_PATH. NOTE(review): this is the same
# path the frame was read from at the top of the notebook, so the input file
# is overwritten and a second run starts from already processed data.
df.to_csv(PTRAIN_PATH)

In [39]:
# Persist the names of the string columns that were dropped from the frame.
deleted_str_cols = ['VAR_0044', 'VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']
with open('deleted_str_cols.pickle', 'wb') as fp:
    pickle.dump(deleted_str_cols, fp)