In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np

from scipy import stats

# Plotting: seaborn styling on top of matplotlib.
import seaborn as sns
# Apply seaborn's default styling; color_codes=True is documented to remap
# matplotlib's shorthand color codes ('b', 'g', ...) to the seaborn palette.
sns.set(color_codes=True)

import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline



In [2]:
# Read the train and test CSV files. Rows are indexed by QuoteNumber and
# Original_Quote_Date is parsed into datetimes at load time.
train = pd.read_csv(
    './data/train.csv',
    index_col='QuoteNumber',
    parse_dates=['Original_Quote_Date'],
)
test = pd.read_csv(
    './data/test.csv',
    index_col='QuoteNumber',
    parse_dates=['Original_Quote_Date'],
)

In [13]:
# Number of columns in the training frame (the index column is not counted).
# Parenthesised single-argument print runs on both Python 2 and Python 3.
print(len(train.columns))

298


## Exploratory Data Analysis

In [None]:
# check to see if there are missing values
def get_cols_with_missing_values(df):
    """Return the labels of the columns of ``df`` that contain a null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, for which at least one entry is
        null. Empty when no column has a null.
    """
    has_missing = df.isnull().any()
    # Pair each label with its flag instead of looking every label up through
    # the ``.ix`` indexer, which no longer exists in current pandas.
    return [col for col, flag in zip(has_missing.index.values, has_missing.values) if flag]

In [None]:
# Columns of the training set that contain at least one missing value.
print(get_cols_with_missing_values(train))

In [None]:
# Columns of the test set that contain at least one missing value.
print(get_cols_with_missing_values(test))

In [None]:
# For now, keep only the rows in which every column is populated; `train`
# itself is not modified.
train_with_no_missing_values = train.dropna(axis=0, how='any')

In [None]:
# Distribution of the target variable among the fully populated rows.
train_with_no_missing_values['QuoteConversion_Flag'].value_counts()

In [None]:
# Same distribution over the full training set, for comparison.
train['QuoteConversion_Flag'].value_counts()

**Most of the rows with a positive label contain at least one missing value.**

In [None]:
# Count the categorical features, i.e. the columns stored with object dtype.
print(len(train.select_dtypes(include=['object']).columns))

**27 of the features are categorical (object dtype).**

In [None]:
# Labels of every object-dtype column in the training set.
categorical_features = train.select_dtypes(include=['object']).columns

In [None]:
# Display the object-dtype column labels collected in the previous cell.
categorical_features

In [None]:
def print_group_by_categorical_dist(df, categorical_features, target='QuoteConversion_Flag'):
    """Print the row count of every (category value, target value) pair.

    For each column named in ``categorical_features`` the frame is grouped by
    that column together with ``target`` and the group sizes are printed,
    followed by blank lines separating it from the next feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns and the target column.
    categorical_features : iterable
        Column labels to report on. Nothing is printed when it is empty.
    target : label, optional
        Column to cross-tabulate against; defaults to 'QuoteConversion_Flag'.

    Returns
    -------
    None
    """
    for col in categorical_features:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(df.groupby([col, target]).size())
        print('\n\n')

In [None]:
# Conversion counts broken down by each categorical feature of the training set.
print_group_by_categorical_dist(df=train, categorical_features=categorical_features)

## Add time-related features

In [None]:
# Derive calendar year and month columns from the parsed quote date.
# Note: this adds the columns to `train` in place.
train['year'] = train['Original_Quote_Date'].dt.year
train['month'] = train['Original_Quote_Date'].dt.month

In [None]:
# Row counts per (year, flag), pivoted so each flag value becomes a column
# and each year a row.
y = train.groupby(['year', 'QuoteConversion_Flag']).size().unstack(level=1)

In [None]:
# One line per flag value: counts of quotes by year.
plt.plot(y.index.values, y[0], color='b', alpha=0.3, label='Negative')
plt.plot(y.index.values, y[1], color='g', alpha=0.3, label='Positive')
plt.xlabel('Year')
plt.ylabel('Count of different results')
# Trailing semicolon suppresses the repr of the returned object.
plt.legend(loc='best');

In [None]:
# Row counts per (month, flag), pivoted so each flag value becomes a column
# and each month a row.
g = train.groupby(['month', 'QuoteConversion_Flag']).size().unstack(level=1)

In [None]:
# One line per flag value: counts of quotes by month.
plt.plot(g.index.values, g[0], color='b', alpha=0.3, label='Negative')
plt.plot(g.index.values, g[1], color='g', alpha=0.3, label='Positive')
plt.xlabel('Month')
plt.ylabel('Count of different results')
# Trailing semicolon suppresses the repr of the returned object.
plt.legend(loc='best');

In [None]:
# Relationship between the (year, month) of the original quote and the
# conversion flag: counts per group, with the flag values pivoted to columns.
ym = train.groupby(['year', 'month', 'QuoteConversion_Flag']).size().unstack(level=2)

In [None]:
# Line plot of the per-(year, month) counts, one line per flag value.
ym.plot(kind='line');

## Are the rest of the features numerical or ordinal?

In [9]:
def type_of_features(df, col_types):
    """Print the number of distinct values of every column of the given dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col_types : list
        Dtype selectors passed to ``DataFrame.select_dtypes`` as the dtypes
        to include, e.g. ``['object']`` or ``['int64', 'float64']``.

    Returns
    -------
    None
        One line is printed per selected column. A missing value counts as
        one distinct value because ``Series.unique`` keeps it.
    """
    selected = df.select_dtypes(include=col_types)

    for name in selected.columns:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Feature Name: {} and number of unique values {} '.format(name, len(df[name].unique())))

In [11]:
# Cardinality of every numeric (32/64-bit integer or float) column.
type_of_features(df=train, col_types=['int64', 'float64', 'int32', 'float32'])

Feature Name: QuoteConversion_Flag and number of unique values 2 
Feature Name: Field7 and number of unique values 28 
Feature Name: Field8 and number of unique values 38 
Feature Name: Field9 and number of unique values 5 
Feature Name: Field11 and number of unique values 11 
Feature Name: CoverageField1A and number of unique values 26 
Feature Name: CoverageField1B and number of unique values 26 
Feature Name: CoverageField2A and number of unique values 25 
Feature Name: CoverageField2B and number of unique values 25 
Feature Name: CoverageField3A and number of unique values 25 
Feature Name: CoverageField3B and number of unique values 25 
Feature Name: CoverageField4A and number of unique values 25 
Feature Name: CoverageField4B and number of unique values 25 
Feature Name: CoverageField5A and number of unique values 3 
Feature Name: CoverageField5B and number of unique values 4 
Feature Name: CoverageField6A and number of unique values 3 
Feature Name: CoverageField6B and number of

In [10]:
# Cardinality of every object-dtype column.
type_of_features(df=train, col_types=['object'])

Feature Name: Field6 and number of unique values 8 
Feature Name: Field10 and number of unique values 8 
Feature Name: Field12 and number of unique values 2 
Feature Name: CoverageField8 and number of unique values 7 
Feature Name: CoverageField9 and number of unique values 12 
Feature Name: SalesField7 and number of unique values 7 
Feature Name: PersonalField7 and number of unique values 3 
Feature Name: PersonalField16 and number of unique values 50 
Feature Name: PersonalField17 and number of unique values 66 
Feature Name: PersonalField18 and number of unique values 61 
Feature Name: PersonalField19 and number of unique values 57 
Feature Name: PropertyField3 and number of unique values 3 
Feature Name: PropertyField4 and number of unique values 3 
Feature Name: PropertyField5 and number of unique values 2 
Feature Name: PropertyField7 and number of unique values 19 
Feature Name: PropertyField14 and number of unique values 4 
Feature Name: PropertyField28 and number of unique val

In [12]:
# Summary statistics for SalesField8.
train['SalesField8'].describe()

count    260753.000000
mean      33734.858314
std       19444.146851
min           1.000000
25%       16905.000000
50%       33864.000000
75%       50617.000000
max       67164.000000
Name: SalesField8, dtype: float64