In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads all imported
# modules before each cell executes, so edits to local module files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports and Configuration

In [2]:
import os, sys
import re as re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pandas_util

In [3]:
# configure various jupyter defaults (plotting here, pandas display options below)
# use seaborn's 'whitegrid' style for figures
sns.set_style('whitegrid')

# render matplotlib figures inline in the cell output
%matplotlib inline
# default figure size (width, height), in inches
plt.rcParams['figure.figsize'] = (9,6)

# Raise pandas' display limits so long and wide frames are shown without
# truncation; set_option accepts several (option, value) pairs in one call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
    'display.max_colwidth', 100,
)

## Examples

The examples below use the Kaggle Titanic training dataset (`train.csv`).

In [4]:
# Load the training data with the default integer index, so PassengerId stays
# an ordinary column (pass index_col='PassengerId' to index by it instead).
df = pd.read_csv('train.csv')
# Preview the first rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### advanced_describe

Includes non-numeric columns, missing counts and percentages, unique counts and percentages, unique values (for columns with fewer than 20 unique values), and data types.

In [5]:
# Summarize the Titanic frame; the returned table is displayed as the cell output
pandas_util.advanced_describe(df)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,missing%,unique,unique%,unique_values,dtype
Age,714,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0,177,14.75,89,7.42,,float64
Cabin,204,,,,,,,,687,57.25,148,12.33,,object
Embarked,889,,,,,,,,2,0.17,4,0.33,"[S, C, Q, nan]",object
Fare,891,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329,0,0.0,248,20.67,,float64
Name,891,,,,,,,,0,0.0,891,74.25,,object
Parch,891,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0,0,0.0,7,0.58,"[0, 1, 2, 5, 3, 4, 6]",int64
PassengerId,891,446.0,257.354,1.0,223.5,446.0,668.5,891.0,0,0.0,891,74.25,,int64
Pclass,891,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0,0,0.0,3,0.25,"[3, 1, 2]",int64
Sex,891,,,,,,,,0,0.0,2,0.17,"[male, female]",object
SibSp,891,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0,0,0.0,7,0.58,"[1, 0, 3, 4, 2, 5, 8]",int64


### fillna_by_group

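Replace nulls in `target_col` based on a group aggregate. In the example below, `Age` is the target column and the groups are formed from `Sex`, `SibSp`, and `Parch`.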

In [6]:
# Fill null Age values using groups formed from Sex, SibSp and Parch.
# NOTE(review): the return value is discarded, so df is presumably modified
# in place -- confirm against pandas_util.fillna_by_group.
pandas_util.fillna_by_group(df, 'Age', ['Sex', 'SibSp', 'Parch'])

In [7]:
# Preview the first 10 rows after the fillna_by_group call
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,30.0,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### add_by_regex

Extract a new feature from an existing column based on a regular expression. In the example below, a `Title` column is derived from `Name`.

In [8]:
# Derive a Title column from Name: the pattern captures the run of letters
# that follows a space and precedes a literal period (e.g. 'Mr' from
# 'Braund, Mr. Owen Harris'). A raw string is used so that `\.` reaches the
# regex engine untouched instead of being parsed as an invalid string escape.
pandas_util.add_by_regex(df, 'Name', 'Title', r' ([A-Za-z]+)\.')

In [9]:
# Preview the first 10 rows after the add_by_regex call
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr
5,6,0,3,"Moran, Mr. James",male,30.0,0,0,330877,8.4583,,Q,Mr
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs
