# Titanic Data Set, More Practice With Pandas
## Cory Nichols - MSDS

In [1]:
# represent nominal and ordinal attributes first
# how we represent data types in computer is very specific
# workbook for titanic data, about 1K people on Titanic
import pandas as pd
import numpy as np

df = pd.read_csv('data/titanic.csv')

# lets look at data types
df.head()
# class, name, sex, age, sibiling or spouse #, ticket, fare, cabin, embarked

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [3]:
# all continuous attributes
# describes data types, floats represent continuous attributes: interval and ratio
# ints represent ordinals normally
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [4]:
# percentage of people that died on titanic
# length of data frame where people died divided by total amt
float(len(df[df.Survived==0]))/ len(df) *100

61.61616161616161

In [5]:
# group by class person came from
# not a complete list, but a good representation of 1K+ passengers
df_grouped = df.groupby('Pclass')
# get the sum of those who survived
print df_grouped.Survived.sum() # where 1 = survived or event
print '----------------------------'
# count all of the records where we have a survived status
print df_grouped.Survived.count()
print '----------------------------'
# get percentage of people who survived by what class theyre in and what ticket was purchased
print (df_grouped.Survived.sum() / df_grouped.Survived.count()) * 100

Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64
----------------------------
Pclass
1    216
2    184
3    491
Name: Survived, dtype: int64
----------------------------
Pclass
1    62.962963
2    47.282609
3    24.236253
Name: Survived, dtype: float64


In [6]:
# how to break up the age variable into buckets? use pd.cut
# for 3 buckets, must have 4 numbers in range in this case to set boundaries
# e.g., children are 0 to 15, adults 16 to 64, seniors 65 and older (1e6 so 1000000 1M)
df['age_range'] = pd.cut(df.Age,[0, 16, 65, 1e6],3, labels=['child', 'adult', 'senior'])
df.age_range.describe()
# 714 records are classified
# 3 unique types (child, adult, senior)
# top frequency category is adult with 606 instances or objects

count       714
unique        3
top       adult
freq        606
Name: age_range, dtype: object

In [7]:
# now lets group by class and age range, then look at survival rate
df_grouped = df.groupby(['Pclass', 'age_range'])
print 'Percentage of survivors in each group'
print df_grouped.Survived.sum() / df_grouped.Survived.count() * 100

Percentage of survivors in each group
Pclass  age_range
1       child        88.888889
        adult        65.317919
        senior       25.000000
2       child        90.476190
        adult        42.666667
        senior        0.000000
3       child        40.000000
        adult        20.141343
        senior        0.000000
Name: Survived, dtype: float64


### Dealing with Missing Values

In [8]:
# missing values always common, third class left out, data may not be pertinent, not applicable, eg childrens income
# must take statistics into account for machine learning algorithms
# imputation based on specific cuts of the data, for instance based on class and sibling spouse
# data may not be collected, e.g. third class details seem to be lacking
# how do we fill in the values for age based on other demographics?
del df['PassengerId']
del df['Name']
del df['Cabin']
del df['Ticket']

df_grouped = df.groupby(['Pclass', 'SibSp'])
print df_grouped.describe()

                           Age        Fare       Parch    Survived
Pclass SibSp                                                      
1      0     count  113.000000  137.000000  137.000000  137.000000
             mean    39.181416   75.223356    0.270073    0.562044
             std     14.844591   87.103081    0.575270    0.497956
             min      4.000000    0.000000    0.000000    0.000000
             25%     28.000000   27.720800    0.000000    0.000000
             50%     37.000000   39.600000    0.000000    1.000000
             75%     50.000000   80.000000    0.000000    1.000000
             max     80.000000  512.329200    2.000000    1.000000
       1     count   65.000000   71.000000   71.000000   71.000000
             mean    37.414154   88.492021    0.422535    0.746479
             std     14.690355   40.307129    0.786711    0.438123
             min      0.920000   39.600000    0.000000    0.000000
             25%     28.000000   56.414600    0.000000    0.50

In [9]:
# imputation: lambda function fills in missing values
# lambda is an anonymous function
# go into each group, call lambda
# call grp.fillna, any time we see a # fill with median of group to class
# My takeout so far is that .transform will work (or deal) with Series (columns) in isolation from each other. 
# transform will look at the dataframe columns one by one and return back a series (or group of series) 'made' 
# of scalars which are repeated len(input_column) times.
# transform gets rid of data grouping to give original data set in original ordering
# http://stackoverflow.com/questions/27517425/apply-vs-transform-on-a-group-object
df_imputed = df_grouped.transform(lambda grp: grp.fillna(grp.median()))
print df_imputed
# add back in PClass and SibSp
df_imputed[['Pclass', 'SibSp']] = df[['Pclass','SibSp']]
df_imputed['Sex'] = df['Sex']

print df_imputed.info()

     Survived  Age  Parch      Fare
0           0   22      0    7.2500
1           1   38      0   71.2833
2           1   26      0    7.9250
3           1   35      0   53.1000
4           0   35      0    8.0500
5           0   26      0    8.4583
6           0   54      0   51.8625
7           0    2      1   21.0750
8           1   27      2   11.1333
9           1   14      0   30.0708
10          1    4      1   16.7000
11          1   58      0   26.5500
12          0   20      0    8.0500
13          0   39      5   31.2750
14          0   14      0    7.8542
15          1   55      0   16.0000
16          0    2      1   29.1250
17          1   30      0   13.0000
18          0   31      0   18.0000
19          1   26      0    7.2250
20          0   35      0   26.0000
21          1   34      0   13.0000
22          1   15      0    8.0292
23          1   28      0   35.5000
24          0    8      1   21.0750
25          1   38      5   31.3875
26          0   26      0   

In [10]:
df_imputed['age_range'] = pd.cut(df_imputed.Age,[0,16,65,1e6], 3, labels = ['child', 'adult', 'senior'])
print df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived     891 non-null int64
Age          884 non-null float64
Parch        891 non-null int64
Fare         891 non-null float64
Pclass       891 non-null int64
SibSp        891 non-null int64
Sex          891 non-null object
age_range    884 non-null category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 56.6+ KB
None


In [11]:
# eliminate entries that still have empty values
# if missing value, delete it
df_imputed.dropna(inplace=True)
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 884 entries, 0 to 890
Data columns (total 8 columns):
Survived     884 non-null int64
Age          884 non-null float64
Parch        884 non-null int64
Fare         884 non-null float64
Pclass       884 non-null int64
SibSp        884 non-null int64
Sex          884 non-null object
age_range    884 non-null category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 56.1+ KB


In [12]:
# imputed falls off like a poop
df_grouped = df_imputed.groupby(['Pclass','age_range'])
print 'Percentage of Survivors in Each Group, With Imputed Values'
print df_grouped.Survived.sum() / df_grouped.Survived.count() * 100

Percentage of Survivors in Each Group, With Imputed Values
Pclass  age_range
1       child        88.888889
        adult        62.561576
        senior       25.000000
2       child        90.476190
        adult        42.236025
        senior        0.000000
3       child        37.837838
        adult        22.303922
        senior        0.000000
Name: Survived, dtype: float64


In [13]:
# normalize all of the attributes to fit on SAME SCALE: in this case between 0 and 1, primitive way
# by taking the value - the minimum of all of the values / max of all values - min of all values
# e.g. min not be zero or max be 10,000 on one var and min of 5MM and max of 100MM on another
# basically places all variables on same scale
# subtract off min, divide by max
# everything b/w 0 and 1 using this method
# data doesnt have to fit completely into memory to call the functions
# however, normally we need zero mean and unit SD, truly normal data set, divide by the standard deviation
# but here is this method anyway!!11

df_sub = df_imputed[['Survived','Age','Pclass','Fare']]
df_normalized = (df_sub-df_sub.min())/(df_sub.max()-df_sub.min()) # call min and max on top of DF

print df_normalized.describe()

         Survived         Age      Pclass        Fare
count  884.000000  884.000000  884.000000  884.000000
mean     0.386878    0.362684    0.651584    0.062281
std      0.487311    0.167234    0.418550    0.097161
min      0.000000    0.000000    0.000000    0.000000
25%      0.000000    0.271174    0.500000    0.015412
50%      0.000000    0.321438    1.000000    0.028213
75%      1.000000    0.459663    1.000000    0.059532
max      1.000000    1.000000    1.000000    1.000000


In [37]:
# let's normalize data with a mean of 0 and standard deviation of 0
from sklearn.preprocessing import StandardScaler

# create a pure data matrix with values
df_matrix = df_imputed[['Survived', 'Age', 'Pclass', 'Fare']].values

# call standardscaler.fit_transform to normalize entire matrix with mean of 0 and stddev of 1
s_obj = StandardScaler()

# create a matrix of normalized values from previously untransformed values
df_matrix_norm = s_obj.fit_transform(df_matrix)

# call axis = 0 for each variables' statistics
#print np.std(df_matrix_norm, axis = 0) # 0 is column, 1 is row, nothing is aggregate
#print np.mean(df_matrix_norm, axis= 0)

print df_matrix_norm.std(axis=0)
print df_matrix_norm.mean(axis=0)

# lets fit the other data and transform a new data set with it
df_fitter = s_obj.fit(df_matrix)
test= np.array([1.0,2.0,3.0,4.0])
scaler = df_fitter.transform(test)
print np.mean(scaler)

[ 1.  1.  1.  1.]
[ -4.11937955e-17   2.49172226e-16   2.13002064e-16   8.03781375e-17]
-0.130086567071


#### Representing Categorical Variables with One Hot Encoding

In [None]:
tmpdf = pd.get_dummies(df['Sex'], prefix='gender')
tmpdf.head()

In [None]:
tmpdf = pd.get_dummies(df_imputed['Pclass'], prefix = 'class')
tmpdf.head()

In [None]:
# import plotting functionality
# bar charts, scatters, advanced plots
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
# any graphics generated embedded into HTML

In [None]:
df_grouped = df_imputed.groupby(['Pclass','age_range'])
survival_rate = df_grouped.Survived.sum() / df_grouped.Survived.count()
# all we need is to plot survival_rate.plot() and pass a kind = bar horizontal
ax = survival_rate.plot(kind='barh')
print survival_rate

In [None]:
# lets use crosstab
# create a cross tab by PClass and Age_Range, with true/false for survived
survival = pd.crosstab([df_imputed['Pclass'], df_imputed['age_range']], df_imputed.Survived.astype(bool))
print survival
# # of times survived attribute was false or true
# change to percentages, divide survival type into totals
survival_rate = survival.div(survival.sum(1).astype(float), axis=0)
print survival_rate
survival_rate.plot(kind='barh', stacked=True, color=['black','gold'])

In [None]:
#df_imputed
survival_counts = pd.crosstab([df_imputed['Pclass'], df_imputed['Sex']], df_imputed.Survived.astype(bool))
print survival_counts
survival_counts.plot(kind='bar', stacked=True, color = ['black','gold'])

survival_rate = survival.div(survival.sum(1).astype(float), axis = 0)
survival_rate.plot(kind = 'barh', stacked=True, color = ['black', 'gold'])

In [None]:
# creating box plots
# only fare and age are continuous and can really be analyzed with boxplot
ax = df_imputed.boxplot()

In [None]:
# compute distributions with categories
# lets create boxplots of fare by Pclass
ax = df_imputed.boxplot(column='Fare', by ='Pclass')
df_imputed.columns

In [None]:
# lets plot multiple boxplots in a subplot figure with matplotlib

vars_to_plot_separate = [['Survived', 'SibSp', 'Pclass'],
                          ['Parch'],
                          ['Age'],
                          ['Fare']]
plt.figure(figsize=(10,6))
for index, plot_vars in enumerate(vars_to_plot_separate):
    plt.subplot(len(vars_to_plot_separate)/2,
               2,
               index+1) # 0th index takes up first subplot spot
    ax = df_imputed.boxplot(column=plot_vars)
plt.show()