In [107]:
from pandas2arff import *
import numpy as np
import pandas as pd

In [84]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
# Delete unused attributes
del train['PassengerId']
del train['Name']
del train['Ticket']
del train['Cabin']

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [86]:
# Convert all to nominal
print(">> Before")
print(train.dtypes)

train['Survived'] = train['Survived'].astype('category')
train['Pclass'] = train['Pclass'].astype('category')
train['Sex'] = train['Sex'].astype('category')
train['SibSp'] = train['SibSp'] > 0
train['Parch'] = train['Parch'] > 0
train['Embarked'] = train['Embarked'].astype('category')

print(">> After")
print(train.dtypes)

>> Before
Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object
>> After
Survived    category
Pclass      category
Sex         category
Age          float64
SibSp           bool
Parch           bool
Fare         float64
Embarked    category
dtype: object


In [87]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,True,False,7.25,S
1,1,1,female,38.0,True,False,71.2833,C
2,1,3,female,26.0,False,False,7.925,S
3,1,1,female,35.0,True,False,53.1,S
4,0,3,male,35.0,False,False,8.05,S


In [88]:
train.Age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [89]:
def scale(x):
    if x < 6:
        return '0-6'
    elif x < 13:
        return '6-13'
    elif x < 20:
        return '13-20'
    elif x < 28:
        return '20-28'
    elif x < 38:
        return '28-38'
    else:
        return '38-80'

train['Age'] = train['Age'].apply(scale)

In [90]:
print(train.Age.unique())

['20-28' '38-80' '28-38' '0-6' '13-20' '6-13']


In [91]:
train.Fare.describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [92]:
def scaleFare(x):
    minFare = train.Fare.min()
    lower = train.Fare.quantile(.25)
    fifty = train.Fare.quantile(.5)
    higher = train.Fare.quantile(.75)
    maxFare = train.Fare.max()
    
    if x < lower:
        return "{:.1f}-{:.1f}".format(minFare, lower)
    elif x < fifty:
        return "{:.1f}-{:.1f}".format(lower, fifty)
    elif x < higher:
        return "{:.1f}-{:.1f}".format(fifty, higher)
    else:
        return "{:.1f}-{:.1f}".format(higher, maxFare)
    
train['Fare'] = train['Fare'].apply(scaleFare)

In [93]:
print(train.Fare.unique())

['0.0-7.9' '31.0-512.3' '7.9-14.5' '14.5-31.0']


In [94]:
train.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891,891,891,891,891,891,891,889
unique,2,3,2,6,2,2,4,3
top,0,3,male,38-80,False,False,14.5-31.0,S
freq,549,491,577,365,608,678,226,644


In [95]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,20-28,True,False,0.0-7.9,S
1,1,1,female,38-80,True,False,31.0-512.3,C
2,1,3,female,20-28,False,False,7.9-14.5,S
3,1,1,female,28-38,True,False,31.0-512.3,S
4,0,3,male,28-38,False,False,7.9-14.5,S


In [98]:
train['Age'] = train['Age'].astype('category')
train['Fare'] = train['Fare'].astype('category')
train.dtypes

Survived    category
Pclass      category
Sex         category
Age         category
SibSp           bool
Parch           bool
Fare        category
Embarked    category
dtype: object

In [109]:
pandas2arff(train[['SibSp', 'Parch']], 'titanic_nomalized.arff')

True