In [1]:
import csv

In [2]:
import numpy as np

In [4]:
# Read every row of train.csv (header row included) into a list of lists of
# strings. The `with` block guarantees the file handle is released as soon as
# the rows are loaded, instead of leaking it for the life of the kernel.
# 'rb' is kept from the original: this notebook is Python 2, where the csv
# module expects files opened in binary mode.
with open('train.csv', 'rb') as train_file:
    csv_file = csv.reader(train_file)
    data = list(csv_file)

In [8]:
# Show the first row currently held in `data`.
print(data[0])

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [9]:
# Drop the header row and turn the remaining rows into a 2-D array of strings.
# The raw rows arrive as a plain list; once converted, `data` is an ndarray
# whose header is already gone. The type guard makes the cell safe to re-run:
# without it every re-execution would silently discard one more passenger.
if not isinstance(data, np.ndarray):
    data = np.array(data[1:])

print(len(data))
print(data[0])

891
['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
 '7.25' '' 'S']


In [10]:
# Column 1 is 'Survived'; peek at the first five values.
print(data[:5, 1])

['0' '1' '1' '1' '0']


In [13]:
# Total passengers and total survivors (column 1 is 'Survived', stored as the
# strings '0'/'1', so it is converted to float before summing).
# The builtin `float` replaces the `np.float` alias, which newer NumPy
# releases deprecate and then remove; `np.sum` replaces the Python-level
# `sum`, which iterates the array element by element.
num_passengers = len(data)
num_survivors = np.sum(data[:, 1].astype(float))
print("%s %s" % (num_passengers, num_survivors))

891 342.0


In [14]:
# Overall survival rate: survivors divided by passengers.
proportion_survivors = num_survivors / num_passengers
print(proportion_survivors)

0.383838383838


In [25]:
# Boolean row masks over column 4 ('Sex'): rows marked "female", and all
# remaining rows.
women_only = (data[:, 4] == "female")
men_only = ~women_only

In [26]:
# 'Survived' values (column 1) for each sex, as floats so they can be averaged.
# The builtin `float` replaces the `np.float` alias, which newer NumPy
# releases deprecate and then remove.
women_passengers = data[women_only, 1].astype(float)
men_passengers = data[men_only, 1].astype(float)

# The mean of a 0/1 column is the proportion of survivors.
proportion_women_survivors = np.mean(women_passengers)
proportion_men_survivors = np.mean(men_passengers)

print("%s %s" % (proportion_women_survivors, proportion_men_survivors))

0.742038216561 0.188908145581


In [46]:
# Open the test set and consume its header row, so the reader is positioned
# on the first passenger. The file stays open until the cell that writes the
# predictions closes it.
test_file = open('test.csv', 'rb')
reader = csv.reader(test_file)
header = next(reader)

In [47]:
# Open the output file for the gender-only model and wrap it in a CSV writer.
# The handle is left open here; it is closed after the prediction rows are
# written.
prediction_file = open('genderbasedmodel.csv','wb')
writer = csv.writer(prediction_file)

In [48]:
# Gender-only model: predict survival ('1') for every passenger whose
# column 3 reads 'female', and death ('0') for everyone else.
writer.writerow(["PassengerId", "Survived"])
for passenger in reader:
    survived = '1' if passenger[3] == 'female' else '0'
    writer.writerow([passenger[0], survived])

# Release both files now that every prediction has been written.
test_file.close()
prediction_file.close()

In [50]:
# Cap fares so they fit a fixed number of equal-width brackets: any fare at or
# above the ceiling is overwritten (in place, in column 9 'Fare') with a value
# just below it, which puts it in the top bracket.
# The builtin `float` replaces the `np.float` alias, which newer NumPy
# releases deprecate and then remove.
fare_ceiling = 40
data[data[:, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

# Floor division keeps the bracket count an integer even under true division
# (Python 3 or `from __future__ import division`); np.zeros below requires
# integer dimensions.
fare_bracket_size = 10
num_price_brackets = fare_ceiling // fare_bracket_size

# Number of distinct values in column 2 ('Pclass').
num_classes = len(np.unique(data[:, 2]))

# Axes: [sex (0 = female, 1 = male), class index, fare bracket].
survival_table = np.zeros((2, num_classes, num_price_brackets))

In [51]:
# Report the dimensions used for the survival table.
print("%s %s" % (num_price_brackets, num_classes))

4 3


In [52]:
# Inspect the first five rows after the fare cap was applied.
print(data[:5])

[['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
  '7.25' '' 'S']
 ['2' '1' '1' 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
  'female' '38' '1' '0' 'PC 17599' '39.0' 'C85' 'C']
 ['3' '1' '3' 'Heikkinen, Miss. Laina' 'female' '26' '0' '0'
  'STON/O2. 3101282' '7.925' '' 'S']
 ['4' '1' '1' 'Futrelle, Mrs. Jacques Heath (Lily May Peel)' 'female' '35'
  '1' '0' '113803' '39.0' 'C123' 'S']
 ['5' '0' '3' 'Allen, Mr. William Henry' 'male' '35' '0' '0' '373450'
  '8.05' '' 'S']]


In [56]:
# Fill survival_table[sex, class, fare bracket] with the mean of the
# 'Survived' column (column 1) for each group.
#
# The columns used for grouping are converted once, outside the loops, instead
# of being re-parsed from strings on every iteration. The builtin `float`
# replaces the `np.float` alias, which newer NumPy releases deprecate and then
# remove.
passenger_class = data[:, 2].astype(float)
fare = data[:, 9].astype(float)
is_female = data[:, 4] == 'female'
is_male = data[:, 4] == 'male'

for i in range(num_classes):
    # Class values are compared against 1..num_classes; the table is 0-based.
    in_class = passenger_class == i + 1

    for j in range(num_price_brackets):
        # Bracket j is the half-open fare interval [j*size, (j+1)*size).
        in_bracket = (fare >= j * fare_bracket_size) & \
                     (fare < (j + 1) * fare_bracket_size)

        women_only = data[is_female & in_class & in_bracket, 1]
        men_only = data[is_male & in_class & in_bracket, 1]

        # A group with no passengers gets 0 directly. Taking the mean of an
        # empty selection would give NaN (plus a RuntimeWarning) that then has
        # to be patched back to 0 afterwards.
        survival_table[0, i, j] = \
            np.mean(women_only.astype(float)) if women_only.size else 0.
        survival_table[1, i, j] = \
            np.mean(men_only.astype(float)) if men_only.size else 0.

In [57]:
# Survival proportions indexed by [sex, class, fare bracket].
print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


In [60]:
# Threshold the survival proportions at 0.5: groups with a proportion of at
# least one half are predicted to survive (1.0), all others to die (0.0).
binary_survival_table = (survival_table >= 0.5).astype(float)

In [61]:
# Show the 0/1 prediction table.
print(binary_survival_table)

[[[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


In [67]:
# Reopen the test set for the second model and skip past its header row.
# The handle stays open until the prediction loop closes it.
test_file = open('test.csv', 'rb')
reader = csv.reader(test_file)
headers = next(reader)

In [63]:
# Open the output file for the gender/class/fare model, wrap it in a CSV
# writer and emit the header row. The handle is left open here; it is closed
# after the prediction rows are written.
predictions_file = open('genderclassmodel.csv','wb')
writer = csv.writer(predictions_file)
writer.writerow(['PassengerId','Survived'])

In [66]:
# Gender/class/fare model: for every test passenger, look up the 0/1
# prediction in binary_survival_table[sex, class index, fare bracket] and
# write "PassengerId,Survived".
#
# Column indices used on each test row: 0 = PassengerId, 1 = Pclass, 3 = Sex,
# 8 = Fare (assumes test.csv has train.csv's columns minus 'Survived' --
# TODO confirm against the file).
#
# NOTE: this cell consumes `reader` and closes both files, so the two cells
# that open test.csv and genderclassmodel.csv must be re-run before running
# this one again; otherwise it fails with "I/O operation on closed file".
#
# Fixes relative to the previous version:
#   * male rows wrote row[1] (Pclass) as the id; they now write row[0];
#   * male rows read the female slice (index 0); they now read index 1;
#   * brackets are the half-open intervals [lo, hi) used when the table was
#     built, instead of (lo, hi];
#   * a fare of exactly 0 matched no bracket and silently reused the previous
#     row's bracket; every fare now maps to a bracket;
#   * only ValueError (unparseable/missing fare) is caught, not everything;
#   * table indices are ints, not floats;
#   * both files are closed even if a row raises.
last_bracket = int(num_price_brackets) - 1

try:
    for row in reader:
        pclass = int(float(row[1]))

        try:
            fare = float(row[8])
        except ValueError:
            # Fare missing or not numeric: fall back to a bracket derived
            # from the class (class 1 -> 2, class 2 -> 1, class 3 -> 0).
            bin_fare = 3 - pclass
        else:
            if fare >= fare_ceiling:
                # Training fares at or above the ceiling were capped into
                # the top bracket, so do the same here.
                bin_fare = last_bracket
            else:
                bin_fare = max(0, int(fare // fare_bracket_size))

        # Axis 0 of the table: 0 = female, 1 = male.
        sex_index = 0 if row[3] == 'female' else 1
        survived = int(binary_survival_table[sex_index, pclass - 1, bin_fare])
        writer.writerow([row[0], '%d' % survived])
finally:
    test_file.close()
    predictions_file.close()

ValueError: I/O operation on closed file

In [65]:
# Re-display the survival proportions for reference.
print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]
