In [6]:
%matplotlib inline
import csv as csv
import numpy as np

# Load train.csv into a 2-D NumPy array of strings (one row per passenger).
# The file is opened in a `with` block so the handle is always released, and
# with newline='' as the csv module requires for correct quoted-newline handling.
with open('train.csv', 'r', newline='') as train_file:
    csv_file = csv.reader(train_file)
    header = next(csv_file)  # Skip first line (column names)
    data = np.array(list(csv_file))

In [7]:
# Display the loaded training array; every field is still a string at this point.
data

array([['1', '0', '3', ..., '7.25', '', 'S'],
       ['2', '1', '1', ..., '71.2833', 'C85', 'C'],
       ['3', '1', '3', ..., '7.925', '', 'S'],
       ..., 
       ['889', '0', '3', ..., '23.45', '', 'S'],
       ['890', '1', '1', ..., '30', 'C148', 'C'],
       ['891', '0', '3', ..., '7.75', '', 'Q']], 
      dtype='<U82')

In [8]:
# Display the first passenger row to see which column index holds which field.
data[0]

array(['1', '0', '3', 'Braund, Mr. Owen Harris', 'male', '22', '1', '0',
       'A/5 21171', '7.25', '', 'S'], 
      dtype='<U82')

In [9]:
# Display the last passenger row (negative index counts from the end).
data[-1]

array(['891', '0', '3', 'Dooley, Mr. Patrick', 'male', '32', '0', '0',
       '370376', '7.75', '', 'Q'], 
      dtype='<U82')

In [14]:
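# Slicing note: data[0:,4], data[0::,4] and data[:,4] all select column 4 of
# every row, so the comparisons below are element-wise True.
#data[0:,4] == data[0::,4]
#data[0:,4] == data[:,4]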

In [16]:
# Overall survival proportion: column 1 is summed as the survivor count.
# Uses the builtin `float`; the `np.float` alias was removed in NumPy 1.24.
survived_column = data[:, 1].astype(float)
n_passengers = np.size(survived_column)
n_survived = np.sum(survived_column)
propn_survivors = n_survived / n_passengers
propn_survivors

0.38383838383838381

In [20]:
# Boolean masks selecting rows by the sex column (index 4).
women = data[:,4] == 'female'
men = data[:,4] == 'male'

# Column 1 for each group, converted to integers.
# Uses the builtin `int`; the `np.int` alias was removed in NumPy 1.24.
rows_women = data[women,1].astype(int)
rows_men = data[men,1].astype(int)

# Proportion of each group whose column-1 flag is 1.
propn_women_survived = np.sum(rows_women) / np.size(rows_women)
propn_men_survived = np.sum(rows_men) / np.size(rows_men)

In [22]:
# Print the male proportion first, then the female proportion, one per line.
for proportion in (propn_men_survived, propn_women_survived):
    print(proportion)

0.188908145581
0.742038216561


In [35]:
# Open the test set and consume its header row so iteration starts at the
# first passenger. Both handles stay open here; they are closed by the cell
# that writes the predictions.
# newline='' is what the csv module expects for files it reads or writes.
test_file = open('test.csv', newline='')
test_file_reader = csv.reader(test_file)
header = next(test_file_reader)

# Without newline='' the writer emits \r\r\n on Windows, i.e. blank rows.
predn_file = open('genderbasedmodel.csv','w', encoding='UTF-8', newline='')
predn_file_writer = csv.writer(predn_file)

In [36]:
print(header)
# Gender-only model: predict 1 for every female passenger and 0 otherwise.
# In the test file column 0 is PassengerId and column 3 is Sex (see header).
# try/finally guarantees both files are closed even if a row is malformed.
try:
    predn_file_writer.writerow(['PassengerId','Survived'])
    for row in test_file_reader:
        prediction = 1 if row[3] == 'female' else 0
        predn_file_writer.writerow([row[0], prediction])
finally:
    test_file.close()
    predn_file.close()

['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Now, let's try some more complicated conditions
https://www.kaggle.com/c/titanic/details/getting-started-with-python

In [45]:
# Gender, class and ticket price. Bin up ticket price into four bins.
# Exploration only: show the first ten fares (third column from the end) and
# how pd.cut would split them into 4 equal-width bins.
import pandas as pd
print(data[:10,-3])
# Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
print(pd.cut(data[:10,-3].astype(float),4,retbins=True))

['7.25' '71.2833' '7.925' '53.1' '8.05' '8.4583' '51.8625' '21.075'
 '11.1333' '30.0708']
(  (7.186, 23.258]
 (55.275, 71.283]
  (7.186, 23.258]
 (39.267, 55.275]
  (7.186, 23.258]
  (7.186, 23.258]
 (39.267, 55.275]
  (7.186, 23.258]
  (7.186, 23.258]
 (23.258, 39.267]
Levels (4): Index(['(7.186, 23.258]', '(23.258, 39.267]',
                   '(39.267, 55.275]', '(55.275, 71.283]'], dtype=object), array([  7.1859667,  23.258325 ,  39.26665  ,  55.274975 ,  71.2833   ]))


In [60]:
# Binning: fares at or above the ceiling are overwritten with (ceiling - 1)
# so that every passenger falls inside one of the fixed-width brackets.
# NOTE: this rewrites the fare column of `data` in place.
fare_ceiling = 40
# Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
capped_rows = data[:,-3].astype(float) >= fare_ceiling
data[ capped_rows, -3 ] = fare_ceiling - 1.0
fare_bracket_size = 10
# Integer division gives the bracket count directly (40 // 10 == 4).
n_brackets = fare_ceiling // fare_bracket_size
# Number of distinct values in column 2 (was hard-coded as 3).
n_classes = len(np.unique(data[:,2]))

In [61]:
# Zero-filled table indexed as [sex (0 = female, 1 = male), class, fare bracket].
table_shape = (2, n_classes, n_brackets)
survival_table = np.zeros(table_shape)

In [66]:
# Fill survival_table with the mean of column 1 for every
# (sex, class, fare-bracket) combination.
# The column conversions are loop-invariant, so they are done once up front.
# Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
is_female = data[:,4] == 'female'
is_male = data[:,4] == 'male'
class_values = data[:,2].astype(float)
fare_values = data[:,-3].astype(float)

for i in range(n_classes):
    in_class = class_values == i+1
    for j in range(n_brackets):
        # Half-open bracket [j*size, (j+1)*size).
        in_bracket = (
            (fare_values >= j*fare_bracket_size)
            & (fare_values < (j+1)*fare_bracket_size)
        )
        w_stats = data[is_female & in_class & in_bracket, 1]
        m_stats = data[is_male & in_class & in_bracket, 1]
        # An empty group yields NaN (with a RuntimeWarning); NaNs are
        # replaced in a later cell.
        survival_table[0,i,j] = np.mean(w_stats.astype(float))
        survival_table[1,i,j] = np.mean(m_stats.astype(float))



In [67]:
# Display the first slice of the table (index 0, filled from the 'female' rows).
survival_table[0]

array([[        nan,         nan,  0.83333333,  0.97727273],
       [        nan,  0.91428571,  0.9       ,  1.        ],
       [ 0.59375   ,  0.58139535,  0.33333333,  0.125     ]])

In [69]:
# Fix NaN values by the below method
survival_table[ survival_table != survival_table ] = 0.
survival_table[0]

array([[ 0.        ,  0.        ,  0.83333333,  0.97727273],
       [ 0.        ,  0.91428571,  0.9       ,  1.        ],
       [ 0.59375   ,  0.58139535,  0.33333333,  0.125     ]])

In [70]:
# create a model with a cut-off probability of 0.5
survival_table[survival_table >= 0.5] = 1   #survive
survival_table[survival_table < 0.5] = 0    #will die
survival_table

array([[[ 0.,  0.,  1.,  1.],
        [ 0.,  1.,  1.,  1.],
        [ 1.,  1.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.]]])

In [72]:
# Re-open the test set and skip its header row, then start the output file
# for the gender/class/fare model. Both handles stay open here; they are
# closed by the cell that writes the predictions.
# newline='' is what the csv module expects; without it the writer emits
# blank rows on Windows.
test_file = open('test.csv', newline='')
test_file_reader = csv.reader(test_file)
header = next(test_file_reader)
predn_file = open('genderclassmodel.csv','w', newline='')
predn_file_writer = csv.writer(predn_file)
predn_file_writer.writerow(['PassengerId','Survived'])

22

In [73]:
# Write one prediction per test passenger by looking up
# survival_table[sex, class, fare bracket].
# In the test file: column 0 = PassengerId, 1 = Pclass, 3 = Sex, 8 = Fare.
# try/finally guarantees both files are closed even if a row is malformed.
try:
    for row in test_file_reader:
        # Array indices must be integers; float indices raise IndexError.
        pclass = int(row[1])
        try:
            fare = float(row[8])
        except ValueError:
            # Missing/unparseable fare: fall back to a bracket chosen from the
            # class (class 1 -> bracket 2, class 2 -> 1, class 3 -> 0).
            bin_fare = 3 - pclass
        else:
            row[8] = fare
            # Bracket j covers [j*size, (j+1)*size). Fares at or above the
            # ceiling (including exactly fare_ceiling, which previously matched
            # no branch) go to the last bracket, as in the training binning.
            bin_fare = int(fare // fare_bracket_size)
            bin_fare = max(0, min(bin_fare, n_brackets - 1))
        sex_index = 0 if row[3] == 'female' else 1
        prediction = int(survival_table[sex_index, pclass - 1, bin_fare])
        predn_file_writer.writerow([row[0], prediction])
finally:
    test_file.close()
    predn_file.close()

### Using Pandas
https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii