In [1]:
import csv 
import gzip 
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from collections import defaultdict

# reading in the data to get Field Names
data = []
with open('Mortality_05_UT.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    fieldNames = []
    for row in spamreader:
        fieldNames.append(row)
        break

In [2]:
# get meta data 
fieldNames = fieldNames[0]

In [3]:
# create List of dictionaries for each record in the dataset
with open('Mortality_05_UT.csv') as csvfile:
    data = []
    reader = csv.DictReader(csvfile,fieldnames=fieldNames)
    count = 0 
    for row in reader:
        if count != 0:
            data.append(row)
        count +=1
        

In [4]:
# remove records that have an empty age field 
cleanData = []
for datum in data:
    toAppend = True
    if datum['age'] is '':
        toAppend = False
        
    for k in datum.keys():
        if datum[k] is '':
            datum[k] = -100
    if toAppend:
        cleanData.append(datum)

In [5]:
# part of preprocessing 
useless = ['m_id','client_m_id','hl_id','house_no','PSU_ID','m_serial_no','date_of_death','month_of_death','year_of_death','is_death_reg','place_of_death','is_death_certificate_received','serial_num_of_infant_mother','death_period','months_of_pregnancy']

In [6]:
#part of preprocessing 
useful_stuff = ['id','state','house_hold_no','district','rural','stratum_code',\
               'deceased_sex','treatment_source','order_of_birth','sex','age',\
               'religion','marital_status','currently_attending_school', \
               'reason_for_not_attending_school','highest_qualification','occupation_status'\
              'disability_status','regular_treatment','regular_treatment_source','chew','smoke'\
              'injury_treatment_type','alcohol','house_status','owner_status','drinking_water_source'\
              'is_water_filter','water_filteration','toilet_used','is_toilet_shared','household_have_electricity'\
              'lighting_source','cooking_fuel','no_of_dwelling_rooms','kitchen_availability','is_radio',\
              'is_televison','is_computer','is_telephone', 'is_washing_machine', 'is_refrigerator', \
              'is_sewing_machine', 'is_bicycle', 'is_scooter', 'is_car', 'is_tractor', 'is_water_pump', \
              'cart', 'land_possessed', 'iscoveredbyhealthscheme', 'injury_treatment_type'\
              'house_structure']

In [7]:
# final features in consolidated data 
columns = ['id','state','house_hold_no','district','rural','stratum_code',\
'private_parts','order_of_birth','age','religion','marital_status'\
,'currently_attending_school', 'reason_for_not_attending_school','highest_qualification','occupation_status','disability_status','regular_treatment','regular_treatment_source','chew','smoke'\
,'injury_treatment_type','alcohol','house_status','owner_status','drinking_water_source'\
,'water_filteration','toilet_used','is_toilet_shared','household_have_electricity'\
,'lighting_source','cooking_fuel','no_of_dwelling_rooms','kitchen_availability','is_radio',\
'is_television','is_computer','is_telephone', 'is_washing_machine', 'is_refrigerator','is_sewing_machine', 'is_bicycle', 'is_scooter', 'is_car', 'is_tractor', 'is_water_pump', \
'cart','land_possessed','iscoveredbyhealthscheme'\
,'house_structure']




# number of categories for each type of one hot encoding 
indexDict = defaultdict(int)

indexDict['rural'] = 2
indexDict['private_parts'] = 2
indexDict['religion'] = 8
indexDict['marital_status'] = 8
indexDict['currently_attending_school'] = 3
indexDict['reason_for_not_attending_school'] = 9
indexDict['regular_treatment'] = 3
indexDict['regular_treatment_source'] = 15
indexDict['highest_qualification'] = 10
indexDict['occupation_status'] = 16
indexDict['disability_status'] = 8
indexDict['chew'] = 8
indexDict['smoke'] = 5
indexDict['injury_treatment_type'] = 8
indexDict['alcohol'] = 5
indexDict['house_status'] = 4 
indexDict['owner_status'] = 3
indexDict['drinking_water_source'] = 9
indexDict['water_filteration'] = 8
indexDict['toilet_used'] = 10
indexDict['is_toilet_shared'] = 2
indexDict['household_have_electricity'] = 2
indexDict['lighting_source'] = 6
indexDict['cooking_fuel'] = 10
indexDict['kitchen_availability'] = 5
indexDict['is_radio'] = 2
indexDict['is_television'] = 2
indexDict['is_computer'] = 3
indexDict['is_telephone'] = 4
indexDict['is_washing_machine'] = 2
indexDict['is_refrigerator'] = 2
indexDict['is_sewing_machine'] = 2
indexDict['is_bicycle'] = 2
indexDict['is_scooter'] = 2
indexDict['is_car'] = 2
indexDict['is_tractor'] = 2
indexDict['is_water_pump'] = 2
indexDict['cart'] = 4
indexDict['land_possessed'] = 6
indexDict['iscoveredbyhealthscheme'] = 3
indexDict['house_structure'] = 4

# features that don't need one hot encoding 
mansNotHot = [
 'house_hold_no',
 'stratum_code',
 'order_of_birth',
 'no_of_dwelling_rooms']

In [8]:
consolidated_data = []

for d in cleanData:
    tempDict = defaultdict(int)
    sex = None 
#     remapping categories that have something mapped to 0 
    for c in columns:
        if c is 'private_parts':
            if d['sex'] != -100:
                sex = int(d['sex'])
            elif d['deceased_sex'] != -100:
                sex = int(d['deceased_sex'])
            tempDict['private_parts'] = sex 
        elif c is 'regular_treatment_source':
            if d['regular_treatment_source']=='99':
                tempDict[c] = 14
            elif d['regular_treatment_source'] =='00':
                tempDict[c] = 15
        elif c is 'highest_qualification':
            if d['highest_qualification'] =='0':
                tempDict[c] = 10
        elif c is 'disability_status':
            if d['disability_status'] == '0':
                tempDict[c] = 8
        elif c is 'chew':
            if d['chew'] == '0':
                tempDict[c] = 8
        elif c is 'smoke':
            if d['smoke'] == '0':
                tempDict[c] = 5
        elif c is 'injury_treatment_type':
            if d['injury_treatment_type'] == '0':
                tempDict[c] = 8
        elif c is 'alcohol':
            if d['alcohol'] == '0':
                tempDict[c] = indexDict['alcohol']
        elif c is 'toilet_used':
            if d['toilet_used'] == '0':
                tempDict[c] = indexDict['toilet_used']
        elif c is 'cooking_fuel':
            if d['cooking_fuel'] == '0':
                tempDict[c] = indexDict['cooking_fuel']
        else:
            #no remapping needed 
            tempDict[c] = int(d[c])
    consolidated_data.append(tempDict)

print 'done'
            

done


In [9]:
# Neil is a beast 

#age
def checkAge(datum):
    return datum['age']

# add one hot encodings 
def appendFeats(datum,cat):
    val = int(datum[cat])
    
    retList = [0]*indexDict[cat]
    if val !=-100:
        retList[val-1] = 1
    return retList

def getAvg(cat):
    count = 0 
    num = 0 
    for d in consolidated_data:
        if int(d[cat]) != -100:
            count += int(d[cat])
            num +=1
    return 1.0*count/num
avgList = defaultdict(float)
for i in mansNotHot:
    avgList[i] = getAvg(i)

In [10]:
# for datum in cleanData, datum is a dictionary
def feature(datum):
    feat = []
    
    #not hot encoding  
    for category in mansNotHot:
        if datum[category] ==-100:
            feat.append(avgList[category])
        else:
            feat.append(datum[category])
    #one hot encoding 
    for category in indexDict.keys():
        feat += appendFeats(datum,category)
    feat.append(1)
    return feat


In [26]:
X_feat = [feature(d) for d in consolidated_data]
y_label = [checkAge(d) for d in consolidated_data]
y_svm = []
for label in y_label:
    y_svm.append(label/10)

In [12]:
from sklearn.decomposition import PCA


pca = PCA(n_components=100)
# pca.fit(X_feat)
# print "PCA:",pca.components_
# proj have 10 best features 
X_proj = pca.fit_transform(X_feat)

In [13]:
import numpy as np
import urllib
import scipy.optimize
import random

theta,residuals,rank,s = np.linalg.lstsq(X_proj, y_label)


In [14]:
print y_svm[:10]
print y_label[:10]

[1, 2, 2, 1, 2, 2, 3, 2, 2, 1]
[28, 45, 50, 36, 55, 55, 64, 49, 53, 30]


In [15]:
predictions = []
for x in X_proj:
    predictions.append(sum([xi*t for (xi,t) in zip(x,theta)]))

In [16]:
totalSE = 0 
for (a,b) in zip(predictions,y_label):
    totalSE+=(a-b)**2
mse = 1.0*totalSE/len(y_label)

In [17]:
predictions[1],y_label[1]

(-7.3901797731442276, 45)

In [18]:
#TODO: tune parameters to get better predictions, but code done 

# print X_feat[0]

In [19]:
# print X_feat[1]
max(y_label)

99

In [28]:
from sklearn import svm
# print len(X_train), len(Y_train)
# print X_train[:5]
print 'running'
clf = svm.SVC(C=0.1, kernel='linear')
clf.fit(X_feat, y_svm)



running


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
svm_predictions = clf.predict(X_feat)
print 'done'

done


In [29]:
totalSE = 0 
for (a,b) in zip(svm_predictions,y_svm):
    if a==b:
        totalSE+=1
accuracy = 1.0*totalSE/len(y_svm)

In [30]:
print accuracy

0.0494385409601


In [24]:
print svm_predictions[:20]

[2 2 1 2 2 3 2 2 2 1 2 2 3 1 3 2 2 2 2 3]


In [25]:
print y_svm[:20]

[1, 2, 2, 1, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 2, 2, 2, 2]
