# Satellite Image Data Set

## Data Preprocessing
Missing values - replace by mean, replace by neighbourhood, row-wise, row-spectral-wise.
Use multi-class SVM with different kernels; multi-class learning.
Stratified K-fold validation, so that every fold has the same proportion of each class.
https://archive.ics.uci.edu/ml/datasets/Statlog+(Landsat+Satellite)

## Read the input data

In [102]:
import numpy as np

def readInputFiles():
    """Load the training and test tables from comma-separated text files.

    Reads 'sat_noisy.trn' and 'sat-test.csv.dat' from the current working
    directory.

    Returns a tuple (XTrain, YTrain, rawTestData):
      * XTrain      - every column of the training table except the last
      * YTrain      - the last column of the training table
      * rawTestData - the test table exactly as loaded (not split)
    """
    trainTable = np.loadtxt('sat_noisy.trn', delimiter=',')
    testTable = np.loadtxt('sat-test.csv.dat', delimiter=',')

    # The last training column is split off as the target; the rest are features.
    features, labels = trainTable[:, :-1], trainTable[:, -1]
    return features, labels, testTable
 

## Handle Missing Values (NaN)

In [127]:

# Replacing NaN
# Replace by column mean, by zero, row-wise, row spectral-wise
import  scipy.stats as stats
def replaceMissingValues(X, replacement):
    """Fill the NaN entries of a 2-D float array using the chosen strategy.

    X is modified in place and the same array object is returned.

    Parameters:
      X           - 2-D numpy float array in which NaN marks a missing value.
      replacement - name of the fill strategy:
          'zeros'              every NaN becomes 0
          'colmean'            NaN-ignoring mean of the entry's column
          'colmedian'          NaN-ignoring median of the entry's column
          'rowmean'            NaN-ignoring mean of the entry's row
          'rowmedian'          NaN-ignoring median of the entry's row
          'rowspectralmean'    X is cut into 4 equal-width groups of adjacent
                               columns; each group is filled with 'rowmean'
          'rowspectralmedian'  same grouping, each group filled with
                               'rowmedian'

    Raises:
      ValueError - if `replacement` is not one of the names above, or (from
                   np.hsplit) if a spectral strategy is requested and the
                   number of columns is not divisible by 4.

    NOTE(review): the spectral strategies treat 4 blocks of *adjacent* columns
    as the groups. The UCI Statlog description lists the 4 spectral values of
    each pixel consecutively, which would put one band in every 4th column
    instead - confirm the column layout of the input files.
    """
    spectralStrategies = {
        'rowspectralmean': 'rowmean',
        'rowspectralmedian': 'rowmedian',
    }
    if replacement in spectralStrategies:
        # np.hsplit returns views into X, so filling each group in place
        # fills the corresponding columns of X itself.
        for group in np.hsplit(X, 4):
            replaceMissingValues(group, spectralStrategies[replacement])
        return X

    # strategy name -> (NaN-ignoring reducer, axis it reduces over)
    reducers = {
        'colmean': (np.nanmean, 0),
        'colmedian': (np.nanmedian, 0),
        'rowmean': (np.nanmean, 1),
        'rowmedian': (np.nanmedian, 1),
    }
    if replacement != 'zeros' and replacement not in reducers:
        # Previously an unknown name returned X with its NaNs untouched.
        raise ValueError("unknown replacement strategy: %r" % (replacement,))

    nanRows, nanCols = np.where(np.isnan(X))
    if replacement == 'zeros':
        X[nanRows, nanCols] = 0
        return X

    reducer, axis = reducers[replacement]
    fillValues = reducer(X, axis=axis)
    # Column statistics are looked up by column index, row statistics by row.
    lookup = nanCols if axis == 0 else nanRows
    X[nanRows, nanCols] = fillValues[lookup]
    return X

In [104]:
# #Multi Class SVM
# from sklearn import svm
# # clf = svm.SVC(decision_function_shape='ovo')
# # clf = svm.SVC(kernel='linear',probability=True)
# clf.fit(XTrain, YTrain) 
# # dec = clf.decision_function(XTest)
# # print dec.shape[1] # 6 classes: 6*5/2 = 15

# # clf.decision_function_shape = "ova"
# # dec = clf.decision_function(XTest)
# # dec = clf.predict(XTest)
# # print dec.shape[1] # 6 classes
# # YPred = clf.predict_proba(XTest)
# YPred = clf.predict(XTest)
# print YPred
# # predictedClasses =YPred.argmax(axis=1)
# # print predictedClasses

In [131]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
def classifyOneVsRestClassifier(XTrain, XTest, YTrain, YTest):
    """Fit a one-vs-rest LinearSVC on the training split and score it.

    Returns the percentage (0-100) of test samples whose predicted label
    equals the true label in YTest.
    """
    model = OneVsRestClassifier(LinearSVC(random_state=0))
    model.fit(XTrain, YTrain)
    YPred = model.predict(XTest)

    # A prediction is correct where predicted minus true label is zero.
    residual = YPred - YTest
    isCorrect = (residual == 0)
    numCorrect = residual[isCorrect].size
    return (100.0 * numCorrect) / (YPred.size)

In [132]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
def classifyOneVsOneClassifier(XTrain, XTest, YTrain, YTest):
    """Fit a one-vs-one LinearSVC on the training split and score it.

    Returns the percentage (0-100) of test samples whose predicted label
    equals the true label in YTest.
    """
    model = OneVsOneClassifier(LinearSVC(random_state=0))
    model.fit(XTrain, YTrain)
    YPred = model.predict(XTest)

    # Zero difference between predicted and true label marks a hit.
    residual = YPred - YTest
    numCorrect = residual[residual == 0].size
    return (100.0 * numCorrect) / (YPred.size)

In [133]:
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
def classifyOutputCodeClassifier(XTrain, XTest, YTrain, YTest):
    """Fit an output-code LinearSVC (code_size=2) and score it.

    Returns the percentage (0-100) of test samples whose predicted label
    equals the true label in YTest.
    """
    baseEstimator = LinearSVC(random_state=0)
    model = OutputCodeClassifier(baseEstimator, code_size=2, random_state=0)
    model.fit(XTrain, YTrain)
    YPred = model.predict(XTest)

    # Zero difference between predicted and true label marks a hit.
    residual = YPred - YTest
    numCorrect = residual[residual == 0].size
    return (100.0 * numCorrect) / (YPred.size)

In [134]:
from sklearn.ensemble import RandomForestClassifier
def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest):
    """Fit a 10-tree random forest on the training split and score it.

    No random_state is passed to the forest.

    Returns the percentage (0-100) of test samples whose predicted label
    equals the true label in YTest.
    """
    forest = RandomForestClassifier(n_estimators=10)
    YPred = forest.fit(XTrain, YTrain).predict(XTest)

    # Zero difference between predicted and true label marks a hit.
    residual = YPred - YTest
    hits = residual[residual == 0]
    return (100.0 * hits.size) / (YPred.size)

In [135]:
from sklearn.cross_validation import KFold
def kFoldCrossVal(XTrain, YTrain, classify):
    """Return the mean score of `classify` over 5 cross-validation folds.

    Parameters:
      XTrain   - 2-D numpy array of features, one sample per row.
      YTrain   - 1-D numpy array of labels aligned with the rows of XTrain.
      classify - callable (XTrain, XTest, YTrain, YTest) -> score; it is
                 called once per fold with that fold's train/test split.

    Returns:
      The sum of the per-fold scores divided by the number of folds.

    Uses plain KFold (not a stratified splitter), called with the
    sklearn.cross_validation signature KFold(n_samples, n_folds).
    """
    n_folds = 5
    # Take the sample count from the data instead of the former hard-coded
    # 4435, so the helper works for a training set of any size.
    n_samples = XTrain.shape[0]
    kf = KFold(n_samples, n_folds)
    score = 0.0
    for train, test in kf:
        score += classify(XTrain[train, :], XTrain[test, :], YTrain[train], YTrain[test])

    return score / n_folds

In [136]:
# Experiment: replace missing values (NaN) with zero, then print the
# cross-validated accuracy (percent) of each classifier on the training set.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain, YTrain, rawTestData = readInputFiles()
XTrain = replaceMissingValues(XTrain,'zeros')
# XTest is prepared here but is not used by the cross-validation below.
XTest = replaceMissingValues(rawTestData,'zeros')
score = kFoldCrossVal(XTrain, YTrain, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain, YTrain, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain, YTrain, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain, YTrain, classifyRandomForestClassifier)
print score3

36.7305524239
33.7542277339
41.6685456595
82.1420518602


In [138]:
# Experiment: replace missing values (NaN) with the column mean, then print
# the cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain1, YTrain1, rawTestData1 = readInputFiles()
XTrain1 = replaceMissingValues(XTrain1,'colmean')
# NOTE(review): the test table is filled from its own column means, not the
# training set's - confirm this is intended. XTest1 is not used below.
XTest1 = replaceMissingValues(rawTestData1,'colmean')
score = kFoldCrossVal(XTrain1, YTrain1, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain1, YTrain1, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain1, YTrain1, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain1, YTrain1, classifyRandomForestClassifier)
print score3

56.3697857948
59.5490417136
57.4971815107
82.8635851184


In [139]:
# Experiment: replace missing values (NaN) with the column median, then print
# the cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain2, YTrain2, rawTestData2 = readInputFiles()
XTrain2 = replaceMissingValues(XTrain2,'colmedian')
# NOTE(review): the test table is filled from its own column medians, not the
# training set's - confirm this is intended. XTest2 is not used below.
XTest2 = replaceMissingValues(rawTestData2,'colmedian')
score = kFoldCrossVal(XTrain2, YTrain2, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain2, YTrain2, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain2, YTrain2, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain2, YTrain2, classifyRandomForestClassifier)
print score3

49.9661781285
55.8962795941
46.3359639233
83.6527621195


In [140]:
# Experiment: replace missing values (NaN) with the mean of the sample's own
# row, then print the cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain3, YTrain3, rawTestData3 = readInputFiles()
XTrain3 = replaceMissingValues(XTrain3,'rowmean')
# XTest3 is prepared here but is not used by the cross-validation below.
XTest3 = replaceMissingValues(rawTestData3,'rowmean')
score = kFoldCrossVal(XTrain3, YTrain3, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain3, YTrain3, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain3, YTrain3, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain3, YTrain3, classifyRandomForestClassifier)
print score3

39.5490417136
55.7609921082
49.7632468997
82.5704622322


In [141]:
# Experiment: replace missing values (NaN) with the median of the sample's own
# row, then print the cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain4, YTrain4, rawTestData4 = readInputFiles()
XTrain4 = replaceMissingValues(XTrain4,'rowmedian')
# XTest4 is prepared here but is not used by the cross-validation below.
XTest4 = replaceMissingValues(rawTestData4,'rowmedian')
score = kFoldCrossVal(XTrain4, YTrain4, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain4, YTrain4, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain4, YTrain4, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain4, YTrain4, classifyRandomForestClassifier)
print score3

47.801578354
65.321307779
42.0293122886
82.8861330327


In [142]:
# Experiment: replace missing values (NaN) with the row spectral mean (row
# mean taken within each of 4 equal-width column groups), then print the
# cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain5, YTrain5, rawTestData5 = readInputFiles()
XTrain5 = replaceMissingValues(XTrain5,'rowspectralmean')
# XTest5 is prepared here but is not used by the cross-validation below.
XTest5 = replaceMissingValues(rawTestData5,'rowspectralmean')
score = kFoldCrossVal(XTrain5, YTrain5, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain5, YTrain5, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain5, YTrain5, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain5, YTrain5, classifyRandomForestClassifier)
print score3

54.182638106
62.6606538895
49.5603156708
82.7733934611


In [143]:
# Experiment: replace missing values (NaN) with the row spectral median (row
# median taken within each of 4 equal-width column groups), then print the
# cross-validated accuracy (percent) of each classifier.
# Print order: one-vs-rest, one-vs-one, output-code, random forest.
XTrain6, YTrain6, rawTestData6 = readInputFiles()
XTrain6 = replaceMissingValues(XTrain6,'rowspectralmedian')
# XTest6 is prepared here but is not used by the cross-validation below.
XTest6 = replaceMissingValues(rawTestData6,'rowspectralmedian')
score = kFoldCrossVal(XTrain6, YTrain6, classifyOneVsRestClassifier)
print score
score1 = kFoldCrossVal(XTrain6, YTrain6, classifyOneVsOneClassifier)
print score1
score2 = kFoldCrossVal(XTrain6, YTrain6, classifyOutputCodeClassifier)
print score2
score3 = kFoldCrossVal(XTrain6, YTrain6, classifyRandomForestClassifier)
print score3

42.3449830891
72.4464487035
45.1409244645
82.7508455468


In [129]:
def writePrdictedLabelFile(YPred):
    """Write the predicted labels to 'Predictions.csv' in the working directory.

    The file starts with the header line "Id,Prediction", followed by one
    line per prediction: a 1-based running id and the label converted to int.
    An existing file of that name is overwritten.

    Parameters:
      YPred - sequence of predicted labels; each must be convertible with
              int().
    """
    lines = ["Id,Prediction" + "\n"]
    # enumerate(..., 1) yields the 1-based ids directly and needs no xrange.
    for rowId, label in enumerate(YPred, 1):
        lines.append(str(rowId) + "," + str(int(label)) + "\n")

    # The context manager closes the file even if a write fails.
    with open("Predictions.csv", "w") as f:
        f.writelines(lines)

In [205]:
# Final run: fill NaNs with the row spectral median, print the cross-validated
# random-forest accuracy, then train on the full training set and write the
# test-set predictions to Predictions.csv.
XTrain, YTrain, rawTestData = readInputFiles()
XTrain = replaceMissingValues(XTrain,'rowspectralmedian')
XTest = replaceMissingValues(rawTestData,'rowspectralmedian')
# NOTE(review): this score comes from classifyRandomForestClassifier, which
# builds a 10-tree forest, while the model fitted below uses 100 trees.
score = kFoldCrossVal(XTrain, YTrain, classifyRandomForestClassifier)
print score
# Alternative final models, kept for reference:
#YPred = OneVsRestClassifier(LinearSVC(random_state=0)).fit(XTrain, YTrain).predict(XTest)
#YPred = OneVsOneClassifier(LinearSVC(random_state=0)).fit(XTrain, YTrain).predict(XTest)
YPred = RandomForestClassifier(n_estimators=100).fit(XTrain, YTrain).predict(XTest)
     
writePrdictedLabelFile(YPred)

82.5479143179
