# 1 - Data Reading

In [305]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#read from csv
test_df = pd.read_csv("sat-test-data.csv.dat", header=None)
train_df = pd.read_csv("sat-train.csv.dat", header =None)


# 2 - Global Variables

In [306]:
test_df
train_df
Y_train = train_df[len(train_df.columns)-1]
X_train = train_df.drop(len(train_df.columns)-1,axis=1)
num_rows_X_train = X_train[0].count()
num_columns_X_train = len(X_train.columns)

# 2 - Data Pre-Processing

## 2.1 - Missing Values

In [307]:
import  scipy.stats as stats
import numpy.ma as ma
import math

######replace by zero - black colour in RGB
#train_df = np.nan_to_num(train_df)
#test_df = np.nan_to_num(test_df)

######replace by 255 - white colour in RGB
#train_df = train_df.fillna(255)
#test_df = train_df.fillna(255)

######column minimum
#train_df = train_df.fillna(train_df.min())
#test_df = test_df.fillna(test_df.min())

######column maximum
#train_df = train_df.fillna(train_df.max())
#test_df = test_df.fillna(test_df.max())


#column mean
#train_df = train_df.fillna(train_df.mean())
#test_df = test_df.fillna(test_df.mean())

#column median 
#train_df = train_df.fillna(train_df.median())
#test_df = test_df.fillna(test_df.median())

# in the analysis of the rows we have to take into account that each four consecutive rows describe a pixel.
# Each of these four rows stands for: Green, R, IR, IR. These values refer to different things, and are analysed independently because of that.



#make four datasets. dataframe is a group of 9 pixels times 4 values for each pixel: |R|G|IR1|IR2
#this will allow to consider more the specificities of the problem

#make list with the indexes of all R
def separatePixelColumns( position ):
    indexes = range(0, num_columns_X_train-1)
    indexes = [x for x in indexes if x % 4 == position]
    df_p_attribute = test_df.iloc[:,indexes]
    return df_p_attribute

df_p_attribute_R = separatePixelColumns(0)
df_p_attribute_G = separatePixelColumns(1)
df_p_attribute_IR1 = separatePixelColumns(2)
df_p_attribute_IR2 = separatePixelColumns(3)

def fillMissingValuesByRow(df, function):
    for index, row in df.iterrows():
        value_without_nan = function(row)
        nan_positions = row.isnull()
        row[nan_positions] = value_without_nan
    return df

#for index, row in df_p_attribute_R.iterrows():
#        value_without_nan = np.nanmean(row)
#        nan_positions = row.isnull()
#        row[nan_positions] = value_without_nan

#row spectral mean

#df_p_attribute_R = fillMissingValues(df_p_attribute_R, np.nanmean)

#row spectral median

#df_p_attribute_R = fillMissingValues(df_p_attribute_R, np.nanmedian)
#row spectral minimum

#df_p_attribute_R = fillMissingValues(df_p_attribute_R, np.nanmin)
#row spectral maximum

#df_p_attribute_R = fillMissingValues(df_p_attribute_R, np.nanmax)

# distribution of the spectral values. Get random value from this sample

    
def getRandomNumberFromDataframe(df):
    import random

    while True:
        row = df_p_attribute_R.sample(1)
        values = row.values[0]
        value = random.choice(values)
        if not math.isnan(value):
            break
    return value
        
def fillMissingValuesWithDistribution(df):
    
    return df


print(df_p_attribute_R.head())

for index, row in df_p_attribute_R.iterrows():
    nan_positions = row.isnull()
    for i in range(len(nan_positions)): 
        if nan_positions.iloc[i] == True:
            value = getRandomNumberFromDataframe(df_p_attribute_R)
            row.iloc[i] = value
            

print(df_p_attribute_R.head())


     0     4     8     12    16    20    24    28    32
0  80.0  76.0  76.0  76.0  76.0   NaN  79.0  79.0   NaN
1  76.0  76.0  76.0   NaN  80.0  80.0  79.0   NaN  79.0
2  80.0  76.0   NaN  80.0  80.0  80.0  79.0  79.0   NaN
3  76.0  76.0  76.0  80.0  80.0  80.0  79.0  79.0  79.0
4  76.0  76.0  76.0  80.0  80.0  80.0  79.0  79.0  75.0
     0     4     8     12    16    20    24    28    32
0  80.0  76.0  76.0  76.0  76.0  78.0  79.0  79.0  63.0
1  76.0  76.0  76.0  70.0  80.0  80.0  79.0  56.0  79.0
2  80.0  76.0  52.0  80.0  80.0  80.0  79.0  79.0  74.0
3  76.0  76.0  76.0  80.0  80.0  80.0  79.0  79.0  79.0
4  76.0  76.0  76.0  80.0  80.0  80.0  79.0  79.0  75.0


## 2.2 Normalization

There is no need of normalization

# 3 - Choose Model

Nota: adicionar neural networks - new in v 0.18

First goal: discover which type of analyses works better
Second Goal: tune the model

In [35]:
#inspired in http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

import pandas
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# prepare data

Y_train = train_df[:,-1]
X_train = train_df[:,:-1]

#Y_train = train_df[len(train_df.columns)-1]
#X_train = train_df.drop(len(train_df.columns)-1,axis=1)

# prepare configuration for cross validation test harness
num_folds = 10
num_instances = len(X_train)
seed = 7

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('NN', MLPClassifier(alpha=1))) 
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)))
models.append(('AB', AdaBoostClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy' # try with 'roc_auc', f1'

for name, model in models:
    kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

TypeError: unhashable type: 'slice'

# Submission in Kaggle

In [7]:
# Logistic Regression

adaboost = AdaBoostClassifier()

adaboost.fit(X_train, Y_train)

Y_pred = adaboost.predict(test_df)

adaboost.score(X_train, Y_train)

print(np.arange(Y_pred.size))

Y_pred = Y_pred.astype(int)
#np.hstack((Y_pred, np.vstack(np.arange(Y_pred.size))))

print(Y_pred)
np.savetxt("Pauliguel.csv", Y_pred, delimiter=",", fmt='%u')

[   0    1    2 ..., 1997 1998 1999]
[0 0 0 ..., 0 0 0]


# Pipeline

- Poderíamos tentar fazer alguma coisa a ver com anomaly, porque até agora é uma análise muito geral:
-- Pedir frequências das duas classes.
-- Pedir um gráfico com as distribuições dos valores para cada espectro.
-- Ver resultado quando se lida com os Missing values pela estratégia da distribuição
-- ter o cuidado de ver se os resultados não estão a ser influenciados porque o nosso algoritmo diz sempre a mesma classe: ver precision e recall

In [34]:
#ReadData
#XTrain, YTrain, rawTestData = importDataFromFiles()

#handle missing values

#for each different strategy for the missing values check which is the best model in the, get values for accuracy, auc, F1
#check both on the K fold, and on the test dataset

#pick the best strategy for missing values and  the best model and submit to kaggle 
