# 1 - Data Reading

In [354]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#read from csv
test_df = pd.read_csv("sat-test-data.csv.dat", header=None)
train_df = pd.read_csv("sat-train.csv.dat", header =None)


# 2 - Global Variables

In [355]:
test_df
train_df
Y_train = train_df[len(train_df.columns)-1]
X_train = train_df.drop(len(train_df.columns)-1,axis=1)
num_rows_X_train = X_train[0].count()
num_columns_X_train = len(X_train.columns)

# 2 - Data Pre-Processing

## 2.1 - Missing Values

In [356]:
import  scipy.stats as stats
import numpy.ma as ma
import math

######################################
# Strategies based on constants
######################################

#replace by zero - black colour in RGB
def replaceByZero(df):
    return np.nan_to_num(df)

#replace by 255 - white colour in RGB
def replaceBy255(df):
    return df.fillna(255)

#####################################
# Strategies based on columns values
#####################################

#column minimum
def replaceByColumnMinimum(df):
    return df.fillna(df.min())

#column maximum
def replaceByColumnMaximum(df):
    return df.fillna(df.max())

#column mean
def replaceByColumnMean(df):
    return df.fillna(df.mean())

#column median
def replaceByColumnMedian(df):
    return df.fillna(df.median())

#####################################
# Strategies based on rows values
#####################################

# in the analysis of the rows we have to take into account that each four consecutive rows describe a pixel.
# Each of these four rows stands for: Red, Green, IR, IR. |R|G|IR1|IR2|
# These values refer to different things, and therefore are analysed independently.
# this will allow to consider more the specificities of the problem

# divide the data into four datasets, corresponding to each type of values 

#general function to gather columns for each type |R|G|IR1|IR2| using the mod operator
def separatePixelColumns(position, df):
    indexes = range(0, num_columns_X_train-1)
    indexes = [x for x in indexes if x % 4 == position]
    df_p_attribute = df.iloc[:,indexes]
    return df_p_attribute

#general function to fill missing values based on the rows
#Note that does not make sense to consider the four values |R|G|IR1|IR2|, because they refer to different properties
def fillMissingValuesByRow(df, function):
    for index, row in df.iterrows():
        value_without_nan = function(row)
        nan_positions = row.isnull()
        row[nan_positions] = value_without_nan
    return df

#row spectral mean
def replaceByRowMean(df):
    return fillMissingValues(df_p_attribute_R, np.nanmean)

#row spectral median
def replaceByRowMedian(df):
    return fillMissingValues(df_p_attribute_R, np.nanmedian)

#row spectral minimum
def replaceByRowMinimum(df):
    return fillMissingValues(df_p_attribute_R, np.nanmin)

#row spectral maximum
def replaceByRowMinimum(df):
    return fillMissingValues(df_p_attribute_R, np.nanmax)

########################################
# Strategies based on data distribution
########################################

#consider the distribution of the spectral values of each type. 
#Get random value from the spectral values of same type

    
def getRandomNumberFromDataframe(df):
    import random
    while True:
        row = df_p_attribute_R.sample(1)
        values = row.values[0]
        value = random.choice(values)
        if not math.isnan(value):
            break
    return value
        
def fillMissingValuesWithDistribution(df):
    for index, row in df_p_attribute_R.iterrows():
        nan_positions = row.isnull()
        for i in range(len(nan_positions)): 
            if nan_positions.iloc[i] == True:
                value = getRandomNumberFromDataframe(df)
                row.iloc[i] = value
    return df

test_df = replaceByRowMinimum(test_df)
X_train = replaceByRowMinimum(X_train)


In [337]:
#########################################
# Test fillMissingValuesWithDistribution
#########################################

#generate four new datasets with the columns of each type
df_p_attribute_R_train = separatePixelColumns(0, X_train)
df_p_attribute_G_train = separatePixelColumns(1, X_train)
df_p_attribute_IR1_train = separatePixelColumns(2, X_train)
df_p_attribute_IR2_train = separatePixelColumns(3, X_train)

df_p_attribute_R_test = separatePixelColumns(0, test_df)
df_p_attribute_G_test = separatePixelColumns(1, test_df)
df_p_attribute_IR1_test = separatePixelColumns(2, test_df)
df_p_attribute_IR2_test = separatePixelColumns(3, test_df)

df_p_attribute_R_train = fillMissingValuesWithDistribution(df_p_attribute_R_train)
df_p_attribute_G_train = fillMissingValuesWithDistribution(df_p_attribute_G_train)
df_p_attribute_IR1_train = fillMissingValuesWithDistribution(df_p_attribute_IR1_train)
df_p_attribute_IR2_train = fillMissingValuesWithDistribution(df_p_attribute_IR2_train)

df_p_attribute_R_test = fillMissingValuesWithDistribution(df_p_attribute_R_test)
df_p_attribute_G_test = fillMissingValuesWithDistribution(df_p_attribute_G_test)
df_p_attribute_IR1_test = fillMissingValuesWithDistribution(df_p_attribute_IR1_test)
df_p_attribute_IR2_test = fillMissingValuesWithDistribution(df_p_attribute_IR2_test)

df_p_attribute_R_train

Unnamed: 0,0,4,8,12,16,20,24,28,32
0,92.0,84.0,84.0,101.0,92.0,84.0,102.0,88.0,84.0
1,84.0,84.0,80.0,92.0,84.0,84.0,88.0,84.0,84.0
2,84.0,,84.0,84.0,,,84.0,84.0,84.0
3,80.0,84.0,80.0,84.0,84.0,76.0,84.0,84.0,
4,84.0,80.0,80.0,84.0,76.0,76.0,84.0,84.0,79.0
5,80.0,80.0,76.0,76.0,76.0,76.0,84.0,79.0,79.0
6,76.0,76.0,,,80.0,80.0,79.0,79.0,
7,76.0,,76.0,80.0,,80.0,79.0,,79.0
8,,,76.0,,76.0,76.0,75.0,75.0,
9,,76.0,,76.0,76.0,76.0,,79.0,79.0


## 2.2 Normalization

There is no need of normalization

# 3 - Choose Model

Nota: adicionar neural networks - new in v 0.18

First goal: discover which type of analyses works better
Second Goal: tune the model

In [357]:
#inspired in http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

import pandas
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# prepare data

#Y_train = train_df[:,-1]
#X_train = train_df[:,:-1]

#Y_train = train_df[len(train_df.columns)-1]
#X_train = train_df.drop(len(train_df.columns)-1,axis=1)

# prepare configuration for cross validation test harness
num_folds = 10
num_instances = len(X_train)
seed = 7

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('NN', MLPClassifier(alpha=1))) 
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)))
models.append(('AB', AdaBoostClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy' # try with 'roc_auc', f1'

for name, model in models:
    kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [2000, 4435]

# Submission in Kaggle

In [358]:
# Logistic Regression

adaboost = AdaBoostClassifier()

adaboost.fit(X_train, Y_train)

Y_pred = adaboost.predict(test_df)

adaboost.score(X_train, Y_train)

print(np.arange(Y_pred.size))

Y_pred = Y_pred.astype(int)
#np.hstack((Y_pred, np.vstack(np.arange(Y_pred.size))))

print(Y_pred)
np.savetxt("Pauliguel.csv", Y_pred, delimiter=",", fmt='%u')

ValueError: Found input variables with inconsistent numbers of samples: [2000, 4435]

# Pipeline

- Poderíamos tentar fazer alguma coisa a ver com anomaly, porque até agora é uma análise muito geral:
-- Pedir frequências das duas classes.
-- Pedir um gráfico com as distribuições dos valores para cada espectro.
-- Ver resultado quando se lida com os Missing values pela estratégia da distribuição
-- ter o cuidado de ver se os resultados não estão a ser influenciados porque o nosso algoritmo diz sempre a mesma classe: ver precision e recall

In [34]:
#ReadData
#XTrain, YTrain, rawTestData = importDataFromFiles()

#handle missing values

#for each different strategy for the missing values check which is the best model in the, get values for accuracy, auc, F1
#check both on the K fold, and on the test dataset

#pick the best strategy for missing values and  the best model and submit to kaggle 
