# Preprocess data
Dataset: satimage<br>
By: Sam<br>
Update: 02/06/22

### About Dataset
NUMBER OF ATTRIBUTES: 36 (= 4 spectral bands x 9 pixels in neighbourhood) the pixels read out in sequence left-to-right and top-to-bottom. 

    - A1-A4: 4 top-left
    - A5-A8: 4 top middle
    - A9-A12: 4 top-right
    => central pixel are given by attributes 17,18,19 and 20

NUMBER OF EXAMPLES:

	- training set     4435
	- test set         2000
    
ATTRIBUTES: The attributes are numerical, in the range 0 to 255.
CLASS: 
	There are 6 decision classes: 1,2,3,4,5 and 7.

!!! NB. There are no examples with class 6 in this dataset-they have all been removed because of doubts about the 
	validity of this class.
    
!!! NB. DO NOT USE CROSS-VALIDATION WITH THIS DATASET !!!
- Just train and test only once with the above training and test sets.
- The data is given in random order and certain lines of data have been removed so you cannot reconstruct the original image from this dataset.

In [31]:
# Import library
import pandas as pd
import numpy as np

In [32]:
# Read data
data0_trn = pd.read_csv('sat_trn.csv', header=None, delim_whitespace=True) #training raw data
data0_tst = pd.read_csv('sat_tst.csv', header=None, delim_whitespace=True) #testing raw data

In [33]:
data0_trn.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,3
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,3
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,3
3,80,102,102,79,84,94,102,79,80,94,...,79,84,99,104,79,84,103,104,79,3
4,84,94,102,79,80,94,98,76,80,102,...,79,84,103,104,79,79,107,109,87,3


In [34]:
col0 = pd.read_csv('sat_col.csv', header=None, squeeze=True) #import column name

In [35]:
data0_trn.columns = col0 # change column name
data0_tst.columns = col0

In [36]:
data0_trn['class'] = pd.Categorical(data0_trn['class']) # convert class to categorical data
data0_tst['class'] = pd.Categorical(data0_tst['class'])

In [37]:
data0_trn.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A28,A29,A30,A31,A32,A33,A34,A35,A36,class
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,3
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,3
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,3
3,80,102,102,79,84,94,102,79,80,94,...,79,84,99,104,79,84,103,104,79,3
4,84,94,102,79,80,94,98,76,80,102,...,79,84,103,104,79,79,107,109,87,3


In [38]:
# Check number of unique values
data0_trn.select_dtypes(include=np.number).nunique()


0
A1      50
A2      81
A3      74
A4     101
A5      49
A6      81
A7      74
A8     100
A9      50
A10     80
A11     77
A12    102
A13     49
A14     82
A15     75
A16     98
A17     49
A18     79
A19     72
A20     99
A21     50
A22     79
A23     75
A24    101
A25     50
A26     82
A27     74
A28     97
A29     50
A30     80
A31     76
A32    101
A33     49
A34     79
A35     77
A36    104
dtype: int64

In [39]:
# Check number of unique values
data0_tst.select_dtypes(include=np.number).nunique()

0
A1     48
A2     78
A3     73
A4     96
A5     49
A6     75
A7     73
A8     96
A9     48
A10    73
A11    71
A12    94
A13    51
A14    76
A15    74
A16    96
A17    49
A18    76
A19    75
A20    97
A21    48
A22    73
A23    74
A24    95
A25    49
A26    74
A27    72
A28    96
A29    48
A30    76
A31    73
A32    94
A33    49
A34    75
A35    71
A36    92
dtype: int64

In [47]:
data0 = pd.concat([data0_trn, data0_tst])

In [48]:
data0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6435 entries, 0 to 1999
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

In [49]:
data0_trn.tail()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A28,A29,A30,A31,A32,A33,A34,A35,A36,class
4430,56,64,108,96,64,71,108,96,68,75,...,92,66,83,108,96,66,87,104,89,5
4431,64,71,108,96,68,75,108,96,71,87,...,96,66,87,104,89,63,87,104,89,5
4432,68,75,108,96,71,87,108,88,71,91,...,89,63,87,104,89,70,100,104,85,4
4433,71,87,108,88,71,91,100,81,76,95,...,89,70,100,104,85,70,91,104,85,4
4434,71,91,100,81,76,95,108,88,80,95,...,85,70,91,104,85,63,91,100,81,4


In [51]:
data0.iloc[4435]==data0_tst.iloc[0] # Check: first item of tesing dataset in the merged dataset

0
A1       True
A2       True
A3       True
A4       True
A5       True
A6       True
A7       True
A8       True
A9       True
A10      True
A11      True
A12      True
A13      True
A14      True
A15      True
A16      True
A17      True
A18      True
A19      True
A20      True
A21      True
A22      True
A23      True
A24      True
A25      True
A26      True
A27      True
A28      True
A29      True
A30      True
A31      True
A32      True
A33      True
A34      True
A35      True
A36      True
class    True
Name: 0, dtype: bool

In [52]:
# Export this dataset for discretization
data0.to_csv('clean_satimage.csv',index=False)