
# Sistemas de Suporte à Decisão

---------------

# TPC 3


Crie um ficheiro em python para trabalhar o dataset 

```python 
    datasets.california_housing
```

Nesse ficheiro, crie um script (função) por alínea que lhe permita gerar novos datasets a partir do dataset principal, onde tenha usado cada um dos seguintes métodos de pre-processamento: 

    1) Aggregation
    2) Sampling
    4) Dimensionality Reduction 
    5) Feature Subset Selection 
    6) Feature Creation 
    7) Discretization and Binarization 
    8) Attribute Transformation


O que é feito em cada caso, é da sua inteira liberdade. 




# Setup

### *Com 6) Feature Creation*

In [2]:
#Importing modules
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Set a variable for the data
data = datasets.california_housing.fetch_california_housing()

#Concatenar as features com os targets para termos mais valores
newdata = np.concatenate((data.data, data.target.reshape(-1,1)*100000), axis=1)
names = data.feature_names + ["Preços"]

#Array com nomes das features
featArr = ["MedInc","HouseAge","AveRooms","AveBedrms","Population","Latitude","Longitude","Preços"]

#Criar uma DFrame com os dados
dFrame = pd.DataFrame(data=newdata, columns=names)

"Imports done!"

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


'Imports done!'

## 1) Aggregation

In [3]:
key = 7
avgValue = np.mean(dFrame.iloc[:,[key+1]])
print("Average value of " + featArr[key] + " is " + str(round(avgValue[0])))

Average value of Preços is 206856.0


## 2) Sampling

In [4]:
sample_data = dFrame.sample(3)
print("3 Random entries of our dataset:")
sample_data.head()

3 Random entries of our dataset:


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Preços
13899,1.0568,16.0,7.842767,1.981132,433.0,2.72327,34.52,-116.73,75000.0
12520,1.1458,48.0,3.247024,1.199405,926.0,2.755952,38.55,-121.47,65400.0
18370,5.7415,37.0,6.424581,1.134078,511.0,2.854749,37.13,-121.93,398500.0


## 4) Dimensionality reduction

In [5]:
print("4) Dimensionality Reduction:")
#Import PCA logistics
from sklearn.decomposition import PCA
    
pca = PCA(2)

reducedData = pca.fit_transform(dFrame)

reducedDFrame = pd.DataFrame(reducedData)
reducedDFrame.index = dFrame.index
reducedDFrame.columns = ['PC1','PC2']
reducedDFrame.head()

4) Dimensionality Reduction:


Unnamed: 0,PC1,PC2
0,245744.443022,-1044.050516
1,151643.942588,1012.233962
2,145244.404001,-894.405072
3,134444.389293,-835.018505
4,135344.387555,-827.801144


## 5) Feature Subset Selection

In [17]:
dFrame.iloc[:,3]


0        1.023810
1        0.971880
2        1.073446
3        1.073059
4        1.081081
5        1.103627
6        0.951362
7        1.061824
8        1.117647
9        0.990196
10       1.079602
11       1.024523
12       1.012821
13       1.097701
14       1.009677
15       1.071970
16       1.048338
17       0.966997
18       1.085919
19       1.083636
20       1.108434
21       1.002732
22       1.131799
23       1.036923
24       1.035545
25       1.033613
26       1.020921
27       1.060453
28       1.040169
29       1.032258
           ...   
20610    1.102506
20611    1.078240
20612    1.082397
20613    0.950521
20614    0.970464
20615    1.096447
20616    1.176000
20617    0.956140
20618    1.092920
20619    1.148649
20620    0.854167
20621    1.573248
20622    0.936306
20623    1.074890
20624    1.030928
20625    1.214286
20626    1.076023
20627    1.101695
20628    1.079487
20629    1.020902
20630    1.134831
20631    1.141204
20632    1.080519
20633    1.078534
20634    1

## 7) Discretization and Binarization

In [22]:
#Identificar a ordem de entrada dos valores na nova frame
order = ["MedInc","HouseAge","AveRooms","AveBedrms","Population","Latitude","Longitude","Preços"]

pd.concat([dFrame, pd.get_dummies(dFrame, '', '').astype(bool)], axis=1)[order]

Unnamed: 0,MedInc,MedInc.1,HouseAge,HouseAge.1,AveRooms,AveRooms.1,AveBedrms,AveBedrms.1,Population,Population.1,Latitude,Latitude.1,Longitude,Longitude.1,Preços,Preços.1
0,8.3252,True,41.0,True,6.984127,True,1.023810,True,322.0,True,37.88,True,-122.23,True,452600.0,True
1,8.3014,True,21.0,True,6.238137,True,0.971880,True,2401.0,True,37.86,True,-122.22,True,358500.0,True
2,7.2574,True,52.0,True,8.288136,True,1.073446,True,496.0,True,37.85,True,-122.24,True,352100.0,True
3,5.6431,True,52.0,True,5.817352,True,1.073059,True,558.0,True,37.85,True,-122.25,True,341300.0,True
4,3.8462,True,52.0,True,6.281853,True,1.081081,True,565.0,True,37.85,True,-122.25,True,342200.0,True
5,4.0368,True,52.0,True,4.761658,True,1.103627,True,413.0,True,37.85,True,-122.25,True,269700.0,True
6,3.6591,True,52.0,True,4.931907,True,0.951362,True,1094.0,True,37.84,True,-122.25,True,299200.0,True
7,3.1200,True,52.0,True,4.797527,True,1.061824,True,1157.0,True,37.84,True,-122.25,True,241400.0,True
8,2.0804,True,42.0,True,4.294118,True,1.117647,True,1206.0,True,37.84,True,-122.26,True,226700.0,True
9,3.6912,True,52.0,True,4.970588,True,0.990196,True,1551.0,True,37.84,True,-122.25,True,261100.0,True


## 8) Attribute Transformation

In [35]:
dFrame.transform(lambda x: x/x.mean())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Preços
0,2.150842,1.431590,1.286448,0.933558,0.225889,0.832251,1.063093,1.022249,2.187997
1,2.144693,0.733253,1.149040,0.886206,1.684349,0.687098,1.062532,1.022165,1.733091
2,1.874972,1.815675,1.526641,0.978819,0.347954,0.912593,1.062252,1.022333,1.702152
3,1.457913,1.815675,1.071533,0.978466,0.391448,0.829772,1.062252,1.022416,1.649942
4,0.993678,1.815675,1.157092,0.985781,0.396359,0.710424,1.062252,1.022416,1.654292
5,1.042920,1.815675,0.877078,1.006339,0.289728,0.696886,1.062252,1.022416,1.303807
6,0.945340,1.815675,0.908437,0.867497,0.767463,0.693144,1.061971,1.022416,1.446418
7,0.806062,1.815675,0.883685,0.968221,0.811658,0.582369,1.061971,1.022416,1.166996
8,0.537478,1.466507,0.790959,1.019123,0.846033,0.660084,1.061971,1.022500,1.095932
9,0.953633,1.815675,0.915562,0.902907,1.088057,0.707428,1.061971,1.022416,1.262232
