
# Sistemas de Suporte à Decisão

---------------

# TPC 3


Crie um ficheiro em Python para trabalhar o dataset:

```python 
    datasets.california_housing
```

Nesse ficheiro, crie um script (função) por alínea que lhe permita gerar novos datasets a partir do dataset principal, onde tenha usado cada um dos seguintes métodos de pré-processamento:

    1) Aggregation
    2) Sampling
    3) Dimensionality Reduction
    4) Feature Subset Selection
    5) Feature Creation
    6) Discretization and Binarization
    7) Attribute Transformation


O que é feito em cada caso é da sua inteira liberdade.




In [0]:
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset and DataFrame
# Load the California housing data through the public loader.  The
# ``sklearn.datasets.california_housing`` submodule was deprecated in
# scikit-learn 0.22 and removed in 0.24, so reaching the loader through it
# fails on current releases; ``datasets.fetch_california_housing`` is
# available in old and new versions alike.
data = datasets.fetch_california_housing()
# One column per feature; the target values (data.target) are not included.
dframe = pd.DataFrame(data=data.data, columns=data.feature_names)

dframe.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [0]:
# Aggregation

# Collapse every column to its mean.  Passing the function names as a list
# keeps the result a DataFrame with one row labelled "mean".
summary_funcs = ["mean"]
agg = dframe.aggregate(summary_funcs)
agg

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704


In [0]:
# Sampling
# Draw three rows at random.  No seed is passed, so every run may return a
# different set of rows.
samp = dframe.sample(n=3)
samp

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
6182,3.2813,28.0,5.144847,1.005571,1999.0,2.784123,34.1,-117.91
17590,4.7287,46.0,5.172881,0.994915,1373.0,2.327119,37.31,-121.91
13406,3.5488,8.0,4.768219,0.974696,3310.0,3.350202,34.09,-117.46


In [0]:
# Dimensionality Reduction

from sklearn.decomposition import PCA

# Project the features onto the first three principal components.
# NOTE(review): the input is not standardized before fitting, so columns
# with large values (e.g. Population) likely dominate the components --
# confirm this is intended.
n_components = 3
pca = PCA(n_components)
pca.fit(dframe)
columns = ['pca_{}'.format(i) for i in range(n_components)]
projected = pca.transform(dframe)
# Keep the original row index so the rows line up with dframe.
df_pca = pd.DataFrame(projected, columns=columns, index=dframe.index)
df_pca.head()

Unnamed: 0,pca_0,pca_1,pca_2
0,-1103.512648,8.566636,-0.774104
1,975.541244,-4.670418,-1.025681
2,-929.549908,20.034651,-1.972737
3,-867.550048,20.331441,-2.314929
4,-860.550411,20.328827,-2.695158


In [0]:
# Feature Subset Selection

# Keep four of the columns and only the first six rows.
selected_columns = ['AveRooms', 'AveBedrms', 'AveOccup', 'HouseAge']
fss = dframe[selected_columns].iloc[:6]
fss

Unnamed: 0,AveRooms,AveBedrms,AveOccup,HouseAge
0,6.984127,1.02381,2.555556,41.0
1,6.238137,0.97188,2.109842,21.0
2,8.288136,1.073446,2.80226,52.0
3,5.817352,1.073059,2.547945,52.0
4,6.281853,1.081081,2.181467,52.0
5,4.761658,1.103627,2.139896,52.0


In [0]:
# Feature Creation

# Append the target, multiplied by 100000, as an extra "Preços" column.
prices = data.target.reshape(-1, 1) * 100000
new_data = np.hstack((data.data, prices))
names = data.feature_names + ["Preços"]
# This rebinds dframe, so everything executed afterwards sees the new column.
dframe = pd.DataFrame(data=new_data, columns=names)
dframe.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Preços
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,452600.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,358500.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,352100.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,341300.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,342200.0


In [0]:
# Discretization and Binarization

# One-hot encode the first ten Population values: get_dummies produces one
# indicator column per distinct value.
# NOTE(review): the name `bin` shadows the Python builtin; it is kept so that
# any later reference to this variable keeps working.
bin = dframe["Population"].iloc[:10]
pd.get_dummies(bin)

Unnamed: 0,322.0,413.0,496.0,558.0,565.0,1094.0,1157.0,1206.0,1551.0,2401.0
0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,1,0


In [0]:
# Attribute Transformation

def _relative_to_mean(column):
    """Return *column* divided by its own mean."""
    return column / column.mean()


# Express every value as a multiple of its column's mean.
dframe.transform(_relative_to_mean)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Preços
0,2.150842,1.431590,1.286448,0.933558,0.225889,0.832251,1.063093,1.022249,2.187997
1,2.144693,0.733253,1.149040,0.886206,1.684349,0.687098,1.062532,1.022165,1.733091
2,1.874972,1.815675,1.526641,0.978819,0.347954,0.912593,1.062252,1.022333,1.702152
3,1.457913,1.815675,1.071533,0.978466,0.391448,0.829772,1.062252,1.022416,1.649942
4,0.993678,1.815675,1.157092,0.985781,0.396359,0.710424,1.062252,1.022416,1.654292
5,1.042920,1.815675,0.877078,1.006339,0.289728,0.696886,1.062252,1.022416,1.303807
6,0.945340,1.815675,0.908437,0.867497,0.767463,0.693144,1.061971,1.022416,1.446418
7,0.806062,1.815675,0.883685,0.968221,0.811658,0.582369,1.061971,1.022416,1.166996
8,0.537478,1.466507,0.790959,1.019123,0.846033,0.660084,1.061971,1.022500,1.095932
9,0.953633,1.815675,0.915562,0.902907,1.088057,0.707428,1.061971,1.022416,1.262232
