
# Sistemas de Suporte à Decisão

---------------

# TPC 2


Crie um ficheiro em python para trabalhar o dataset 

```python 
    datasets.california_housing
```

Nesse ficheiro, crie um script (função) por alínea que lhe permita gerar novos datasets a partir do dataset principal, onde tenha usado cada um dos seguintes métodos de pre-processamento: 

    1) Aggregation
    2) Sampling
    4) Dimensionality Reduction 
    5) Feature Subset Selection 
    6) Feature Creation 
    7) Discretization and Binarization 
    8) Attribute Transformation


O que é feito em cada caso, é da sua inteira liberdade. 




In [0]:
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn import preprocessing

In [2]:
#import data to variable
data = datasets.california_housing.fetch_california_housing()

#criar uma tabela a partir da informação

dframe = pd.DataFrame(data = data.data, columns=data.feature_names)
dframe.head()


Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
#aggregate
agg = dframe.loc[dframe["AveRooms"]<4,:]

agg.aggregate(["sum", "mean", "max"])


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
sum,6865.3151,89960.0,9914.116674,3003.755845,4550410.0,10104.593373,99867.94,-341424.34
mean,2.393764,31.366806,3.456805,1.047335,1586.614,3.523219,34.821457,-119.046144
max,10.2264,52.0,3.998331,2.111111,11272.0,1243.333333,40.87,-114.65


In [4]:
#sampling
dframe.sample(frac = 0.1)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
17385,3.0723,6.0,6.269859,1.213275,2551.0,2.775843,34.98,-120.47
6599,3.1741,52.0,5.398190,1.031674,703.0,3.180995,34.17,-118.16
720,2.5350,37.0,4.723214,1.122768,1109.0,2.475446,37.68,-122.10
180,2.5238,49.0,4.192024,1.106352,1487.0,2.196455,37.80,-122.24
15582,2.4744,17.0,6.225854,1.405646,1802.0,2.677563,33.33,-116.74
10423,6.4870,4.0,8.587678,1.208531,661.0,3.132701,33.47,-117.62
7007,3.9688,41.0,5.259786,0.971530,916.0,3.259786,33.98,-118.07
6114,3.1597,37.0,4.237647,0.992941,1564.0,3.680000,34.13,-117.90
354,1.8426,40.0,4.926901,1.002924,949.0,2.774854,37.76,-122.17
17023,8.4693,26.0,6.993080,1.022491,1611.0,2.787197,37.52,-122.32


In [5]:
#dimensionality reduction
pca = PCA(n_components=2) # 2 dimensional PCA
pca.fit(dframe)
df_pca = pca.transform(dframe)
print(df_pca)

[[-1103.51264821     8.56663624]
 [  975.54124442    -4.67041774]
 [ -929.54990812    20.03465092]
 ...
 [ -418.43757543   -12.90825582]
 [ -684.43952537   -12.79458051]
 [  -38.43624736   -12.67037806]]


In [6]:
#Feature subset selection

age = dframe.loc[dframe["HouseAge"]<10,:]

age.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
59,2.5625,2.0,2.77193,0.754386,94.0,1.649123,37.82,-122.29
570,7.611,5.0,6.855776,1.061442,7427.0,2.732524,37.72,-122.24
577,7.0568,5.0,7.023438,0.912109,1738.0,3.394531,37.73,-122.06
631,3.663,5.0,5.158537,1.213415,814.0,2.481707,37.72,-122.17
676,5.4858,5.0,5.204489,1.014963,840.0,2.094763,37.68,-122.18


In [7]:
#Feature creation
new_data =np.concatenate((data.data, data.target.reshape(-1,1)*100000), axis=1)
names = data.feature_names + ["Preços"]
dframe = pd.DataFrame(data=new_data, columns=names)
dframe

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Preços
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,452600.0
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,358500.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,352100.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,341300.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,342200.0
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,269700.0
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25,299200.0
7,3.1200,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25,241400.0
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26,226700.0
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25,261100.0


In [8]:
#Discretization Binarization
binarizer = preprocessing.Binarizer().fit(dframe)
print(binarizer)
binarizer.threshold=3.5
dframe
binarizer.transform(dframe)[:10,:]

Binarizer(copy=True, threshold=0.0)


array([[1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [0., 1., 1., 0., 1., 0., 1., 0., 1.],
       [0., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.]])

In [9]:
#Feature Transformation
def my_func(x):
    return [x[0],x.name**2,x.name*x[1]]
  
dframe[:3]

dframe.apply(my_func, axis=1)

0                     [8.3252, 0, 0.0]
1                    [8.3014, 1, 21.0]
2                   [7.2574, 4, 104.0]
3                   [5.6431, 9, 156.0]
4                  [3.8462, 16, 208.0]
5                  [4.0368, 25, 260.0]
6                  [3.6591, 36, 312.0]
7                    [3.12, 49, 364.0]
8                  [2.0804, 64, 336.0]
9                  [3.6912, 81, 468.0]
10                [3.2031, 100, 520.0]
11                [3.2705, 121, 572.0]
12                 [3.075, 144, 624.0]
13                [2.6736, 169, 676.0]
14                [1.9167, 196, 728.0]
15                 [2.125, 225, 750.0]
16                 [2.775, 256, 832.0]
17                [2.1202, 289, 884.0]
18                [1.9911, 324, 900.0]
19                [2.6033, 361, 988.0]
20                [1.3578, 400, 800.0]
21                [1.7135, 441, 882.0]
22                [1.725, 484, 1144.0]
23               [2.1806, 529, 1196.0]
24                  [2.6, 576, 1248.0]
25               [2.4038,

In [10]:
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn import preprocessing

#import data to variable
data = datasets.california_housing.fetch_california_housing()

#criar uma tabela a partir da informação

#dframe = pd.DataFrame(data = data.data, columns=data.feature_names)
#dframe.head()




#aggregate
#agg = dframe.loc[dframe["AveRooms"]<4,:]

#agg.aggregate(["sum", "mean", "max"])

#sampling
#dframe.sample(frac = 0.1)

#dimensionality reduction
#pca = PCA(n_components=2) # 2 dimensional PCA
#pca.fit(dframe)
#df_pca = pca.transform(dframe)
#print(df_pca)

#Feature subset selection

#age = dframe.loc[dframe["HouseAge"]<10,:]

#age.head()

#Feature creation
new_data =np.concatenate((data.data, data.target.reshape(-1,1)*100000), axis=1)
names = data.feature_names + ["Preços"]
dframe = pd.DataFrame(data=new_data, columns=names)
dframe

#Discretization Binarization
binarizer = preprocessing.Binarizer().fit(dframe)
print(binarizer)
binarizer.threshold=3.5
dframe
binarizer.transform(dframe)[:10,:]





#Feature Transformation
#def my_func(x):
 #   return [x[0],x.name**2,x.name*x[1]]
  
#dframe[:3]

#dframe.apply(my_func, axis=1)

Binarizer(copy=True, threshold=0.0)


array([[1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.],
       [0., 1., 1., 0., 1., 0., 1., 0., 1.],
       [0., 1., 1., 0., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 1., 0., 1., 0., 1.]])