# Escalas PCA

## Módulos

In [1]:
import sys

In [2]:
# Expose the packages installed under the project's local .env directory to this kernel.
venv_site_packages = '../.env/lib/python3.6/site-packages/'
sys.path.append(venv_site_packages)

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.preprocessing import scale
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis

## Dados

In [5]:
# Load the input table for the analysis.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by this project.
dados_pca = pd.read_pickle('../data/dados_pca.pkl')

In [6]:
# Preview the first rows of the loaded table.
dados_pca.head()

Unnamed: 0,idSpss,Livro_1,Livro_2,Livro_3,Livro_4,Livro_5,Livro_6,Livro_7,Livro_8,Livro_9,...,LAT7,LAT8,LEC1,LEC2,LEC3,LEC4,LEC5,LEC6,LEC7,LEC8
0,1.0,3.0,5.0,2.0,2.0,4.0,5.0,4.0,3.0,4.0,...,2.0,4.0,2.0,3.0,4.0,4.0,4.0,4.0,1.0,1.0
1,2.0,6.0,6.0,7.0,7.0,5.0,6.0,6.0,7.0,6.0,...,,,,,,,,,,
2,3.0,7.0,3.0,1.0,1.0,7.0,1.0,5.0,5.0,1.0,...,7.0,7.0,5.0,5.0,7.0,6.0,5.0,7.0,7.0,7.0
3,4.0,7.0,7.0,7.0,7.0,,3.0,4.0,5.0,5.0,...,7.0,7.0,7.0,7.0,7.0,1.0,1.0,7.0,7.0,7.0
4,5.0,2.0,2.0,5.0,5.0,5.0,7.0,6.0,7.0,1.0,...,7.0,7.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,7.0


## Aplicar análise fatorial (um escore por dimensão)

In [33]:
# Column-name prefix of each dimension; the scoring loop computes one score per prefix.
dimensoes = [
    'Livro', 'IDF', 'TRP', 'RCP', 'SST', 'PER',
    'AVP', 'SAT', 'PAO', 'FAT', 'LAT', 'LEC',
]

In [34]:
# Compute one factor score per dimension and attach it to dados_pca as "<dim>_score".
for dim in dimensoes:
    score_name = dim + '_score'

    # Make the cell safe to re-run: a score column left over from a previous
    # execution would otherwise be selected as an input item and would make
    # the merge below emit suffixed duplicates (<score>_x / <score>_y).
    dados_pca = dados_pca.drop(columns=score_name, errors='ignore')

    # Select the items of this dimension by name prefix. The previous
    # unanchored regex f"idSpss|{dim}" matched the prefix anywhere in a
    # column name, which includes already computed "*_score" columns.
    cols_pca = [
        col for col in dados_pca.columns
        if str(col).startswith(dim) and not str(col).endswith('_score')
    ]
    if not cols_pca:
        raise ValueError(f"no item columns found for dimension {dim!r}")
    df_pca = dados_pca[['idSpss'] + cols_pca]

    # Drop rows with no value at all in this dimension; they stay without a
    # score (NaN) after the left merge.
    df_pca = df_pca.dropna(subset=cols_pca, how='all')

    # Fill the remaining gaps by interpolation, in both directions.
    # NOTE(review): with pandas defaults this presumably interpolates linearly
    # down each column, i.e. between neighbouring rows, and also touches
    # idSpss — confirm this is the intended imputation.
    df_pca = df_pca.interpolate(limit_direction='both')

    # Standardise the items and extract a single factor as the score.
    scaler = StandardScaler()
    fa = FactorAnalysis(n_components=1, rotation='varimax', svd_method='lapack')
    pca_pipeline = Pipeline(steps=[("scaler", scaler), ("fa", fa)])
    df_pca[score_name] = pca_pipeline.fit_transform(df_pca[cols_pca])

    # Attach the score to the full table, matching rows on idSpss.
    dados_pca = dados_pca.merge(df_pca[['idSpss', score_name]], on='idSpss', how='left')

In [35]:
# Keep only the idSpss key and the columns whose name contains "_score".
dados_pca_score = dados_pca.filter(regex="idSpss|_score")

In [36]:
# Preview the first rows of the id + score table.
dados_pca_score.head()

Unnamed: 0,idSpss,Livro_score,IDF_score,TRP_score,RCP_score,SST_score,PER_score,AVP_score,SAT_score,PAO_score,FAT_score,LAT_score,LEC_score
0,1.0,-1.179887,-2.276694,-1.403417,-1.299009,-0.312851,-0.965156,-0.688917,-0.333053,-1.957198,-0.941725,-1.980388,-0.349405
1,2.0,0.896125,0.411117,,,,,,,,,,
2,3.0,-1.217304,0.918679,0.373639,0.054938,0.448074,0.529748,0.917137,0.827672,1.471288,1.020613,1.004458,1.034081
3,4.0,0.620987,0.730947,1.177616,0.879094,0.817992,0.828329,1.057682,1.554314,0.65764,0.72316,0.936833,0.333726
4,5.0,0.075845,0.918679,0.373639,-0.199645,0.102032,-1.08543,0.516827,-1.12282,-0.336414,1.33312,0.85923,1.136139


In [18]:
# Persist the id + score table to disk.
dados_pca_score.to_pickle('../data/dados_pca_score.pkl')

## Testes

- Uso do PCA

In [9]:
# Scratch copy of the dimension prefixes; the trailing expression displays it.
cols_pca = [
    'Livro', 'IDF', 'TRP', 'RCP', 'SST', 'PER',
    'AVP', 'SAT', 'PAO', 'FAT', 'LAT', 'LEC',
]
cols_pca

['Livro',
 'IDF',
 'TRP',
 'RCP',
 'SST',
 'PER',
 'AVP',
 'SAT',
 'PAO',
 'FAT',
 'LAT',
 'LEC']

In [11]:
# Scratch check: the dimension prefixes joined into a single regex alternation.
'|'.join(dimensoes)

'Livro|IDF|TRP|RCP|SST|PER|AVP|SAT|PAO|FAT|LAT|LEC'

In [12]:
# Scratch check: the column pattern built for a single dimension.
f"idSpss|{dimensoes[1]}"

'idSpss|IDF'

In [13]:
# Select idSpss plus the raw items of one dimension (dimensoes[1], i.e. IDF).
# The pattern is anchored and skips "<dim>_score", so this cell yields the same
# columns whether or not *_score columns have already been added to dados_pca
# by the scoring loop earlier in the notebook.
teste = dados_pca.filter(regex=f"^idSpss$|^{dimensoes[1]}(?!_score)")

In [14]:
# Index labels of the rows where every column other than idSpss is missing.
itens_todos_nulos = teste.drop(columns='idSpss').isnull().all(axis=1)
idx = teste.index[itens_todos_nulos]

In [15]:
# Keep only the rows whose index label is not listed in idx.
manter_linha = ~teste.index.isin(idx)
teste2 = teste.loc[manter_linha]

In [16]:
# Fill the remaining missing values by interpolation, in both directions.
# NOTE(review): with pandas defaults this presumably interpolates linearly down
# each column (between neighbouring rows), idSpss included — confirm this is
# the intended imputation.
teste2 = teste2.interpolate(limit_direction = 'both')

In [17]:
# Item columns: every column of teste2 except the idSpss key.
cols_pca = teste2.columns.drop('idSpss')
cols_pca

Index(['IDF1', 'IDF2', 'IDF3', 'IDF4', 'IDF5'], dtype='object')

In [18]:
# Standardise the item columns with sklearn's scale() before the decomposition.
itens = teste2.loc[:, cols_pca]
dados_pca_normalizados = scale(itens)

In [19]:
# Instantiate a PCA with one component per item column
# (pass n_components=1 instead to keep only the first component).
pca = PCA(n_components=len(cols_pca))
numero_componentes = pca.n_components

In [20]:
# Fit the PCA on the standardised data and project the data onto the components.
transformacao_pca = pca.fit_transform(dados_pca_normalizados)

In [21]:
# Display the projected data (one column per component).
transformacao_pca

array([[ 4.94216549, -1.51501738,  0.34988665, -0.38214915, -0.2068512 ],
       [-0.70452163,  0.7490276 , -0.19853532,  0.35010311, -0.64544009],
       [-1.84929231, -0.32521683,  0.01220032, -0.25651482,  0.04491354],
       ...,
       [-1.84929231, -0.32521683,  0.01220032, -0.25651482,  0.04491354],
       [-1.84929231, -0.32521683,  0.01220032, -0.25651482,  0.04491354],
       [ 4.89829096,  0.03350355,  0.54766201, -0.37939272,  0.18232041]])

In [22]:
#transformacao_pca.transform(dados_pca_normalizados)

In [23]:
# Loadings: component weights scaled by the square root of each component's explained variance.
raiz_variancia = np.sqrt(pca.explained_variance_)
cargas = pca.components_.T * raiz_variancia

In [24]:
# Column labels PC1..PCn for the loadings table.
componentes_cols = [f'PC{i}' for i in range(1, numero_componentes + 1)]

In [25]:
# Loadings as a table: one row per item, one column per component.
df_cargas = pd.DataFrame(data=cargas, index=cols_pca, columns=componentes_cols)

In [26]:
# Loadings table with a diverging colour gradient.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string keeps the 2-significant-digit rendering and works on
# both old and new pandas versions.
df_cargas.style.background_gradient(cmap='coolwarm').format('{:.2g}')

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
IDF1,-0.92,-0.11,-0.017,0.12,-0.36
IDF2,-0.87,-0.25,0.056,-0.42,0.046
IDF3,-0.85,0.3,-0.43,-0.018,0.074
IDF4,-0.84,0.41,0.36,0.022,0.045
IDF5,-0.88,-0.32,0.033,0.29,0.21


In [100]:
#teste2['livro'] = transformacao_pca

In [101]:
#teste2

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [103]:
# Fresh scaler and a single-component PCA for the pipeline variant.
# Note: this rebinds `pca`, replacing the model with len(cols_pca) components created earlier.
scaler = StandardScaler()
pca = PCA(n_components=1)

In [104]:
# Chain standardisation and PCA into a single estimator.
pipe = Pipeline([("scaler", scaler), ("pca", pca)])

In [105]:
# Fit both pipeline steps on the item columns.
pipe.fit(teste2[cols_pca])

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=1))])

In [106]:
# Inspect the output of the fitted scaler step on its own.
pipe.named_steps['scaler'].transform(teste2[cols_pca])

array([[-2.00497633, -1.33005024, -3.12806375, -2.88166359, -1.77589184],
       [ 0.83183411, -0.33385679,  0.6695895 ,  0.59064449, -0.1880295 ],
       [ 0.83183411,  1.1604334 ,  0.6695895 ,  0.59064449,  0.8705454 ],
       ...,
       [ 0.83183411,  1.1604334 ,  0.6695895 ,  0.59064449,  0.8705454 ],
       [ 0.83183411,  1.1604334 ,  0.6695895 ,  0.59064449,  0.8705454 ],
       [-2.57233842, -1.82814697, -2.49512154, -1.72422757, -2.30517929]])

In [107]:
# Refit the pipeline and return the single-component scores in one call.
pipe.fit_transform(teste2[cols_pca])

array([[ 4.94216549],
       [-0.70452163],
       [-1.84929231],
       ...,
       [-1.84929231],
       [-1.84929231],
       [ 4.89829096]])

- Uso do Factor Analysis

In [27]:
# Instantiate a factor analysis with one factor per item column
# (pass n_components=1 instead to keep a single factor).
fa = FactorAnalysis(
    n_components=len(cols_pca),
    rotation='varimax',
    svd_method='lapack',
)
numero_componentes = fa.n_components

In [28]:
# Fit the factor model on the standardised items and get the factor scores.
transformacao_fa = fa.fit_transform(dados_pca_normalizados)

In [29]:
# Loadings: transpose so that rows are items and columns are factors.
cargas = np.transpose(fa.components_)

In [30]:
# Column labels PC1..PCn for the loadings table.
componentes_cols = [f'PC{i}' for i in range(1, numero_componentes + 1)]

In [31]:
# Loadings as a table: one row per item, one column per factor.
df_cargas = pd.DataFrame(data=cargas, index=cols_pca, columns=componentes_cols)

In [32]:
# Loadings table with a diverging colour gradient.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string keeps the 2-significant-digit rendering and works on
# both old and new pandas versions.
df_cargas.style.background_gradient(cmap='coolwarm').format('{:.2g}')

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
IDF1,0.87,0,0,0,0
IDF2,0.82,0,0,0,0
IDF3,0.8,0,0,0,0
IDF4,0.78,0,0,0,0
IDF5,0.83,0,0,0,0


In [114]:
# Fresh scaler and a single-factor model for the pipeline variant.
# Note: this rebinds `fa`, replacing the model with len(cols_pca) factors created earlier.
scaler = StandardScaler()
fa = FactorAnalysis(n_components=1, 
                    rotation = 'varimax', 
                    svd_method = 'lapack')

In [115]:
# Chain standardisation and factor analysis into a single estimator.
pipe = Pipeline([("scaler", scaler), ("fa", fa)])

In [116]:
# Fit the pipeline on the item columns and return the single-factor scores.
pipe.fit_transform(teste2[cols_pca])

array([[-2.27669363],
       [ 0.41111651],
       [ 0.91867929],
       ...,
       [ 0.91867929],
       [ 0.91867929],
       [-2.47525114]])

In [117]:
# Store the single-factor score next to the items (fit_transform refits the pipeline).
# cols_pca was captured before this column existed, so the new column is not fed back as an input.
teste2['teste'] = pipe.fit_transform(teste2[cols_pca])

In [118]:
# Inspect items and score side by side.
teste2

Unnamed: 0,idSpss,IDF1,IDF2,IDF3,IDF4,IDF5,teste
0,1.0,2.0,2.0,1.0,1.0,2.0,-2.276694
1,2.0,7.0,4.0,7.0,7.0,5.0,0.411117
2,3.0,7.0,7.0,7.0,7.0,7.0,0.918679
3,4.0,7.0,5.0,7.0,7.0,7.0,0.730947
4,5.0,7.0,7.0,7.0,7.0,7.0,0.918679
5,6.0,7.0,6.0,7.0,7.0,6.0,0.711831
6,7.0,7.0,7.0,7.0,7.0,7.0,0.918679
7,8.0,1.0,1.0,1.0,1.0,1.0,-2.716018
8,9.0,7.0,7.0,7.0,7.0,7.0,0.918679
9,10.0,6.0,5.0,7.0,5.0,6.0,0.236872
