# Principal Component Analysis (PCA) on EXPLORATION_NOTIF_CONSEIL

## Setup and dataset loading <a id="setup" /> 



In [287]:
# %pylab populates the interactive namespace from numpy and matplotlib
# (this also binds the bare names `numpy` and `plt`) and renders figures inline.
%pylab inline
import dataiku                               # Access to Dataiku datasets
import pandas as pd, numpy as np             # Data manipulation 
from sklearn.decomposition import PCA        # The main algorithm
from matplotlib import pyplot as plt         # Graphing
import seaborn as sns                        # Graphing
from collections import defaultdict, Counter # Utils
sns.set(style="white")                       # Tuning the style of charts
import warnings                              # Disable some warnings
# Only DeprecationWarning is silenced; other warning categories are still shown
warnings.filterwarnings("ignore",category=DeprecationWarning)

Populating the interactive namespace from numpy and matplotlib


In [288]:
# Maximum number of rows requested when the dataset is loaded into a dataframe
dataset_limit = 10000
# When False, the date columns are added to the list of columns to drop;
# when True, they are converted to numbers and kept as features
keep_dates = False

Load a DSS dataset as a Pandas dataframe

In [289]:
# Handle on the DSS dataset
mydataset = dataiku.Dataset("EXPLORATION_NOTIF_CONSEIL")

# Load at most `dataset_limit` rows.
# Random samples, column subsets and filters are also possible: refer to the
# Dataiku Python API documentation for more information.
df = mydataset.get_dataframe(limit=dataset_limit)

# Independent copy of the frame as loaded, taken before `df` is transformed
df_orig = df.copy()

# Column names grouped by dtype family
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(include=[object]).columns.tolist()
date_columns = df.select_dtypes(include=['<M8[ns]']).columns.tolist()

# Quick summary of what was just loaded
column_counts = (
    df.shape[1],
    len(numerical_columns),
    len(categorical_columns),
    len(date_columns),
)
print("Loaded dataset")
print("   Rows: %s" % df.shape[0])
print("   Columns: %s (%s num, %s cat, %s date)" % column_counts)

Loaded dataset
   Rows: 176
   Columns: 21 (19 num, 2 cat, 0 date)


## Preprocessing of the data <a id="preprocessing" />

Keep the dates as features if requested by the user

In [290]:
# Columns excluded from the analysis
columns_to_drop = [
    'marg428', 'marg572', 'marg524', 'marg381', 'marg621',
    'cible_marg381', 'groupe', 'traj_hors_flux_texte', 'Annee', 'cible_marg380',
]

if not keep_dates:
    # Dates are not used as features: schedule them for removal
    columns_to_drop += date_columns
else:
    # Dates kept as features: integer nanosecond timestamps scaled by 1e-9
    df[date_columns] = df[date_columns].astype(int) * 1e-9

In [291]:
# (rows, columns) of the frame at this point, before any column is dropped
df.shape

(176, 21)

Get rid of the columns that contain too many unique values

In [292]:
# A categorical feature is dropped when its number of distinct values exceeds
# either an absolute cap or a fraction of the number of rows.
DROP_LIMIT_ABS = 200
CAT_DROP_LIMIT_RATIO = 0.5

# Exceeding either limit is the same as exceeding the smaller of the two
max_distinct = min(DROP_LIMIT_ABS, CAT_DROP_LIMIT_RATIO * df.shape[0])
for feature in categorical_columns:
    n_distinct = df[feature].nunique()
    if n_distinct > max_distinct:
        print("Dropping feature %s with %s values" % (feature, n_distinct))
        columns_to_drop.append(feature)

In [None]:
## Dropping the columns
# Every name collected in `columns_to_drop` is removed from the frame
print("Dropping the following columns: %s" % columns_to_drop)
df = df.drop(labels=columns_to_drop, axis=1)

In [None]:
## Filling missing values
# The column lists are recomputed from the current frame because columns have
# been removed since they were first built.
numerical_columns = list(df.select_dtypes(include=[np.number]).columns)
categorical_columns = list(df.select_dtypes(include=[object]).columns)

# Placeholder used for a categorical column that has no observed value at all
MISSING_CATEGORY = "missing"

# Use mean for numerical features (0 when the mean itself is NaN, i.e. the
# column has no observed value)
for feature in numerical_columns:
    v = df[feature].mean()
    if np.isnan(v):
        v = 0
    print("Filling %s with %s" % (feature, v))
    df[feature] = df[feature].fillna(v)

# Use mode for categorical features
for feature in categorical_columns:
    counts = df[feature].value_counts()
    # value_counts() leaves missing values out, so an all-missing column gives
    # an empty result; fall back to a constant, as the numerical branch does.
    v = counts.index[0] if len(counts) else MISSING_CATEGORY
    df[feature] = df[feature].fillna(v)

In [None]:
# Cap on the number of dummy values per categorical variable: beyond it, only
# the most frequent values are kept.
LIMIT_DUMMIES = 100


def select_dummy_values(train, features):
    """Return, for each feature, its most frequent values.

    train    -- mapping from feature name to an iterable of values
                (indexed as train[feature])
    features -- names of the features to process

    The result maps each feature name to a list of at most LIMIT_DUMMIES
    values, ordered from most to least frequent.
    """
    return {
        feature: [
            value
            for value, _count in Counter(train[feature]).most_common(LIMIT_DUMMIES)
        ]
        for feature in features
    }

# Values to dummy-encode for each categorical column that is not flagged for removal
DUMMY_VALUES = select_dummy_values(
    df,
    [column for column in categorical_columns if column not in columns_to_drop],
)


def dummy_encode_dataframe(df, dummy_values_by_feature=None):
    """Replace categorical columns of `df` with 0/1 indicator columns, in place.

    df                      -- dataframe to modify
    dummy_values_by_feature -- mapping from feature name to the values to
                               encode; defaults to the module-level DUMMY_VALUES

    For each value a float column named '<feature>_value_<value>' is added,
    then the source column is deleted. Returns None.
    """
    if dummy_values_by_feature is None:
        dummy_values_by_feature = DUMMY_VALUES
    for (feature, dummy_values) in dummy_values_by_feature.items():
        for dummy_value in dummy_values:
            # Only byte strings need decoding; text and non-string values are
            # formatted as they are instead of failing on .decode().
            if isinstance(dummy_value, bytes):
                label = dummy_value.decode('utf-8')
            else:
                label = dummy_value
            dummy_name = u'%s_value_%s' % (feature, label)
            df[dummy_name] = (df[feature] == dummy_value).astype(float)
        del df[feature]
        print('Dummy-encoded feature %s' % feature)

# Encode the categorical columns of df in place (the function modifies df directly)
dummy_encode_dataframe(df)

Finally, we index the rows by CERFRANCE and standardize all the features

In [None]:
# Use the CERFRANCE column as row labels, then preview the first rows
df = df.set_index("CERFRANCE")
df.head()

In [None]:
# X is another name for the same frame as df (no copy is made here)
X = df
# Text dump of the feature matrix
print(X)

In [297]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and standardize X in a single call
ss = StandardScaler()
X_std = ss.fit_transform(X)

## Computation of the PCA <a id="pca" />

Let's "fit" the PCA algorithm (in other words, let's compute the singular value decomposition)

In [298]:
from sklearn.decomposition import PCA
# PCA using the 'full' SVD solver; n_components is left at its default
acp = PCA(svd_solver='full')
# Show the estimator and its parameters
print(acp)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)


In [299]:
# Fit the PCA on the standardized data and keep the transformed rows
# (coordinates of each observation on the principal axes)
Z=X_std
coord = acp.fit_transform(Z)

In [None]:
# Number of components of the fitted PCA
print(acp.n_components_)

In [None]:
### Eigenvalues and scree plot
# Explained variance of each component, as reported by scikit-learn
print(acp.explained_variance_)

In [302]:
# Number of observations (n) and number of variables (p) of the feature matrix;
# n is the divisor of the corrected eigenvalues
n, p = X.shape

In [None]:
## Eigenvalues of the components
# Squared singular values divided by the number of observations n
eigval = acp.singular_values_ ** 2 / n
print(eigval)

In [None]:
# Proportion of variance explained by each component
print(acp.explained_variance_ratio_)

In [None]:
# Scree plot: eigenvalue of each factor against its rank
factor_numbers = np.arange(1, p + 1)
plt.plot(factor_numbers, eigval)
plt.title("Scree plot")
plt.ylabel("Eigen values")
plt.xlabel("Factor number")
plt.show()

In [None]:
# Cumulative share of explained variance against the number of factors
factor_numbers = np.arange(1, p + 1)
cumulative_ratio = np.cumsum(acp.explained_variance_ratio_)
plt.plot(factor_numbers, cumulative_ratio)
plt.title("Explained variance vs. # of factors")
plt.ylabel("Cumsum explained variance ratio")
plt.xlabel("Factor number")
plt.show()

In [307]:
## Choosing the number of components
# We focus on the 5 components that explain 90 percent of the variability.

In [None]:
### Plot of the Cerfrance observations, using their factorial coordinates
# Observations positioned in the first factorial plane (axes 1 and 2)
fig, axes = plt.subplots(figsize=(12, 12))
# Same limits on the horizontal and vertical axes
axes.set_xlim(-6, 6)
axes.set_ylim(-6, 6)
# Label every observation at its coordinates on the first two axes
for label, point in zip(X.index, coord[:, :2]):
    axes.annotate(label, (point[0], point[1]))
# Horizontal and vertical reference lines through the origin
for xs, ys in (([-6, 6], [0, 0]), ([0, 0], [-6, 6])):
    axes.plot(xs, ys, color='silver', linestyle='-', linewidth=1)
# Display
plt.show()

In [None]:
# Eigenvalues: squared singular values divided by the number of observations n
eigval = acp.singular_values_ ** 2 / n
print(eigval)

In [None]:
# Square root of the eigenvalues
sqrt_eigval = np.sqrt(eigval)
print(sqrt_eigval)

In [312]:
# Correlation of the variables (rows) with the axes (columns): each axis'
# component vector is scaled by the square root of that axis' eigenvalue,
# then the result is transposed so that column k holds axis k.
corvar = np.transpose(acp.components_ * sqrt_eigval[:, np.newaxis])

In [None]:
# Table of the correlations between the variables and the axes,
# shown for the first seven axes
correlation_columns = {'id': X.columns}
for axis in range(7):
    correlation_columns['COR_%d' % (axis + 1)] = corvar[:, axis]
print(pd.DataFrame(correlation_columns))

In [None]:
# Correlation circle on the first two axes
fig, axes = plt.subplots(figsize=(8, 8))
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
# Label each variable at its correlations with axes 1 and 2
for name, correlations in zip(X.columns, corvar[:, :2]):
    axes.annotate(name, (correlations[0], correlations[1]))

# Horizontal and vertical reference lines through the origin
for xs, ys in (([-1, 1], [0, 0]), ([0, 0], [-1, 1])):
    axes.plot(xs, ys, color='silver', linestyle='-', linewidth=1)
# Unit circle
cercle = plt.Circle((0, 0), 1, color='blue', fill=False)
axes.add_artist(cercle)
# Display
plt.show()

In [None]:
## Contribution of the variables to the axes (first five axes shown)
# `ctrvar` was never computed before being displayed. The contribution of
# variable j to axis k is its squared correlation with the axis divided by the
# axis eigenvalue, which is the squared coefficient of the variable in the
# axis' component vector. Rows are variables, columns are axes; the PCA
# component vectors have unit norm, so each column sums to 1.
ctrvar = np.transpose(acp.components_ ** 2)
print(pd.DataFrame({'id':X.columns,'CTR_1':ctrvar[:,0],'CTR_2':ctrvar[:,1],'CTR_3':ctrvar[:,2],'CTR_4':ctrvar[:,3],'CTR_5':ctrvar[:,4]}))