# Analyse de données - Projet Velib en R <a href="https://cran.r-project.org/"><img src="https://cran.r-project.org/Rlogo.svg" style="max-width: 40px; display: inline" alt="R"/></a>
---

_Etudiantes (<small>INSA Toulouse</small>) :_ Nina Moser, Cassandra Jan, Illana Rabasquinho, Manon Lacave-Pistaa.



# Introduction

In [None]:
install.packages("ggplot2")
install.packages("reshape2")
install.packages("gridExtra")
install.packages("psych")

library(ggplot2)
library(reshape2)
library(gridExtra)
library(psych)

In [None]:
install.packages('FactoMineR')
install.packages('factoextra')
install.packages('mclust')

library(FactoMineR)
library(factoextra)
library(ggpubr)
library(cluster)
library(mclust)

In [None]:
install.packages('cvms')
library(cvms)

In [None]:
library(tidyverse)

Avant de commencer le projet, nous fixons une graine aléatoire afin d'obtenir les mêmes résultats d'une exécution du notebook à l'autre.

In [None]:
# Fix the random number generator seed so that repeated runs are reproducible.
set.seed(1)

In [None]:
# Restore the objects saved in the RData file into the workspace, then print
# a summary of `velib` (presumably provided by that file -- it is the object
# every later cell reads from).
load(file = "velib.RData")
summary(object = velib)

In [None]:
# Data preparation ----
# Loading values as a matrix; rows are named after velib$names and columns
# are numbered by position.
loading <- as.matrix(velib$data)
colnames(loading) <- seq_len(ncol(loading))
rownames(loading) <- velib$names

# Per-row metadata: position of each station plus its `bonus` flag.
# seq_len() is used instead of 1:n so an empty input yields an empty index.
stations <- seq_len(nrow(loading))
coord <- velib$position[stations, ]
coord$bonus <- velib$bonus[stations]

# Keep columns 14 to 181 only, i.e. 168 = 7 * 24 columns (the first 13 are
# dropped), then renumber the remaining columns from 1.
dates <- 14:181
loading <- loading[stations, dates]
colnames(loading) <- seq_along(dates)

head(loading)
head(coord)

In [None]:
# Descriptive statistics for every column of `loading` (psych package).
psych::describe(loading)

# Analyse exploratoire

In [None]:
# Number of missing values in each column, most affected columns first.
loading_missing_value <- sort(colSums(is.na(loading)), decreasing = TRUE)

# Report the overall number of missing values.
cat(
  "Nombre de valeur(s) manquante(s) dans le jeu de données Loading :\n",
  sum(loading_missing_value), " \n\n",
  sep = ""
)

In [None]:
# Count the rows that are exact duplicates of an earlier row, in each table.
n_dup_loading <- sum(duplicated(loading))
n_dup_coord <- sum(duplicated(coord))

cat(
  "Nombre de valeur(s) dupliquée(s) dans Loading :\n",
  n_dup_loading, " \n\n",
  sep = ""
)
cat(
  "Nombre de valeur(s) dupliquée(s) dans Coord :\n",
  n_dup_coord,
  sep = ""
)

In [None]:
# How many rows of `loading` carry each station name.
station_counts <- table(rownames(loading))

# Same counts, most frequent names first (names used more than once come up
# at the top of the printout).
station_names <- station_counts[order(station_counts, decreasing = TRUE)]

print(station_names)

# Evolution de la densité de Vélib' (loading) des stations

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)

# Column index at which each of the 7 days starts (24 columns per day).
time_tick <- 24 * (0:6) + 1

# Row index of the station to display.
i <- 2

# Long-format table of that station's loading; the column index is added as
# the time axis.
df <- melt(loading[i, ])
df$time_range <- 1:ncol(loading)

# Loading curve with a dashed vertical line at the start of every day.
ggplot(df, aes(x = time_range, y = value)) +
  geom_line(col = "darkorchid") +
  geom_vline(xintercept = time_tick, col = "orange", linetype = "dashed") +
  labs(title = velib$names[i], x = "Time", y = "Loading")

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)

# Column index at which each of the 7 days starts (24 columns per day).
timeTick <- 1 + 24 * (0:6)

# Stations to display: the first 16 rows of `loading`.
stations <- seq(1, 16)

# Long format (Var1 = row name, Var2 = column index, value). melt() stacks
# the matrix column by column, so the rows belonging to the i-th selected
# station are found every length(stations) rows, starting at row i.
df <- melt(loading[stations, ])

# One panel per station. Rows are picked by position rather than by comparing
# Var1 with the station name, so two stations sharing the same name can never
# be merged into one curve. The list is preallocated instead of grown.
p <- vector("list", length(stations))
for (i in seq_along(stations)) {
  dfi <- df[seq(i, nrow(df), by = length(stations)), ]
  p[[i]] <- ggplot(dfi, aes(x = Var2, y = value)) +
    geom_line(col = "darkorchid") +
    geom_vline(xintercept = timeTick, col = "orange", linetype = "dashed") +
    labs(title = velib$names[stations[i]]) +
    xlab("Time") +
    ylab("Loading")
}
do.call(grid.arrange, p)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)

# Column index at which each of the 7 days starts (24 columns per day).
time_tick <- 24 * (0:6) + 1

# One box per column of `loading`, with a red line at the start of each day.
boxplot(
  loading,
  main = "Boxplots représentant le chargement des stations en fonction des heures de la semaine",
  xlab = "Time",
  ylab = "Loading",
  col = "skyblue",
  widths = 0.75
)
abline(v = time_tick, col = "red")

In [None]:
# Mean of every column of `loading`, laid out as a 24 x 7 matrix: matrix()
# fills column by column, so each column holds 24 consecutive values.
mean_per_hour_per_day <- matrix(colMeans(loading), nrow = 24, ncol = 7)

# Average of the 7 columns, row by row.
mean_per_hour <- rowMeans(mean_per_hour_per_day)

# Labels used for the 7 columns in the legend.
days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

# One thin line per column, plus the row average as a thick black line.
matplot(mean_per_hour_per_day, type = "l", lty = 1, col = 1:7)
lines(mean_per_hour, lwd = 3, col = "black")

legend(
  "topright",
  legend = c(days, "Moyenne par heure"),
  col = c(1:7, "black"),
  lwd = rep(c(1, 3), times = c(7, 1))
)

title(main = "Moyenne du chargement horaire de toutes les stations")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)

# Mean of each row of `loading`, indexed by row number.
df <- data.frame(stations = 1:nrow(loading), mean = rowMeans(loading))

# Per-row mean as a blue line, overall mean of the matrix as an orange
# horizontal reference.
overall_mean <- mean(loading)
ggplot(df, aes(x = stations, y = mean)) +
  geom_line(color = "cornflowerblue", linewidth = 1) +
  geom_hline(yintercept = overall_mean, color = "darkorange", linewidth = 2) +
  labs(x = "Stations", y = "Average loading")

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# Columns of `loading` to map; the two groups fill the two rows of the grid.
hours <- c(6, 20, 23)
hours2 <- c(10, 14, 18)

# The plots are ggplot objects arranged with grid.arrange(), so no
# par(mfrow = ...) call is made: it does not affect grid graphics and would
# only alter the layout of later base plots.
dfi <- coord
all_hours <- c(hours, hours2)

# One map per selected column: stations placed by longitude/latitude and
# coloured by their loading in that column.
p <- vector("list", length(all_hours))
for (i in seq_along(all_hours)) {
  dfi$loading <- loading[, all_hours[i]]
  p[[i]] <- ggplot(dfi, aes(x = longitude, y = latitude, color = loading)) +
    geom_point() +
    scale_color_gradient(low = "yellow", high = "blue") +
    labs(title = paste("Stations loading - Monday", all_hours[i], "h"))
}

# Kept for compatibility with the original variables: plots of each group.
p1 <- p[seq_along(hours)]
p2 <- p[-seq_along(hours)]

do.call(grid.arrange, c(p, ncol = 3))


In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)

# One column of `loading` per day: 18, then 24 columns later for each day.
hours <- c(18, 42, 66, 90, 114, 138, 162)
jours <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

# The plots are ggplot objects arranged with grid.arrange(), so no
# par(mfrow = ...) call is made: it does not affect grid graphics and would
# only alter the layout of later base plots.
dfi <- coord

# One map per day: stations placed by longitude/latitude and coloured by
# their loading in the selected column. The title shows the column index
# brought back to the 1..24 range of its day.
p <- vector("list", length(hours))
for (i in seq_along(hours)) {
  dfi$loading <- loading[, hours[i]]
  p[[i]] <- ggplot(dfi, aes(x = longitude, y = latitude, color = loading)) +
    geom_point() +
    scale_color_gradient(low = "yellow", high = "blue") +
    labs(title = paste("Stations loading - ", jours[i], " ", hours[i] - 24 * (i - 1), "h"))
}

do.call(grid.arrange, p)


In [None]:
# Mean of each row of `loading`, computed once and reused below.
station_means <- rowMeans(loading)

print("--- Average fill rate ---")
print(mean(loading))

print("")

# Row with the smallest mean.
print("--- Least crowded station, on average ---")
i <- which.min(station_means)
print(station_means[i])

print("")

# Row with the largest mean.
print("--- Fullest station, on average ---")
i <- which.max(station_means)
print(station_means[i])

In [None]:
# Mean of each row of `loading`.
mean_loading <- rowMeans(loading)

# Positions of the rows with the smallest and the largest mean.
station_min <- which.min(mean_loading)
station_max <- which.max(mean_loading)

# Coordinates of every station together with its mean loading.
df_map <- data.frame(
  latitude = coord$latitude,
  longitude = coord$longitude,
  mean_loading = mean_loading
)

# Point shapes used for the three legend entries.
formes <- c(
  "Station la moins remplie en moyenne" = 18,
  "Station la plus remplie en moyenne" = 18,
  "Autres stations" = 16
)

# All stations in pink, with the two extreme stations drawn on top, larger.
ggplot(df_map, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "Autres stations"), color = "pink", size = 3) +
  geom_point(
    data = df_map[station_min, ],
    aes(shape = "Station la moins remplie en moyenne"),
    color = "yellow", size = 5
  ) +
  geom_point(
    data = df_map[station_max, ],
    aes(shape = "Station la plus remplie en moyenne"),
    color = "blue", size = 5
  ) +
  scale_shape_manual(name = "Legend", values = formes) +
  labs(title = "Carte des stations de Vélib à Paris") +
  theme(plot.title = element_text(size = 30, hjust = 0.5))

In [None]:
# Hand-picked (approximate) points along the Seine in Paris, used as the
# upper boundary curve.
seine_paris_dessus <- data.frame(
  latitude = c(
    48.829, 48.841, 48.851, 48.864, 48.872, 48.874, 48.871,
    48.865, 48.859, 48.849, 48.840, 48.834, 48.824
  ),
  longitude = c(
    2.246, 2.258, 2.272, 2.286, 2.299, 2.316, 2.334,
    2.346, 2.359, 2.371, 2.382, 2.389, 2.400
  )
)

# Second set of hand-picked points, used as the lower boundary curve.
seine_paris_dessous <- data.frame(
  latitude = c(
    48.821, 48.833, 48.843, 48.850, 48.856, 48.857, 48.858,
    48.855, 48.852, 48.847, 48.841, 48.835, 48.829, 48.814
  ),
  longitude = c(
    2.261, 2.277, 2.283, 2.295, 2.302, 2.310, 2.319,
    2.326, 2.336, 2.347, 2.355, 2.367, 2.374, 2.391
  )
)

# Degree-2 polynomial fit of latitude against longitude for each boundary
# (raw = TRUE: plain powers of longitude rather than orthogonal polynomials).
modele_parabole <- latitude ~ poly(longitude, 2, raw = TRUE)
fit_dessus <- lm(modele_parabole, data = seine_paris_dessus)
fit_dessous <- lm(modele_parabole, data = seine_paris_dessous)

#' Test whether points lie strictly between two parabolas
#'
#' Both parabolas are evaluated at `x` (`a * x^2 + b * x + c`) and each `y` is
#' compared with the lower and the upper of the two values at the same `x`.
#'
#' @param x Numeric vector of abscissas.
#' @param y Numeric vector of ordinates, same length as `x`.
#' @param a1,b1,c1 Quadratic, linear and constant coefficients of the first
#'   parabola.
#' @param a2,b2,c2 Quadratic, linear and constant coefficients of the second
#'   parabola.
#' @return A logical vector, `TRUE` where `y` is strictly between the two
#'   curves (points lying exactly on a curve give `FALSE`).
point_entre_paraboles <- function(x, y, a1, b1, c1, a2, b2, c2) {
  y1 <- a1 * x^2 + b1 * x + c1
  y2 <- a2 * x^2 + b2 * x + c2
  # pmin()/pmax() compare element by element. min()/max() would collapse a
  # vector input to one global bound and give wrong answers for more than
  # one point. unname() keeps names carried by the coefficients out of the
  # result.
  borne_basse <- unname(pmin(y1, y2))
  borne_haute <- unname(pmax(y1, y2))
  y > borne_basse & y < borne_haute
}

# Coefficients of the two fitted parabolas, extracted once instead of on every
# iteration. For latitude ~ poly(longitude, 2, raw = TRUE) they are ordered
# (intercept, linear, quadratic).
coef_dessus <- unname(coef(fit_dessus))
coef_dessous <- unname(coef(fit_dessous))

# 1 when the station lies between the two fitted curves, 0 otherwise.
# seq_len() handles an empty `coord`, and vapply() builds the whole vector
# at once instead of growing a list element by element.
indicateur_seine <- vapply(
  seq_len(nrow(coord)),
  function(i) {
    entre <- point_entre_paraboles(
      coord[i, "longitude"], coord[i, "latitude"],
      coef_dessus[3], coef_dessus[2], coef_dessus[1],
      coef_dessous[3], coef_dessous[2], coef_dessous[1]
    )
    if (entre) 1 else 0
  },
  numeric(1)
)

# Same flags as a list, kept under the original variable name.
localisation <- as.list(indicateur_seine)

# Add the `localisation` column to coord.
coord$localisation <- indicateur_seine


In [None]:
# Row positions of the stations by `localisation` flag (0 and 1).
indices_autres <- which(coord[["localisation"]] == 0)
indices_seine <- which(coord[["localisation"]] == 1)

# Row positions of the stations by `bonus` flag (0 and 1).
indices_bas <- which(coord[["bonus"]] == 0)
indices_haut <- which(coord[["bonus"]] == 1)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 7)

# Map of the stations coloured by their `localisation` group.
carte_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(color = "Autres"), size = 2) +
  geom_point(data = coord[indices_seine, ], aes(color = "Proche de la Seine"), size = 2) +
  scale_color_manual(
    name = "Legend",
    values = c("Autres" = "yellow", "Proche de la Seine" = "blue")
  ) +
  labs(title = "Stations proches ou non de la Seine") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

# Map of the stations coloured by their `bonus` group.
carte_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(color = "Vallée"), size = 2) +
  geom_point(data = coord[indices_haut, ], aes(color = "Colline"), size = 2) +
  scale_color_manual(
    name = "Legend",
    values = c("Vallée" = "green", "Colline" = "red")
  ) +
  labs(title = "Altitude des stations") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

# Both maps side by side.
grid.arrange(carte_seine, carte_altitude, ncol = 2)

# Analyse en Composantes Principales (ACP)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)

# PCA of `loading` with variables scaled to unit variance; FactoMineR's own
# plots are disabled.
pca <- PCA(loading, scale.unit = TRUE, graph = FALSE)
eig.val <- get_eigenvalue(pca)

# Scree plot of the components.
fviz_eig(
  pca,
  addlabels = TRUE,
  ylim = c(0, 50),
  barfill = "sky blue",
  main = "Pourcentage de variance expliquée par composante principale",
  xlab = "Composantes Principales",
  ylab = "Pourcentage de Variance Expliquée"
)

# Third column of pca$eig drawn as a curve, then redrawn thicker on top.
variance_cumulee <- pca$eig[, 3]
plot(
  variance_cumulee,
  type = "l", col = "sky blue",
  xlab = "Number of components", ylab = "Cumulative explained variance"
)
lines(variance_cumulee, type = "l", col = "lightblue", lwd = 3)

In [None]:
# Distribution of the individuals' coordinates on the first five components,
# with a dashed reference line at zero.
cinq_composantes <- as.data.frame(pca$ind$coord[, 1:5])
boxplot(
  cinq_composantes,
  col = "sky blue",
  main = "Box plots of the first five principal components"
)
abline(h = 0, lty = "dashed", col = "red")

## Corrélations des variables

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)

# Pairs of components to display, in the order of the panels.
paires_axes <- list(c(1, 2), c(1, 3), c(1, 4), c(1, 5), c(2, 3), c(2, 4))

# One variable plot per pair, coloured by cos2.
graphes_variables <- lapply(paires_axes, function(axes_k) {
  fviz_pca_var(
    pca,
    axes = axes_k,
    col.var = "cos2",
    repel = TRUE,
    gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
  )
})

# Panels arranged on two columns.
ggarrange(plotlist = graphes_variables, ncol = 2)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 12)

# Column index at which each of the 7 days starts (24 columns per day).
timeTick <- 1 + 24 * (0:6)

# 3 x 2 layout for the base plots below; the previous graphical parameters
# are restored at the end so later base plots are not affected.
anciens_par <- par(mfrow = c(3, 2))

# Y-axis label of each panel.
etiquettes <- c(
  "Corrélation avec dim 1",
  "Corrélation avec à dim 2",
  "Corrélation avec à dim 3",
  "Corrélation avec à dim 4",
  "Corrélation avec à dim 5"
)

# Offsets (relative to the start of each day) of the dashed pink markers;
# only the first two dimensions have any.
decalages <- list(11, c(8, 19), NULL, NULL, NULL)

# One panel per dimension: variable coordinates across the columns of
# `loading`, with day separators and a zero line. abline() is a base-graphics
# call and is issued as separate statements: chaining the calls with `+`
# only added their NULL return values and printed a stray integer(0).
for (k in seq_along(etiquettes)) {
  plot(
    pca$var$coord[, k],
    type = "l",
    xlab = "Heures de la semaine", ylab = etiquettes[k],
    ylim = c(-1, 1), cex.lab = 1.7
  )
  abline(v = timeTick, col = "#048B9A")
  abline(h = 0, col = "#A10684")
  for (decalage in decalages[[k]]) {
    abline(v = timeTick + decalage, col = "pink", lty = "dashed")
  }
}

par(anciens_par)

In [None]:
# Colouring factors for the individuals plots. Levels and labels are given
# explicitly so that 0 and 1 always map to the same label, even if one of the
# two values happens to be absent (assigning levels() afterwards would fail
# in that case).
coord$couleurs1 <- factor(
  coord$localisation,
  levels = c(0, 1), labels = c("Autres", "Seine")
)

# "Colline" is spelled as on the altitude maps (it was "Coline" here).
coord$couleurs2 <- factor(
  coord$bonus,
  levels = c(0, 1), labels = c("Vallée", "Colline")
)

In [None]:
# Individuals on dimensions 1 and 2: plain, then coloured by `couleurs1`,
# then by `couleurs2`.

options(repr.plot.width = 10, repr.plot.height = 6)
titre_acp_12 <- ggtitle("Graphe des individus - ACP")

fviz_pca_ind(pca, axes = c(1, 2), geom = "point", col.ind = "cornflowerblue") +
  titre_acp_12
fviz_pca_ind(pca, axes = c(1, 2), geom = "point", col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
  titre_acp_12
fviz_pca_ind(pca, axes = c(1, 2), geom = "point", col.ind = coord$couleurs2, palette = c("green", "red")) +
  titre_acp_12

In [None]:
# Individuals on dimensions 1 and 3: plain, then coloured by `couleurs1`,
# then by `couleurs2`.

options(repr.plot.width = 10, repr.plot.height = 6)
titre_acp_13 <- ggtitle("Graphe des individus - ACP")

fviz_pca_ind(pca, axes = c(1, 3), geom = "point", col.ind = "cornflowerblue") +
  titre_acp_13
fviz_pca_ind(pca, axes = c(1, 3), geom = "point", col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
  titre_acp_13
fviz_pca_ind(pca, axes = c(1, 3), geom = "point", col.ind = coord$couleurs2, palette = c("green", "red")) +
  titre_acp_13

In [None]:
# Individuals on dimensions 2 and 3: plain, then coloured by `couleurs1`,
# then by `couleurs2`.

options(repr.plot.width = 10, repr.plot.height = 6)
titre_acp_23 <- ggtitle("Graphe des individus - ACP")

fviz_pca_ind(pca, axes = c(2, 3), geom = "point", col.ind = "cornflowerblue") +
  titre_acp_23
fviz_pca_ind(pca, axes = c(2, 3), geom = "point", col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
  titre_acp_23
fviz_pca_ind(pca, axes = c(2, 3), geom = "point", col.ind = coord$couleurs2, palette = c("green", "red")) +
  titre_acp_23

In [None]:
# Individuals on dimensions 2 and 4: plain, then coloured by `couleurs1`,
# then by `couleurs2`.

options(repr.plot.width = 10, repr.plot.height = 6)

# The uncoloured plot now uses axes 2 and 4 like the other two; it previously
# repeated axes 2 and 3 from the preceding cell.
fviz_pca_ind(pca, geom = c("point"), col.ind = "cornflowerblue", axes = c(2, 4)) + ggtitle("Graphe des individus - ACP")
fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue"), axes = c(2, 4)) + ggtitle("Graphe des individus - ACP")
fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red"), axes = c(2, 4)) + ggtitle("Graphe des individus - ACP")

# Méthodes de Clustering

Il est possible d'effectuer le clustering sur le jeu de données entier ou sur le jeu de données réduit. Ce choix dépend notamment de la taille du jeu de données.

## Jeu de données réduit

In [None]:
# Reduced data set: coordinates of the individuals on the first five
# principal components.
loading_reduced <- as.data.frame(pca$ind$coord[, 1:5])

In [None]:
# First six rows of the reduced data set.
head(loading_reduced, n = 6L)

### Clustering avec la méthode $k$-means

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# Within-cluster sum of squares of k-means for a range of cluster counts.
fviz_nbclust(x = loading_reduced, FUNcluster = kmeans, method = "wss") +
  labs(title = "Within sum of square (WSS) according to the number of clusters")

In [None]:
# Average silhouette width of k-means for a range of cluster counts.
fviz_nbclust(x = loading_reduced, FUNcluster = kmeans, method = "silhouette") +
  labs(title = "Silhouette score according to the number of clusters")

In [None]:
# Silhouette plots for k-means with 2, 3 and 4 clusters.
options(repr.plot.width = 15, repr.plot.height = 6)

# Pairwise distances between individuals; identical for every k, so they are
# computed once.
distances_reduites <- dist(loading_reduced)

# Run k-means for each k in turn (same order as 2, 3, 4) and keep the
# silhouette plot. `reskmeans` and `sil` end up holding the k = 4 results.
nombres_clusters <- 2:4
graphes_silhouette <- vector("list", length(nombres_clusters))
for (j in seq_along(nombres_clusters)) {
  reskmeans <- kmeans(loading_reduced, centers = nombres_clusters[j])
  sil <- silhouette(reskmeans$cluster, distances_reduites)
  graphes_silhouette[[j]] <- fviz_silhouette(sil)
}

p1 <- graphes_silhouette[[1]]
p2 <- graphes_silhouette[[2]]
p3 <- graphes_silhouette[[3]]

grid.arrange(p1, p2, p3, ncol = 2)

In [None]:
# k-means on the reduced data with 6, 4 and 3 clusters (run in that order).
reskmeans1 <- kmeans(loading_reduced, centers = 6)
reskmeans2 <- kmeans(loading_reduced, centers = 4)
reskmeans3 <- kmeans(loading_reduced, centers = 3)

# Number of stations assigned to each cluster.
cluster_counts1 <- table(reskmeans1$cluster)
cluster_counts2 <- table(reskmeans2$cluster)
cluster_counts3 <- table(reskmeans3$cluster)

# One bar chart per clustering; cluster k is drawn with colour k + 1.
barplot(
  cluster_counts1,
  col = 2:7,
  main = "Répartition des stations pour 6 clusters - K-means",
  xlab = "Cluster", ylab = "Nombre de stations"
)
barplot(
  cluster_counts2,
  col = 2:5,
  main = "Répartition des stations pour 4 clusters - K-means",
  xlab = "Cluster", ylab = "Nombre de stations"
)
barplot(
  cluster_counts3,
  col = 2:4,
  main = "Répartition des stations pour 3 clusters - K-means",
  xlab = "Cluster", ylab = "Nombre de stations"
)

#### Avec 6 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row positions of the stations assigned to each of the 6 k-means clusters.
cluster1_kmeans_6 <- which(reskmeans1$cluster == 1)
cluster2_kmeans_6 <- which(reskmeans1$cluster == 2)
cluster3_kmeans_6 <- which(reskmeans1$cluster == 3)
cluster4_kmeans_6 <- which(reskmeans1$cluster == 4)
cluster5_kmeans_6 <- which(reskmeans1$cluster == 5)
cluster6_kmeans_6 <- which(reskmeans1$cluster == 6)

# Mean loading profile of each cluster (one value per column of `loading`).
# drop = FALSE keeps a matrix even when a cluster holds a single station;
# without it the subset collapses to a vector and colMeans() fails.
mean1_kmeans_6 <- colMeans(loading[cluster1_kmeans_6, , drop = FALSE])
mean2_kmeans_6 <- colMeans(loading[cluster2_kmeans_6, , drop = FALSE])
mean3_kmeans_6 <- colMeans(loading[cluster3_kmeans_6, , drop = FALSE])
mean4_kmeans_6 <- colMeans(loading[cluster4_kmeans_6, , drop = FALSE])
mean5_kmeans_6 <- colMeans(loading[cluster5_kmeans_6, , drop = FALSE])
mean6_kmeans_6 <- colMeans(loading[cluster6_kmeans_6, , drop = FALSE])

x <- 1:168

# The six profiles overlaid on one plot; cluster k uses colour k + 1.
plot(x, mean1_kmeans_6, type = "l", col = 2, xlab = "Heures de la semaine", ylab = "Valeur mesurée", main = "Valeur mesurée pour 6 clusters - K-means", xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_kmeans_6, col = 3)
lines(x, mean3_kmeans_6, col = 4)
lines(x, mean4_kmeans_6, col = 5)
lines(x, mean5_kmeans_6, col = 6)
lines(x, mean6_kmeans_6, col = 7)

legend("topright", legend = paste("Cluster", 1:6), col = 2:7, lty = 1)

# Individuals on the first two principal components, coloured by cluster.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans1$cluster), palette = 2:7) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Map of the stations, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_kmeans_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_kmeans_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One separate plot per cluster profile, on a common 0-1 scale; cluster k is
# drawn with colour k + 1.
profils_kmeans_6 <- list(
  mean1_kmeans_6, mean2_kmeans_6, mean3_kmeans_6,
  mean4_kmeans_6, mean5_kmeans_6, mean6_kmeans_6
)
for (k in seq_along(profils_kmeans_6)) {
  plot(
    x, profils_kmeans_6[[k]],
    type = "l", col = k + 1,
    xlab = "Heures de la semaine", ylab = "Valeur mesurée",
    main = paste("Cluster", k),
    xlim = c(1, 168), ylim = c(0, 1)
  )
}

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Title styling shared by the three maps.
titre_centre <- theme(plot.title = element_text(size = 15, hjust = 0.5))

# Stations by k-means cluster (6 clusters), one layer and colour per cluster.
carte_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_kmeans_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_kmeans_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)
  ) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  titre_centre

# Stations by `bonus` group.
carte_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  titre_centre

# Stations by `localisation` group.
carte_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  titre_centre

# The three maps side by side.
grid.arrange(carte_clusters, carte_altitude, carte_seine, ncol = 3)

In [None]:
# Cross-tabulation of the 6-cluster k-means labels with the `localisation`
# and `bonus` flags.
tbl1 <- table(coord$localisation, reskmeans1$cluster)
tbl2 <- table(coord$bonus, reskmeans1$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)

# mosaicplot() is a base-graphics function: it draws as a side effect and
# returns NULL, so it cannot be laid out with grid.arrange(). The two plots
# are placed side by side with par(mfrow = ...) instead, and the previous
# graphical parameters are restored afterwards.
anciens_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:7, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:7, main = "Mosaicplot clusters-altitude")
par(anciens_par)

options(repr.plot.width = 20, repr.plot.height = 6)

# Individuals on the first two components, coloured by cluster, by `bonus`
# group and by `localisation` group.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans1$cluster), palette = 2:7) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 4 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row positions of the stations assigned to each of the 4 k-means clusters.
cluster1_kmeans_4 <- which(reskmeans2$cluster == 1)
cluster2_kmeans_4 <- which(reskmeans2$cluster == 2)
cluster3_kmeans_4 <- which(reskmeans2$cluster == 3)
cluster4_kmeans_4 <- which(reskmeans2$cluster == 4)

# Mean loading profile of each cluster (one value per column of `loading`).
# drop = FALSE keeps a matrix even when a cluster holds a single station;
# without it the subset collapses to a vector and colMeans() fails.
mean1_kmeans_4 <- colMeans(loading[cluster1_kmeans_4, , drop = FALSE])
mean2_kmeans_4 <- colMeans(loading[cluster2_kmeans_4, , drop = FALSE])
mean3_kmeans_4 <- colMeans(loading[cluster3_kmeans_4, , drop = FALSE])
mean4_kmeans_4 <- colMeans(loading[cluster4_kmeans_4, , drop = FALSE])

x <- 1:168

# The four profiles overlaid on one plot; cluster k uses colour k + 1.
plot(x, mean1_kmeans_4, type = "l", col = 2, xlab = "Heures de la semaine", ylab = "Valeur mesurée", main = "Valeur mesurée pour 4 clusters - K-means", xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_kmeans_4, col = 3)
lines(x, mean3_kmeans_4, col = 4)
lines(x, mean4_kmeans_4, col = 5)

legend("topright", legend = paste("Cluster", 1:4), col = 2:5, lty = 1)

# Individuals on the first two principal components, coloured by cluster.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans2$cluster), palette = 2:5) +
  ggtitle("Graphe des individus de l'ACP pour 4 clusters")

# Map of the stations, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_4, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_4, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_4, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_4, ], aes(shape = "cluster4"), color = 5, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18)) +
  labs(title = "Stations loading - 4 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One separate plot per cluster profile, on a common 0-1 scale; cluster k is
# drawn with colour k + 1.
profils_kmeans_4 <- list(
  mean1_kmeans_4, mean2_kmeans_4, mean3_kmeans_4, mean4_kmeans_4
)
for (k in seq_along(profils_kmeans_4)) {
  plot(
    x, profils_kmeans_4[[k]],
    type = "l", col = k + 1,
    xlab = "Heures de la semaine", ylab = "Valeur mesurée",
    main = paste("Cluster", k),
    xlim = c(1, 168), ylim = c(0, 1)
  )
}

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Title styling shared by the three maps.
titre_centre <- theme(plot.title = element_text(size = 15, hjust = 0.5))

# Stations by k-means cluster (4 clusters), one layer and colour per cluster.
carte_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_4, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_4, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_4, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_4, ], aes(shape = "cluster4"), color = 5, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18)
  ) +
  labs(title = "Stations loading - 4 Clusters - Méthode K-means") +
  titre_centre

# Stations by `bonus` group.
carte_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  titre_centre

# Stations by `localisation` group.
carte_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  titre_centre

# The three maps side by side.
grid.arrange(carte_clusters, carte_altitude, carte_seine, ncol = 3)

In [None]:
# Cross-tabulation of the 4-cluster k-means labels with the `localisation`
# and `bonus` flags.
tbl1 <- table(coord$localisation, reskmeans2$cluster)
tbl2 <- table(coord$bonus, reskmeans2$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)

# mosaicplot() is a base-graphics function: it draws as a side effect and
# returns NULL, so it cannot be laid out with grid.arrange(). The two plots
# are placed side by side with par(mfrow = ...) instead, and the previous
# graphical parameters are restored afterwards.
anciens_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:5, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:5, main = "Mosaicplot clusters-altitude")
par(anciens_par)

options(repr.plot.width = 20, repr.plot.height = 6)

# Individuals on the first two components, coloured by cluster, by `bonus`
# group and by `localisation` group.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans2$cluster), palette = 2:5) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 3 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row positions of the stations assigned to each of the 3 k-means clusters.
cluster1_kmeans_3 <- which(reskmeans3$cluster == 1)
cluster2_kmeans_3 <- which(reskmeans3$cluster == 2)
cluster3_kmeans_3 <- which(reskmeans3$cluster == 3)

# Mean loading profile of each cluster (one value per column of `loading`).
# drop = FALSE keeps a matrix even when a cluster holds a single station;
# without it the subset collapses to a vector and colMeans() fails.
mean1_kmeans_3 <- colMeans(loading[cluster1_kmeans_3, , drop = FALSE])
mean2_kmeans_3 <- colMeans(loading[cluster2_kmeans_3, , drop = FALSE])
mean3_kmeans_3 <- colMeans(loading[cluster3_kmeans_3, , drop = FALSE])

x <- 1:168

# The three profiles overlaid on one plot; cluster k uses colour k + 1.
plot(x, mean1_kmeans_3, type = "l", col = 2, xlab = "Heures de la semaine", ylab = "Valeur mesurée", main = "Valeur mesurée pour 3 clusters - K-means", xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_kmeans_3, col = 3)
lines(x, mean3_kmeans_3, col = 4)

legend("topright", legend = paste("Cluster", 1:3), col = 2:4, lty = 1)

# Individuals on the first two principal components, coloured by cluster.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans3$cluster), palette = 2:4) +
  ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Map of the stations, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One separate plot per cluster profile, on a common 0-1 scale; cluster k is
# drawn with colour k + 1.
profils_kmeans_3 <- list(mean1_kmeans_3, mean2_kmeans_3, mean3_kmeans_3)
for (k in seq_along(profils_kmeans_3)) {
  plot(
    x, profils_kmeans_3[[k]],
    type = "l", col = k + 1,
    xlab = "Heures de la semaine", ylab = "Valeur mesurée",
    main = paste("Cluster", k),
    xlim = c(1, 168), ylim = c(0, 1)
  )
}

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the K-means partition (k = 3), the
# "Vallée"/"Colline" groups and the "Proche de la Seine"/"Autres" groups.
# Built inside local() so the intermediate plot objects stay out of the
# global environment.
local({
  centred_title <- theme(plot.title = element_text(size = 15, hjust = 0.5))

  map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[cluster1_kmeans_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
    geom_point(data = coord[cluster2_kmeans_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
    geom_point(data = coord[cluster3_kmeans_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
    scale_shape_manual(
      name = "Legend",
      values = setNames(rep(18, 3), paste0("cluster", 1:3))
    ) +
    labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
    centred_title

  map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
    geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    centred_title

  map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
    geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    centred_title

  grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)
})

In [None]:
# Cross-tabulate the K-means partition (k = 3) with coord$localisation and
# coord$bonus, then compare the partition with both groupings on the PCA plane.
tbl1 <- table(coord$localisation, reskmeans3$cluster)
tbl2 <- table(coord$bonus, reskmeans3$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is a base-graphics call: it draws as a side effect and returns
# NULL, so grid.arrange() has no grob to place. Use a 1 x 2 base layout
# instead and restore the previous graphical parameters afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:4, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:4, main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# PCA individuals coloured by cluster, by coord$couleurs2 (green/red) and by
# coord$couleurs1 (yellow/blue).
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point",
               habillage = as.factor(reskmeans3$cluster), palette = 2:4) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs2,
               palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs1,
               palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

### Clustering agglomérant avec la méthode CAH

In [None]:
# Pairwise Euclidean distances between the rows of loading_reduced.
d <- dist(loading_reduced, method = "euclidean")

# Hierarchical clustering of the same distances with three linkage criteria.
hclustsingle <- hclust(d, method = "single")
hclustcomplete <- hclust(d, method = "complete")
hclustaverage <- hclust(d, method = "average")

# One dendrogram per linkage, without leaf labels.
options(repr.plot.width = 10, repr.plot.height = 10)
fviz_dend(hclustsingle, show_labels = FALSE, main = "Dendrogram - Single linkage")
fviz_dend(hclustcomplete, show_labels = FALSE, main = "Dendrogram - Complete linkage")
fviz_dend(hclustaverage, show_labels = FALSE, main = "Dendrogram - Average linkage")

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)

# Number-of-clusters diagnostics for hierarchical clustering (hcut):
# within-cluster sum of squares on the left, average silhouette on the right.
grid.arrange(
  fviz_nbclust(loading_reduced, FUNcluster = hcut, method = "wss") +
    ggtitle("WSS according to nb of clusters"),
  fviz_nbclust(loading_reduced, FUNcluster = hcut, method = "silhouette") +
    ggtitle("silhouette according to nb of clusters"),
  ncol = 2
)

In [None]:
# Cut the complete-linkage tree into 6, 4 and 3 clusters.
reshclust1 <- cutree(hclustcomplete, k = 6)
reshclust2 <- cutree(hclustcomplete, k = 4)
reshclust3 <- cutree(hclustcomplete, k = 3)

# Number of stations per cluster for each cut.
cluster_counts1 <- table(reshclust1)
cluster_counts2 <- table(reshclust2)
cluster_counts3 <- table(reshclust3)

barplot(
  cluster_counts1,
  main = "Répartition des stations pour 6 clusters - CAH",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:7
)
barplot(
  cluster_counts2,
  main = "Répartition des stations pour 4 clusters - CAH",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:5
)
barplot(
  cluster_counts3,
  main = "Répartition des stations pour 3 clusters - CAH",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:4
)

#### Avec 6 clusters

In [None]:
# Membership for the 6-cluster cut of the complete-linkage tree, and the
# dendrogram with those 6 groups boxed.
reshclust1 <- cutree(hclustcomplete, k = 6)
fviz_dend(
  hclustcomplete,
  k = 6, show_labels = FALSE, rect = TRUE, palette = 2:7
)

In [None]:
# Mean weekly loading profile of each CAH cluster (6-cluster cut), then the
# same partition shown on the first PCA plane and on the station map.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each cluster.
cluster1_cah_6 <- which(reshclust1 == 1)
cluster2_cah_6 <- which(reshclust1 == 2)
cluster3_cah_6 <- which(reshclust1 == 3)
cluster4_cah_6 <- which(reshclust1 == 4)
cluster5_cah_6 <- which(reshclust1 == 5)
cluster6_cah_6 <- which(reshclust1 == 6)

# Mean loading per time column within each cluster. `drop = FALSE` keeps the
# row subset a matrix, so colMeans() still works for a one-station cluster.
mean1_cah_6 <- colMeans(loading[cluster1_cah_6, , drop = FALSE])
mean2_cah_6 <- colMeans(loading[cluster2_cah_6, , drop = FALSE])
mean3_cah_6 <- colMeans(loading[cluster3_cah_6, , drop = FALSE])
mean4_cah_6 <- colMeans(loading[cluster4_cah_6, , drop = FALSE])
mean5_cah_6 <- colMeans(loading[cluster5_cah_6, , drop = FALSE])
mean6_cah_6 <- colMeans(loading[cluster6_cah_6, , drop = FALSE])

x <- 1:168

# Overlay the cluster profiles on a common [0, 1] vertical scale.
plot(
  x, mean1_cah_6,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 6 clusters - CAH",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_cah_6, col = 3)
lines(x, mean3_cah_6, col = 4)
lines(x, mean4_cah_6, col = 5)
lines(x, mean5_cah_6, col = 6)
lines(x, mean6_cah_6, col = 7)

# Legend
legend("topright", legend = paste("Cluster", 1:6), col = 2:7, lty = 1)

# Individuals on PCA axes 1 and 2, coloured by cluster.
fviz_pca_ind(
  pca,
  axes = c(1, 2), geom = "point",
  habillage = as.factor(reshclust1), palette = 2:7
) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Stations at their longitude/latitude, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_cah_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_cah_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_cah_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- seq_len(168)

# Disabled day/noon guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per CAH cluster (6-cluster cut); the colour index is the cluster
# number plus one.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(
    mean1_cah_6, mean2_cah_6, mean3_cah_6,
    mean4_cah_6, mean5_cah_6, mean6_cah_6
  ),
  1:6
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the CAH partition (6 clusters), the
# "Vallée"/"Colline" groups and the "Proche de la Seine"/"Autres" groups.
# Built inside local() so the intermediate plot objects stay out of the
# global environment.
local({
  centred_title <- theme(plot.title = element_text(size = 15, hjust = 0.5))

  map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[cluster1_cah_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
    geom_point(data = coord[cluster2_cah_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
    geom_point(data = coord[cluster3_cah_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
    geom_point(data = coord[cluster4_cah_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
    geom_point(data = coord[cluster5_cah_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
    geom_point(data = coord[cluster6_cah_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
    scale_shape_manual(
      name = "Legend",
      values = setNames(rep(18, 6), paste0("cluster", 1:6))
    ) +
    labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
    centred_title

  map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
    geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    centred_title

  map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
    geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    centred_title

  grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)
})

In [None]:
# Cross-tabulate the CAH partition (6 clusters) with coord$localisation and
# coord$bonus, then compare the partition with both groupings on the PCA plane.
tbl1 <- table(coord$localisation, reshclust1)
tbl2 <- table(coord$bonus, reshclust1)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is a base-graphics call: it draws as a side effect and returns
# NULL, so grid.arrange() has no grob to place. Use a 1 x 2 base layout
# instead and restore the previous graphical parameters afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:7, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:7, main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# PCA individuals coloured by cluster, by coord$couleurs2 (green/red) and by
# coord$couleurs1 (yellow/blue).
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point",
               habillage = as.factor(reshclust1), palette = 2:7) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs2,
               palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs1,
               palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 4 clusters

In [None]:
# Membership for the 4-cluster cut of the complete-linkage tree, and the
# dendrogram with those 4 groups boxed.
reshclust2 <- cutree(hclustcomplete, k = 4)
fviz_dend(
  hclustcomplete,
  k = 4, show_labels = FALSE, rect = TRUE, palette = 2:5
)

In [None]:
# Mean weekly loading profile of each CAH cluster (4-cluster cut), then the
# same partition shown on the first PCA plane and on the station map.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each cluster.
cluster1_cah_4 <- which(reshclust2 == 1)
cluster2_cah_4 <- which(reshclust2 == 2)
cluster3_cah_4 <- which(reshclust2 == 3)
cluster4_cah_4 <- which(reshclust2 == 4)

# Mean loading per time column within each cluster. `drop = FALSE` keeps the
# row subset a matrix, so colMeans() still works for a one-station cluster.
mean1_cah_4 <- colMeans(loading[cluster1_cah_4, , drop = FALSE])
mean2_cah_4 <- colMeans(loading[cluster2_cah_4, , drop = FALSE])
mean3_cah_4 <- colMeans(loading[cluster3_cah_4, , drop = FALSE])
mean4_cah_4 <- colMeans(loading[cluster4_cah_4, , drop = FALSE])

x <- 1:168

# Overlay the cluster profiles on a common [0, 1] vertical scale.
plot(
  x, mean1_cah_4,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 4 clusters - CAH",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_cah_4, col = 3)
lines(x, mean3_cah_4, col = 4)
lines(x, mean4_cah_4, col = 5)

# Legend
legend("topright", legend = paste("Cluster", 1:4), col = 2:5, lty = 1)

# Individuals on PCA axes 1 and 2, coloured by cluster. The palette holds
# exactly four colours (2:5), matching the legend and the map below.
fviz_pca_ind(
  pca,
  axes = c(1, 2), geom = "point",
  habillage = as.factor(reshclust2), palette = 2:5
) +
  ggtitle("Graphe des individus de l'ACP pour 4 clusters")

# Stations at their longitude/latitude, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_4, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_4, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_4, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_cah_4, ], aes(shape = "cluster4"), color = 5, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18)) +
  labs(title = "Stations loading - 4 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- seq_len(168)

# Disabled day/noon guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per CAH cluster (4-cluster cut); the colour index is the cluster
# number plus one.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_cah_4, mean2_cah_4, mean3_cah_4, mean4_cah_4),
  1:4
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the CAH partition (4 clusters), the
# "Vallée"/"Colline" groups and the "Proche de la Seine"/"Autres" groups.
# Built inside local() so the intermediate plot objects stay out of the
# global environment.
local({
  centred_title <- theme(plot.title = element_text(size = 15, hjust = 0.5))

  map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[cluster1_cah_4, ], aes(shape = "cluster1"), color = 2, size = 2) +
    geom_point(data = coord[cluster2_cah_4, ], aes(shape = "cluster2"), color = 3, size = 2) +
    geom_point(data = coord[cluster3_cah_4, ], aes(shape = "cluster3"), color = 4, size = 2) +
    geom_point(data = coord[cluster4_cah_4, ], aes(shape = "cluster4"), color = 5, size = 2) +
    scale_shape_manual(
      name = "Legend",
      values = setNames(rep(18, 4), paste0("cluster", 1:4))
    ) +
    labs(title = "Stations loading - 4 Clusters - Méthode CAH") +
    centred_title

  map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
    geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    centred_title

  map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
    geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    centred_title

  grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)
})

In [None]:
# Cross-tabulate the CAH partition (4 clusters) with coord$localisation and
# coord$bonus, then compare the partition with both groupings on the PCA plane.
tbl1 <- table(coord$localisation, reshclust2)
tbl2 <- table(coord$bonus, reshclust2)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is a base-graphics call: it draws as a side effect and returns
# NULL, so grid.arrange() has no grob to place. Use a 1 x 2 base layout
# instead and restore the previous graphical parameters afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:5, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:5, main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# PCA individuals coloured by cluster, by coord$couleurs2 (green/red) and by
# coord$couleurs1 (yellow/blue).
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point",
               habillage = as.factor(reshclust2), palette = 2:5) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs2,
               palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs1,
               palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 3 clusters

In [None]:
# Membership for the 3-cluster cut of the complete-linkage tree, and the
# dendrogram with those 3 groups boxed.
reshclust3 <- cutree(hclustcomplete, k = 3)
fviz_dend(
  hclustcomplete,
  k = 3, show_labels = FALSE, rect = TRUE, palette = 2:4
)

In [None]:
# Mean weekly loading profile of each CAH cluster (3-cluster cut), then the
# same partition shown on the first PCA plane and on the station map.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each cluster.
cluster1_cah_3 <- which(reshclust3 == 1)
cluster2_cah_3 <- which(reshclust3 == 2)
cluster3_cah_3 <- which(reshclust3 == 3)

# Mean loading per time column within each cluster. `drop = FALSE` keeps the
# row subset a matrix, so colMeans() still works for a one-station cluster.
mean1_cah_3 <- colMeans(loading[cluster1_cah_3, , drop = FALSE])
mean2_cah_3 <- colMeans(loading[cluster2_cah_3, , drop = FALSE])
mean3_cah_3 <- colMeans(loading[cluster3_cah_3, , drop = FALSE])

x <- 1:168

# Overlay the cluster profiles on a common [0, 1] vertical scale.
plot(
  x, mean1_cah_3,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 3 clusters - CAH",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_cah_3, col = 3)
lines(x, mean3_cah_3, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:3), col = 2:4, lty = 1)

# Individuals on PCA axes 1 and 2, coloured by cluster.
fviz_pca_ind(
  pca,
  axes = c(1, 2), geom = "point",
  habillage = as.factor(reshclust3), palette = 2:4
) +
  ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Stations at their longitude/latitude, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
# Disabled day/noon guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

x <- seq_len(168)

# One figure per CAH cluster (3-cluster cut); the colour index is the cluster
# number plus one.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_cah_3, mean2_cah_3, mean3_cah_3),
  1:3
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the CAH partition (3 clusters), the
# "Vallée"/"Colline" groups and the "Proche de la Seine"/"Autres" groups.
# Built inside local() so the intermediate plot objects stay out of the
# global environment.
local({
  centred_title <- theme(plot.title = element_text(size = 15, hjust = 0.5))

  map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[cluster1_cah_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
    geom_point(data = coord[cluster2_cah_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
    geom_point(data = coord[cluster3_cah_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
    scale_shape_manual(
      name = "Legend",
      values = setNames(rep(18, 3), paste0("cluster", 1:3))
    ) +
    labs(title = "Stations loading - 3 Clusters - CAH") +
    centred_title

  map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
    geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    centred_title

  map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
    geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    centred_title

  grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)
})

In [None]:
# Cross-tabulate the CAH partition (3 clusters) with coord$localisation and
# coord$bonus, then compare the partition with both groupings on the PCA plane.
tbl1 <- table(coord$localisation, reshclust3)
tbl2 <- table(coord$bonus, reshclust3)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is a base-graphics call: it draws as a side effect and returns
# NULL, so grid.arrange() has no grob to place. Use a 1 x 2 base layout
# instead and restore the previous graphical parameters afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:4, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:4, main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# PCA individuals coloured by cluster, by coord$couleurs2 (green/red) and by
# coord$couleurs1 (yellow/blue).
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point",
               habillage = as.factor(reshclust3), palette = 2:4) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs2,
               palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs1,
               palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

### Clustering avec les modèles de mélange gaussiens (GMM)

In [None]:
# BIC values returned by mclustBIC() for 2 to 20 mixture components.
resBICall <- mclustBIC(loading_reduced, G = 2:20)
summary(resBICall)

# Fit the mixture over the same range of components. This overwrites
# resBICall with the Mclust fit, which is what the plot below uses.
resBICall <- Mclust(loading_reduced, G = 2:20)
summary(resBICall)

fviz_mclust(resBICall, what = "BIC")

La solution optimale comporte 10 clusters avec le modèle VVE, mais nous conservons ici 6, 4 et 3 clusters par cohérence avec ce que nous avons fait précédemment.

In [None]:
# Gaussian mixtures with the "VVE" model and 6, 4 and 3 components.
resBIC1 <- Mclust(loading_reduced, G = 6, modelNames = "VVE")
resBIC2 <- Mclust(loading_reduced, G = 4, modelNames = "VVE")
resBIC3 <- Mclust(loading_reduced, G = 3, modelNames = "VVE")

# Number of stations per component for each fit.
cluster_counts1 <- table(resBIC1$classification)
cluster_counts2 <- table(resBIC2$classification)
cluster_counts3 <- table(resBIC3$classification)

barplot(
  cluster_counts1,
  main = "Répartition des stations pour 6 clusters - GMM",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:7
)
barplot(
  cluster_counts2,
  main = "Répartition des stations pour 4 clusters - GMM",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:5
)
barplot(
  cluster_counts3,
  main = "Répartition des stations pour 3 clusters - GMM",
  xlab = "Cluster", ylab = "Nombre de stations", col = 2:4
)

Ici, nous n'utilisons que le critère BIC et non le critère ICL, car les deux critères donnent des résultats identiques.

#### Avec 6 clusters

In [None]:
# Mean weekly loading profile of each GMM cluster (6 components), then the
# same partition shown on the first PCA plane and on the station map.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each cluster.
cluster1_gmm_6 <- which(resBIC1$classification == 1)
cluster2_gmm_6 <- which(resBIC1$classification == 2)
cluster3_gmm_6 <- which(resBIC1$classification == 3)
cluster4_gmm_6 <- which(resBIC1$classification == 4)
cluster5_gmm_6 <- which(resBIC1$classification == 5)
cluster6_gmm_6 <- which(resBIC1$classification == 6)

# Mean loading per time column within each cluster. `drop = FALSE` keeps the
# row subset a matrix, so colMeans() still works for a one-station cluster.
mean1_gmm_6 <- colMeans(loading[cluster1_gmm_6, , drop = FALSE])
mean2_gmm_6 <- colMeans(loading[cluster2_gmm_6, , drop = FALSE])
mean3_gmm_6 <- colMeans(loading[cluster3_gmm_6, , drop = FALSE])
mean4_gmm_6 <- colMeans(loading[cluster4_gmm_6, , drop = FALSE])
mean5_gmm_6 <- colMeans(loading[cluster5_gmm_6, , drop = FALSE])
mean6_gmm_6 <- colMeans(loading[cluster6_gmm_6, , drop = FALSE])

x <- 1:168

# Overlay the cluster profiles on a common [0, 1] vertical scale.
plot(
  x, mean1_gmm_6,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 6 clusters - GMM",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_gmm_6, col = 3)
lines(x, mean3_gmm_6, col = 4)
lines(x, mean4_gmm_6, col = 5)
lines(x, mean5_gmm_6, col = 6)
lines(x, mean6_gmm_6, col = 7)

# Legend
legend("topright", legend = paste("Cluster", 1:6), col = 2:7, lty = 1)

# Individuals on PCA axes 1 and 2, coloured by cluster.
fviz_pca_ind(
  pca,
  axes = c(1, 2), geom = "point",
  habillage = as.factor(resBIC1$classification), palette = 2:7
) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Stations at their longitude/latitude, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_gmm_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_gmm_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_gmm_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- seq_len(168)

# Disabled day/noon guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per GMM cluster (6 components); the colour index is the cluster
# number plus one.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(
    mean1_gmm_6, mean2_gmm_6, mean3_gmm_6,
    mean4_gmm_6, mean5_gmm_6, mean6_gmm_6
  ),
  1:6
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the GMM partition (6 clusters), the
# "Vallée"/"Colline" groups and the "Proche de la Seine"/"Autres" groups.
# Built inside local() so the intermediate plot objects stay out of the
# global environment.
local({
  centred_title <- theme(plot.title = element_text(size = 15, hjust = 0.5))

  map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[cluster1_gmm_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
    geom_point(data = coord[cluster2_gmm_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
    geom_point(data = coord[cluster3_gmm_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
    geom_point(data = coord[cluster4_gmm_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
    geom_point(data = coord[cluster5_gmm_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
    geom_point(data = coord[cluster6_gmm_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
    scale_shape_manual(
      name = "Legend",
      values = setNames(rep(18, 6), paste0("cluster", 1:6))
    ) +
    labs(title = "Stations loading - 6 Clusters - Méthode GMM") +
    centred_title

  map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
    geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    centred_title

  map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
    geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    centred_title

  grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)
})

In [None]:
# Cross-tabulate the GMM partition (6 clusters) with coord$localisation and
# coord$bonus, then compare the partition with both groupings on the PCA plane.
tbl1 <- table(coord$localisation, resBIC1$classification)
tbl2 <- table(coord$bonus, resBIC1$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is a base-graphics call: it draws as a side effect and returns
# NULL, so grid.arrange() has no grob to place. Use a 1 x 2 base layout
# instead and restore the previous graphical parameters afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = 2:7, main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = 2:7, main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# PCA individuals coloured by cluster, by coord$couleurs2 (green/red) and by
# coord$couleurs1 (yellow/blue).
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point",
               habillage = as.factor(resBIC1$classification), palette = 2:7) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs2,
               palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = "point", col.ind = coord$couleurs1,
               palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 4 clusters

In [None]:
# Mean weekly loading profile of each GMM cluster (4 components), then the
# same partition shown on the first PCA plane and on the station map.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each cluster.
cluster1_gmm_4 <- which(resBIC2$classification == 1)
cluster2_gmm_4 <- which(resBIC2$classification == 2)
cluster3_gmm_4 <- which(resBIC2$classification == 3)
cluster4_gmm_4 <- which(resBIC2$classification == 4)

# Mean loading per time column within each cluster. `drop = FALSE` keeps the
# row subset a matrix, so colMeans() still works for a one-station cluster.
mean1_gmm_4 <- colMeans(loading[cluster1_gmm_4, , drop = FALSE])
mean2_gmm_4 <- colMeans(loading[cluster2_gmm_4, , drop = FALSE])
mean3_gmm_4 <- colMeans(loading[cluster3_gmm_4, , drop = FALSE])
mean4_gmm_4 <- colMeans(loading[cluster4_gmm_4, , drop = FALSE])

x <- 1:168

# Overlay the cluster profiles on a common [0, 1] vertical scale.
plot(
  x, mean1_gmm_4,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 4 clusters - GMM",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_gmm_4, col = 3)
lines(x, mean3_gmm_4, col = 4)
lines(x, mean4_gmm_4, col = 5)

# Legend
legend("topright", legend = paste("Cluster", 1:4), col = 2:5, lty = 1)

# Individuals on PCA axes 1 and 2, coloured by cluster. The title names the
# 4-cluster partition actually plotted here.
fviz_pca_ind(
  pca,
  axes = c(1, 2), geom = "point",
  habillage = as.factor(resBIC2$classification), palette = 2:5
) +
  ggtitle("Graphe des individus de l'ACP pour 4 clusters")

# Stations at their longitude/latitude, one layer (and colour) per cluster.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_4, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_4, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_4, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_gmm_4, ], aes(shape = "cluster4"), color = 5, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18)) +
  labs(title = "Stations loading - 4 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- seq_len(168)

# Disabled day/noon guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per GMM cluster (4 components); the colour index is the cluster
# number plus one.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_gmm_4, mean2_gmm_4, mean3_gmm_4, mean4_gmm_4),
  1:4
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: GMM clusters, hill/valley split, and
# proximity to the Seine.
grid.arrange(
  # One diamond layer per GMM cluster, in base colours 2 to 5.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    Map(
      function(members, label, colour) {
        geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
      },
      list(cluster1_gmm_4, cluster2_gmm_4, cluster3_gmm_4, cluster4_gmm_4),
      paste0("cluster", 1:4),
      2:5
    ) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 4), paste0("cluster", 1:4))) +
    labs(title = "Stations loading - 4 Clusters - Méthode GMM") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  # Stations of `indices_bas` in green, stations of `indices_haut` in red.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(
      data = coord[indices_bas, ],
      aes(shape = "Vallée"), color = "green", size = 2
    ) +
    geom_point(
      data = coord[indices_haut, ],
      aes(shape = "Colline"), color = "red", size = 2
    ) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  # Stations of `indices_autres` in yellow, stations of `indices_seine` in blue.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(
      data = coord[indices_autres, ],
      aes(shape = "Autres"), color = "yellow", size = 2
    ) +
    geom_point(
      data = coord[indices_seine, ],
      aes(shape = "Proche de la Seine"), color = "blue", size = 2
    ) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  ncol = 3
)

In [None]:
# Contingency tables of the 4-cluster GMM partition against the two station
# attributes stored in `coord`.
tbl1 <- table(coord$localisation, resBIC2$classification)
tbl2 <- table(coord$bonus, resBIC2$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns NULL, so it cannot be
# handed to grid.arrange(), which only lays out grobs / ggplot objects.
# The two panels are placed side by side with par(mfrow) instead, and the
# previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:5), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:5), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by GMM cluster, then by `couleurs2` and `couleurs1`.
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
    fviz_pca_ind(pca, axes=c(1,2), geom=c("point"), habillage=as.factor(resBIC2$classification), palette=c(2:5)) + ggtitle("Individuals factor map - PCA "),
    fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green","red")) + ggtitle("Individuals factor map - PCA"),
    fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow","blue")) + ggtitle("Individuals factor map - PCA "),

    ncol=3
)

#### Avec 3 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 3 GMM components.
cluster1_gmm_3 <- which(resBIC3$classification == 1)
cluster2_gmm_3 <- which(resBIC3$classification == 2)
cluster3_gmm_3 <- which(resBIC3$classification == 3)

# Mean loading profile of each cluster. drop = FALSE keeps a matrix even when
# a cluster holds a single station, which colMeans() would otherwise reject.
mean1_gmm_3 <- colMeans(loading[cluster1_gmm_3, , drop = FALSE])
mean2_gmm_3 <- colMeans(loading[cluster2_gmm_3, , drop = FALSE])
mean3_gmm_3 <- colMeans(loading[cluster3_gmm_3, , drop = FALSE])

# One abscissa per column of `loading` (168 columns).
x <- 1:168

# The three mean profiles on a single figure, in base colours 2 to 4.
plot(x, mean1_gmm_3, type = "l", col = 2, xlab = "Heures de la semaine", ylab = "Valeur mesurée", main = "Valeur mesurée pour 3 clusters - GMM", xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_gmm_3, col = 3)
lines(x, mean3_gmm_3, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:3), col = c(2:4), lty = 1)

# PCA individuals coloured by GMM cluster.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC3$classification), palette = c(2:4)) + ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Station map, one layer per cluster, using the same colours as the curves.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

x <- 1:168

# One separate figure per GMM cluster: mean profile k is drawn in base
# colour k + 1, on the same axes for every cluster.
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_gmm_3, mean2_gmm_3, mean3_gmm_3),
  1:3
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: GMM clusters, hill/valley split, and
# proximity to the Seine.
grid.arrange(
  # One diamond layer per GMM cluster, in base colours 2 to 4.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    Map(
      function(members, label, colour) {
        geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
      },
      list(cluster1_gmm_3, cluster2_gmm_3, cluster3_gmm_3),
      paste0("cluster", 1:3),
      2:4
    ) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 3), paste0("cluster", 1:3))) +
    labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  # Stations of `indices_bas` in green, stations of `indices_haut` in red.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(
      data = coord[indices_bas, ],
      aes(shape = "Vallée"), color = "green", size = 2
    ) +
    geom_point(
      data = coord[indices_haut, ],
      aes(shape = "Colline"), color = "red", size = 2
    ) +
    scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
    labs(title = "Altitude des stations") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  # Stations of `indices_autres` in yellow, stations of `indices_seine` in blue.
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(
      data = coord[indices_autres, ],
      aes(shape = "Autres"), color = "yellow", size = 2
    ) +
    geom_point(
      data = coord[indices_seine, ],
      aes(shape = "Proche de la Seine"), color = "blue", size = 2
    ) +
    scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
    labs(title = "Stations proches ou non de la Seine") +
    theme(plot.title = element_text(size = 15, hjust = 0.5)),

  ncol = 3
)

In [None]:
# Contingency tables of the 3-cluster GMM partition against the two station
# attributes stored in `coord`.
tbl1 <- table(coord$localisation, resBIC3$classification)
tbl2 <- table(coord$bonus, resBIC3$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns NULL, so it cannot be
# handed to grid.arrange(), which only lays out grobs / ggplot objects.
# The two panels are placed side by side with par(mfrow) instead, and the
# previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:4), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by GMM cluster, then by `couleurs2` and `couleurs1`.
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
    fviz_pca_ind(pca, axes=c(1,2), geom=c("point"), habillage=as.factor(resBIC3$classification), palette=c(2:4)) + ggtitle("Individuals factor map - PCA "),
    fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green","red")) + ggtitle("Individuals factor map - PCA"),
    fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow","blue")) + ggtitle("Individuals factor map - PCA "),
    ncol=3
)

### Comparaison des méthodes

In [None]:
# Relabel the clusters of `classif2` so that they line up with `classif1`.
#
# Each label j of `classif2` is replaced by the index of the row of
# table(classif1, classif2) holding the largest count in column j, i.e. the
# `classif1` cluster that shares the most observations with cluster j (the
# first such row on ties).
#
# Both partitions are expected to use integer labels 1..K (as returned by
# kmeans(), cutree() or Mclust()), so that row/column positions of the
# contingency table coincide with the labels.
#
# The mapping is not forced to be one-to-one: two clusters of `classif2` may
# receive the same label, in which case some labels disappear from the result.
#
# @param classif1 Reference partition (vector of cluster labels).
# @param classif2 Partition to relabel, same length as `classif1`.
# @return `classif2` with its labels replaced; names and length are kept.
matchClasses <- function(classif1, classif2) {
  cm <- table(classif1, classif2)
  clusters <- classif2

  # Nothing to match on empty input (the former 1:K / 1:n loops failed here).
  if (length(cm) == 0L) {
    return(clusters)
  }

  # Best-matching row for every column. Iterating over the columns actually
  # present avoids indexing past the table when the two partitions do not
  # have the same number of clusters.
  best_row <- apply(cm, 2, which.max)

  # Comparisons are made against the untouched `classif2`, so an already
  # relabelled observation is never remapped a second time.
  for (j in seq_along(best_row)) {
    clusters[which(classif2 == j)] <- best_row[[j]]
  }

  clusters
}

#### Avec 6 clusters

In [None]:
# First PCA plane coloured by the 6-cluster partition of each method,
# one panel per method, side by side.
grid.arrange(
  grobs = Map(
    function(partition, panel_title) {
      fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(partition), palette = c(2:7)) +
        ggtitle(panel_title)
    },
    list(reskmeans1$cluster, reshclust1, resBIC1$classification),
    c("K-means pour 6 clusters", "CAH pour 6 clusters", "GMM pour 6 clusters")
  ),
  ncol = 3
)

##### k-means Vs GMM

In [None]:
# Relabel the 6-cluster k-means partition with the GMM numbering, then show
# how the two partitions agree.
new_cluster1 <- matchClasses(
  classif1 = resBIC1$classification,
  classif2 = reskmeans1$cluster
)
conf_mat <- confusion_matrix(
  targets = resBIC1$classification,
  predictions = new_cluster1
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of each k-means cluster after relabelling.
cluster1_kmeans_62 <- which(new_cluster1 == 1)
cluster2_kmeans_62 <- which(new_cluster1 == 2)
cluster3_kmeans_62 <- which(new_cluster1 == 3)
cluster4_kmeans_62 <- which(new_cluster1 == 4)
cluster5_kmeans_62 <- which(new_cluster1 == 5)
cluster6_kmeans_62 <- which(new_cluster1 == 6)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(
      cluster1_kmeans_62, cluster2_kmeans_62, cluster3_kmeans_62,
      cluster4_kmeans_62, cluster5_kmeans_62, cluster6_kmeans_62
    ),
    paste0("cluster", 1:6),
    2:7
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the GMM partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(
      cluster1_gmm_6, cluster2_gmm_6, cluster3_gmm_6,
      cluster4_gmm_6, cluster5_gmm_6, cluster6_gmm_6
    ),
    paste0("cluster", 1:6),
    2:7
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  labs(title = "Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their GMM cluster
# receive the sentinel label 7.
idx <- new_cluster1 != resBIC1$classification
clusters <- replace(new_cluster1, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster1)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(
      cluster1_kmeans_62, cluster2_kmeans_62, cluster3_kmeans_62,
      cluster4_kmeans_62, cluster5_kmeans_62, cluster6_kmeans_62
    ),
    paste0("cluster", 1:6),
    2:7
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 7), c(paste0("cluster", 1:6), "différence"))
  ) +
  labs(title = "Stations loading - 6 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### k-means Vs CAH

In [None]:
# Relabel the 6-cluster k-means partition with the CAH numbering, then show
# how the two partitions agree.
new_cluster2 <- matchClasses(
  classif1 = reshclust1,
  classif2 = reskmeans1$cluster
)
conf_mat <- confusion_matrix(
  targets = reshclust1,
  predictions = new_cluster2
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled k-means clusters. Only labels 1, 2, 3 and
# 5 are extracted and drawn -- presumably the relabelling left no station in
# the other labels; verify against the confusion matrix.
cluster1_kmeans_63 <- which(new_cluster2 == 1)
cluster2_kmeans_63 <- which(new_cluster2 == 2)
cluster3_kmeans_63 <- which(new_cluster2 == 3)
cluster5_kmeans_63 <- which(new_cluster2 == 5)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_kmeans_63, cluster2_kmeans_63, cluster3_kmeans_63, cluster5_kmeans_63),
    paste0("cluster", c(1, 2, 3, 5)),
    c(2, 3, 4, 6)
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 4), paste0("cluster", c(1, 2, 3, 5)))) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the CAH partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(
      cluster1_cah_6, cluster2_cah_6, cluster3_cah_6,
      cluster4_cah_6, cluster5_cah_6, cluster6_cah_6
    ),
    paste0("cluster", 1:6),
    2:7
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their CAH cluster
# receive the sentinel label 7.
idx <- new_cluster2 != reshclust1
clusters <- replace(new_cluster2, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster2)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_kmeans_63, cluster2_kmeans_63, cluster3_kmeans_63, cluster5_kmeans_63),
    paste0("cluster", c(1, 2, 3, 5)),
    c(2, 3, 4, 6)
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 5), c(paste0("cluster", c(1, 2, 3, 5)), "différence"))
  ) +
  labs(title = "Stations loading - 6 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### CAH Vs GMM

In [None]:
# Relabel the 6-cluster CAH partition with the GMM numbering, then show how
# the two partitions agree.
new_cluster3 <- matchClasses(
  classif1 = resBIC1$classification,
  classif2 = reshclust1
)
conf_mat <- confusion_matrix(
  targets = resBIC1$classification,
  predictions = new_cluster3
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled CAH clusters. Only labels 1 to 5 are
# extracted and drawn -- presumably no station carries label 6 after
# relabelling; verify against the confusion matrix.
cluster1_cah_62 <- which(new_cluster3 == 1)
cluster2_cah_62 <- which(new_cluster3 == 2)
cluster3_cah_62 <- which(new_cluster3 == 3)
cluster4_cah_62 <- which(new_cluster3 == 4)
cluster5_cah_62 <- which(new_cluster3 == 5)

# Map of the relabelled CAH partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_cah_62, cluster2_cah_62, cluster3_cah_62, cluster4_cah_62, cluster5_cah_62),
    paste0("cluster", 1:5),
    2:6
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 5), paste0("cluster", 1:5))) +
  labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the GMM partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(
      cluster1_gmm_6, cluster2_gmm_6, cluster3_gmm_6,
      cluster4_gmm_6, cluster5_gmm_6, cluster6_gmm_6
    ),
    paste0("cluster", 1:6),
    2:7
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  labs(title = "Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled CAH cluster differs from their GMM cluster
# receive the sentinel label 7.
idx <- new_cluster3 != resBIC1$classification
clusters <- replace(new_cluster3, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster3)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# CAH map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_cah_62, cluster2_cah_62, cluster3_cah_62, cluster4_cah_62, cluster5_cah_62),
    paste0("cluster", 1:5),
    2:6
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 6), c(paste0("cluster", 1:5), "différence"))
  ) +
  labs(title = "Stations loading - 6 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

#### Avec 4 clusters

In [None]:
# First PCA plane coloured by the 4-cluster partition of each method,
# one panel per method, side by side.
grid.arrange(
  grobs = Map(
    function(partition, panel_title) {
      fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(partition), palette = c(2:5)) +
        ggtitle(panel_title)
    },
    list(reskmeans2$cluster, reshclust2, resBIC2$classification),
    c("K-means pour 4 clusters", "CAH pour 4 clusters", "GMM pour 4 clusters")
  ),
  ncol = 3
)

##### k-means Vs GMM

In [None]:
# Relabel the 4-cluster k-means partition with the GMM numbering, then show
# how the two partitions agree.
new_cluster1 <- matchClasses(
  classif1 = resBIC2$classification,
  classif2 = reskmeans2$cluster
)
conf_mat <- confusion_matrix(
  targets = resBIC2$classification,
  predictions = new_cluster1
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled k-means clusters. Only labels 2 and 4 are
# extracted and drawn -- presumably the only labels left after relabelling;
# verify against the confusion matrix.
cluster2_kmeans_42 <- which(new_cluster1 == 2)
cluster4_kmeans_42 <- which(new_cluster1 == 4)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster2_kmeans_42, cluster4_kmeans_42),
    paste0("cluster", c(2, 4)),
    c(3, 5)
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 2), paste0("cluster", c(2, 4)))) +
  labs(title = "Stations loading - 4 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the GMM partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_gmm_4, cluster2_gmm_4, cluster3_gmm_4, cluster4_gmm_4),
    paste0("cluster", 1:4),
    2:5
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 4), paste0("cluster", 1:4))) +
  labs(title = "Stations loading - 4 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their GMM cluster
# receive the sentinel label 7.
idx <- new_cluster1 != resBIC2$classification
clusters <- replace(new_cluster1, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster1)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster2_kmeans_42, cluster4_kmeans_42),
    paste0("cluster", c(2, 4)),
    c(3, 5)
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 3), c(paste0("cluster", c(2, 4)), "différence"))
  ) +
  labs(title = "Stations loading - 4 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### k-means Vs CAH

In [None]:
# Relabel the 4-cluster k-means partition with the CAH numbering, then show
# how the two partitions agree.
new_cluster2 <- matchClasses(
  classif1 = reshclust2,
  classif2 = reskmeans2$cluster
)
conf_mat <- confusion_matrix(
  targets = reshclust2,
  predictions = new_cluster2
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled k-means clusters. Only labels 1, 3 and 4
# are extracted and drawn -- presumably the only labels left after
# relabelling; verify against the confusion matrix.
cluster1_kmeans_43 <- which(new_cluster2 == 1)
cluster3_kmeans_43 <- which(new_cluster2 == 3)
cluster4_kmeans_43 <- which(new_cluster2 == 4)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_kmeans_43, cluster3_kmeans_43, cluster4_kmeans_43),
    paste0("cluster", c(1, 3, 4)),
    c(2, 4, 5)
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 3), paste0("cluster", c(1, 3, 4)))) +
  labs(title = "Stations loading - 4 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the CAH partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_cah_4, cluster2_cah_4, cluster3_cah_4, cluster4_cah_4),
    paste0("cluster", 1:4),
    2:5
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 4), paste0("cluster", 1:4))) +
  labs(title = "Stations loading - 4 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their CAH cluster
# receive the sentinel label 7.
idx <- new_cluster2 != reshclust2
clusters <- replace(new_cluster2, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster2)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_kmeans_43, cluster3_kmeans_43, cluster4_kmeans_43),
    paste0("cluster", c(1, 3, 4)),
    c(2, 4, 5)
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 4), c(paste0("cluster", c(1, 3, 4)), "différence"))
  ) +
  labs(title = "Stations loading - 4 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### CAH Vs GMM

In [None]:
# Relabel the 4-cluster CAH partition with the GMM numbering, then show how
# the two partitions agree.
new_cluster3 <- matchClasses(
  classif1 = resBIC2$classification,
  classif2 = reshclust2
)
conf_mat <- confusion_matrix(
  targets = resBIC2$classification,
  predictions = new_cluster3
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled CAH clusters. Only labels 1 and 2 are
# extracted and drawn -- presumably the only labels left after relabelling;
# verify against the confusion matrix.
cluster1_cah_42 <- which(new_cluster3 == 1)
cluster2_cah_42 <- which(new_cluster3 == 2)

# Map of the relabelled CAH partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_cah_42, cluster2_cah_42),
    paste0("cluster", 1:2),
    2:3
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 2), paste0("cluster", 1:2))) +
  labs(title = "Stations loading - 4 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the GMM partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_gmm_4, cluster2_gmm_4, cluster3_gmm_4, cluster4_gmm_4),
    paste0("cluster", 1:4),
    2:5
  ) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 4), paste0("cluster", 1:4))) +
  labs(title = "Stations loading - 4 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled CAH cluster differs from their GMM cluster
# receive the sentinel label 7.
idx <- new_cluster3 != resBIC2$classification
clusters <- replace(new_cluster3, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster3)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# CAH map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  Map(
    function(members, label, colour) {
      geom_point(data = coord[members, ], aes(shape = !!label), color = colour, size = 2)
    },
    list(cluster1_cah_42, cluster2_cah_42),
    paste0("cluster", 1:2),
    2:3
  ) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 3), c(paste0("cluster", 1:2), "différence"))
  ) +
  labs(title = "Stations loading - 4 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

#### Avec 3 clusters

In [None]:
# First PCA plane coloured by the 3-cluster partition of each method, one
# panel per method. The titles used to announce 6 clusters and the palette
# listed 6 colours although these partitions have 3 clusters; both now match
# the 3-cluster section (colours 2 to 4, as in the other 3-cluster figures).
grid.arrange(
    fviz_pca_ind(pca, axes=c(1,2), geom=c("point"), habillage=as.factor(reskmeans3$cluster), palette=c(2:4)) + ggtitle("K-means pour 3 clusters"),
    fviz_pca_ind(pca, axes=c(1,2), geom=c("point"), habillage=as.factor(reshclust3), palette=c(2:4)) + ggtitle("CAH pour 3 clusters"),
    fviz_pca_ind(pca, axes=c(1,2), geom=c("point"), habillage=as.factor(resBIC3$classification), palette=c(2:4)) + ggtitle("GMM pour 3 clusters"),
    ncol=3
)

##### k-means Vs GMM

In [None]:
# Relabel the 3-cluster k-means partition with the GMM numbering, then show
# how the two partitions agree.
new_cluster1 <- matchClasses(
  classif1 = resBIC3$classification,
  classif2 = reskmeans3$cluster
)
conf_mat <- confusion_matrix(
  targets = resBIC3$classification,
  predictions = new_cluster1
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of the relabelled k-means clusters. Only labels 2 and 3 are
# extracted and drawn -- presumably the only labels left after relabelling;
# verify against the confusion matrix.
cluster2_kmeans_32 <- which(new_cluster1 == 2)
cluster3_kmeans_32 <- which(new_cluster1 == 3)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster2_kmeans_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the 3-cluster GMM partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their GMM cluster
# receive the sentinel label 7. The reference must be the 3-cluster GMM
# (resBIC3) that new_cluster1 was matched against; the 4-cluster model
# (resBIC2) was used here by mistake, which inflated the disagreement count.
n <- length(new_cluster1)
clusters <- new_cluster1
idx <- (new_cluster1 != resBIC3$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster2_kmeans_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster2" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### k-means Vs CAH

In [None]:
# Relabel the 3-cluster k-means partition with the CAH numbering, then show
# how the two partitions agree.
new_cluster2 <- matchClasses(
  classif1 = reshclust3,
  classif2 = reskmeans3$cluster
)
conf_mat <- confusion_matrix(
  targets = reshclust3,
  predictions = new_cluster2
)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices of each k-means cluster after relabelling.
cluster1_kmeans_33 <- which(new_cluster2 == 1)
cluster2_kmeans_33 <- which(new_cluster2 == 2)
cluster3_kmeans_33 <- which(new_cluster2 == 3)

# Map of the relabelled k-means partition (cluster k in base colour k + 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_33, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_33, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_33, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the 3-cluster CAH partition, with the same colour code.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose relabelled k-means cluster differs from their CAH cluster
# receive the sentinel label 7. The reference must be the 3-cluster CAH
# (reshclust3) that new_cluster2 was matched against; the 4-cluster cut
# (reshclust2) was used here by mistake, which inflated the disagreement count.
n <- length(new_cluster2)
clusters <- new_cluster2
idx <- (new_cluster2 != reshclust3)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# k-means map again, with the disagreeing stations drawn on top in colour 1.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_33, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_33, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_33, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### CAH Vs GMM

In [None]:
# Relabel reshclust3 with matchClasses() using the resBIC3 classification as
# reference (presumably CAH vs GMM, per the section title), then plot the
# confusion matrix between the reference and the relabelled partition.
new_cluster3 <- matchClasses(resBIC3$classification, reshclust3)
conf_mat <- confusion_matrix(
  targets = resBIC3$classification,
  predictions = new_cluster3
)
plot_confusion_matrix(conf_mat)

In [None]:
# Draw the relabelled partition (new_cluster3) and the GMM clusters on the
# station map, then highlight the stations on which the two labelings differ.
options(repr.plot.width = 10, repr.plot.height = 7)

# Station indices per label of new_cluster3.
# NOTE(review): only labels 2 and 3 are extracted and drawn below although the
# plot titles announce 3 clusters -- confirm that label 1 is left out on purpose.
cluster2_cah_32 <- which(new_cluster3 == 2)
cluster3_cah_32 <- which(new_cluster3 == 3)

# Map of the relabelled partition (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster2_cah_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map of the GMM clusters (cluster*_gmm_3 are not defined in this cell)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster3 differs from the reference classification
# receive the sentinel label 7; their count and share are printed below.
# NOTE(review): new_cluster3 was built from resBIC3$classification (matchClasses
# call), yet it is compared with resBIC2$classification here -- confirm which
# fit is the intended reference.
n <- length(new_cluster3)
clusters <- new_cluster3
idx <- (new_cluster3 != resBIC2$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Same map as the first one, with the differing stations overdrawn in colour 1
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster2_cah_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster2" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

## Jeu de données complet

In [None]:
# Data-frame copy of the loading matrix, used as input by the clustering calls
# of this section.
loading_total <- as.data.frame(loading)

In [None]:
# Preview the first rows of the data frame
head(loading_total)

### Clustering avec la méthode $k$-means

In [None]:
# Figure size for the diagnostics of this cell
options(repr.plot.width = 9, repr.plot.height = 6)

# Within-cluster sum of squares of k-means as a function of the cluster count
fviz_nbclust(
  loading_total,
  FUNcluster = kmeans,
  method = "wss"
) +
  ggtitle("Within sum of square (WSS) according to the number of clusters")

In [None]:
# Silhouette criterion of k-means as a function of the cluster count
fviz_nbclust(
  loading_total,
  FUNcluster = kmeans,
  method = "silhouette"
) +
  ggtitle("Silhouette score according to the number of clusters")

In [None]:
# Silhouette plots of k-means for 2, 3 and 4 clusters
options(repr.plot.width = 15, repr.plot.height = 6)

# The pairwise distances do not depend on the number of clusters, so they are
# computed once and shared by the three silhouette computations.
dist_total <- dist(loading_total)

# With 2 clusters
reskmeans <- kmeans(loading_total, centers = 2)
sil <- silhouette(reskmeans$cluster, dist_total)
p1 <- fviz_silhouette(sil)

# With 3 clusters
reskmeans <- kmeans(loading_total, centers = 3)
sil <- silhouette(reskmeans$cluster, dist_total)
p2 <- fviz_silhouette(sil)

# With 4 clusters
reskmeans <- kmeans(loading_total, centers = 4)
sil <- silhouette(reskmeans$cluster, dist_total)
p3 <- fviz_silhouette(sil)

# Three panels on a two-column grid
grid.arrange(p1, p2, p3, ncol = 2)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)

# k-means partitions, fitted in this order: 6, then 3, then 2 clusters
reskmeans1 <- kmeans(loading_total, centers = 6)
reskmeans2 <- kmeans(loading_total, centers = 3)
reskmeans3 <- kmeans(loading_total, centers = 2)

# Number of stations per cluster: 6 clusters
cluster_counts1 <- table(reskmeans1$cluster)
barplot(
  cluster_counts1,
  main = "Répartition des stations pour 6 clusters - K-means",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = 2:7
)

# Number of stations per cluster: 3 clusters
cluster_counts2 <- table(reskmeans2$cluster)
barplot(
  cluster_counts2,
  main = "Répartition des stations pour 3 clusters - K-means",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = 2:4
)

# Number of stations per cluster: 2 clusters
cluster_counts3 <- table(reskmeans3$cluster)
barplot(
  cluster_counts3,
  main = "Répartition des stations pour 2 clusters - K-means",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = c(2, 4)
)

#### Avec 6 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 6 k-means clusters
cluster1_kmeans_6 <- which(reskmeans1$cluster == 1)
cluster2_kmeans_6 <- which(reskmeans1$cluster == 2)
cluster3_kmeans_6 <- which(reskmeans1$cluster == 3)
cluster4_kmeans_6 <- which(reskmeans1$cluster == 4)
cluster5_kmeans_6 <- which(reskmeans1$cluster == 5)
cluster6_kmeans_6 <- which(reskmeans1$cluster == 6)

# Column-wise mean loading of each cluster. drop = FALSE keeps the row subset a
# matrix even when a cluster holds a single station; without it the subset
# collapses to a vector and colMeans() stops with an error.
mean1_kmeans_6 <- colMeans(loading[cluster1_kmeans_6, , drop = FALSE])
mean2_kmeans_6 <- colMeans(loading[cluster2_kmeans_6, , drop = FALSE])
mean3_kmeans_6 <- colMeans(loading[cluster3_kmeans_6, , drop = FALSE])
mean4_kmeans_6 <- colMeans(loading[cluster4_kmeans_6, , drop = FALSE])
mean5_kmeans_6 <- colMeans(loading[cluster5_kmeans_6, , drop = FALSE])
mean6_kmeans_6 <- colMeans(loading[cluster6_kmeans_6, , drop = FALSE])

x <- 1:168

# Mean curves of the 6 clusters on a single figure
plot(
  x, mean1_kmeans_6,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 6 clusters - K-means",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_kmeans_6, col = 3)
lines(x, mean3_kmeans_6, col = 4)
lines(x, mean4_kmeans_6, col = 5)
lines(x, mean5_kmeans_6, col = 6)
lines(x, mean6_kmeans_6, col = 7)

# Legend
legend("topright", legend = paste("Cluster", 1:6), col = c(2:7), lty = 1)

# Clusters on PCA axes 1 and 2
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans1$cluster), palette = c(2:7)) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Clusters on the station map (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_kmeans_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_kmeans_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# Disabled guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per cluster, in cluster order; cluster k is drawn in colour k + 1
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(
    mean1_kmeans_6, mean2_kmeans_6, mean3_kmeans_6,
    mean4_kmeans_6, mean5_kmeans_6, mean6_kmeans_6
  ),
  1:6
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Left panel: stations drawn per k-means cluster (6 clusters)
map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_kmeans_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_kmeans_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_kmeans_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 6), paste0("cluster", 1:6))
  ) +
  labs(title = "Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Middle panel: stations of indices_bas versus stations of indices_haut
map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Right panel: stations of indices_autres versus stations of indices_seine
map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# The three maps side by side
grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)

In [None]:
# Contingency tables of coord$localisation and coord$bonus against the
# 6 k-means clusters
tbl1 <- table(coord$localisation, reskmeans1$cluster)
tbl2 <- table(coord$bonus, reskmeans1$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns nothing that grid.arrange()
# can lay out, so the two mosaics are placed side by side with par(mfrow) and
# the previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:7), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:7), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by cluster, by coord$couleurs2 and by coord$couleurs1
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans1$cluster), palette = c(2:7)) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 3 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 3 k-means clusters
cluster1_kmeans_3 <- which(reskmeans2$cluster == 1)
cluster2_kmeans_3 <- which(reskmeans2$cluster == 2)
cluster3_kmeans_3 <- which(reskmeans2$cluster == 3)

# Column-wise mean loading of each cluster. drop = FALSE keeps the row subset a
# matrix even when a cluster holds a single station; without it the subset
# collapses to a vector and colMeans() stops with an error.
mean1_kmeans_3 <- colMeans(loading[cluster1_kmeans_3, , drop = FALSE])
mean2_kmeans_3 <- colMeans(loading[cluster2_kmeans_3, , drop = FALSE])
mean3_kmeans_3 <- colMeans(loading[cluster3_kmeans_3, , drop = FALSE])

x <- 1:168

# Mean curves of the 3 clusters on a single figure
plot(
  x, mean1_kmeans_3,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 3 clusters - K-means",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_kmeans_3, col = 3)
lines(x, mean3_kmeans_3, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:3), col = c(2:4), lty = 1)

# Clusters on PCA axes 1 and 2
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans2$cluster), palette = c(2:4)) +
  ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Clusters on the station map (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
# Disabled guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

x <- 1:168

# One figure per cluster, in cluster order; cluster k is drawn in colour k + 1
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_kmeans_3, mean2_kmeans_3, mean3_kmeans_3),
  1:3
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Left panel: stations drawn per k-means cluster (3 clusters)
map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 3), paste0("cluster", 1:3))
  ) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Middle panel: stations of indices_bas versus stations of indices_haut
map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Right panel: stations of indices_autres versus stations of indices_seine
map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# The three maps side by side
grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)

In [None]:
# Contingency tables of coord$localisation and coord$bonus against the
# 3 k-means clusters
tbl1 <- table(coord$localisation, reskmeans2$cluster)
tbl2 <- table(coord$bonus, reskmeans2$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns nothing that grid.arrange()
# can lay out, so the two mosaics are placed side by side with par(mfrow) and
# the previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:4), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by cluster, by coord$couleurs2 and by coord$couleurs1
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans2$cluster), palette = c(2:4)) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 2 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 2 k-means clusters
cluster1_kmeans_2 <- which(reskmeans3$cluster == 1)
cluster2_kmeans_2 <- which(reskmeans3$cluster == 2)

# Column-wise mean loading of each cluster. drop = FALSE keeps the row subset a
# matrix even when a cluster holds a single station; without it the subset
# collapses to a vector and colMeans() stops with an error.
mean1_kmeans_2 <- colMeans(loading[cluster1_kmeans_2, , drop = FALSE])
mean2_kmeans_2 <- colMeans(loading[cluster2_kmeans_2, , drop = FALSE])

x <- 1:168

# Mean curves of the 2 clusters on a single figure
plot(
  x, mean1_kmeans_2,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 2 clusters - K-means",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_kmeans_2, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:2), col = c(2, 4), lty = 1)

# Clusters on PCA axes 1 and 2
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans3$cluster), palette = c(2, 4)) +
  ggtitle("Graphe des individus de l'ACP pour 2 clusters")

# Clusters on the station map (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_2, ], aes(shape = "cluster2"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One figure per cluster, in cluster order (colour 2, then colour 4)
invisible(Map(
  function(profile, k, colour) {
    plot(
      x, profile,
      type = "l", col = colour,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_kmeans_2, mean2_kmeans_2),
  1:2,
  c(2, 4)
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Left panel: stations drawn per k-means cluster (2 clusters)
map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_2, ], aes(shape = "cluster2"), color = 4, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 2), paste0("cluster", 1:2))
  ) +
  labs(title = "Stations loading - 2 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Middle panel: stations of indices_bas versus stations of indices_haut
map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Right panel: stations of indices_autres versus stations of indices_seine
map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# The three maps side by side
grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)

In [None]:
# Contingency tables of coord$localisation and coord$bonus against the
# 2 k-means clusters
tbl1 <- table(coord$localisation, reskmeans3$cluster)
tbl2 <- table(coord$bonus, reskmeans3$cluster)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns nothing that grid.arrange()
# can lay out, so the two mosaics are placed side by side with par(mfrow) and
# the previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2, 4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2, 4), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by cluster, by coord$couleurs2 and by coord$couleurs1
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reskmeans3$cluster), palette = c(2, 4)) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

### Clustering agglomérant avec la méthode CAH

In [None]:
# Square figures for the dendrograms of this cell
options(repr.plot.width = 10, repr.plot.height = 10)

# Euclidean distances between the rows of loading_total
d <- dist(loading_total, method = "euclidean")

# Hierarchical clustering under three linkage criteria
hclustsingle <- hclust(d, method = "single")
hclustcomplete <- hclust(d, method = "complete")
hclustaverage <- hclust(d, method = "average")

# Dendrogram of each linkage, labels hidden
fviz_dend(
  hclustsingle,
  show_labels = FALSE,
  main = "Dendrogram - Single linkage"
)
fviz_dend(
  hclustcomplete,
  show_labels = FALSE,
  main = "Dendrogram - Complete linkage"
)
fviz_dend(
  hclustaverage,
  show_labels = FALSE,
  main = "Dendrogram - Average linkage"
)

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)

# Number-of-clusters diagnostics with hcut as the clustering function
wss_hcut <- fviz_nbclust(loading_total, FUNcluster = hcut, method = "wss") +
  ggtitle("WSS according to nb of clusters")
sil_hcut <- fviz_nbclust(loading_total, FUNcluster = hcut, method = "silhouette") +
  ggtitle("silhouette according to nb of clusters")

# Both diagnostics side by side
grid.arrange(wss_hcut, sil_hcut, ncol = 2)

In [None]:
# Cut the complete-linkage tree into 6, 3 and 2 clusters
reshclust1 <- cutree(hclustcomplete, k = 6)
reshclust2 <- cutree(hclustcomplete, k = 3)
reshclust3 <- cutree(hclustcomplete, k = 2)

# Number of stations per cluster: 6 clusters
cluster_counts1 <- table(reshclust1)
barplot(
  cluster_counts1,
  main = "Répartition des stations pour 6 clusters - CAH",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = 2:7
)

# Number of stations per cluster: 3 clusters
cluster_counts2 <- table(reshclust2)
barplot(
  cluster_counts2,
  main = "Répartition des stations pour 3 clusters - CAH",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = 2:4
)

# Number of stations per cluster: 2 clusters
cluster_counts3 <- table(reshclust3)
barplot(
  cluster_counts3,
  main = "Répartition des stations pour 2 clusters - CAH",
  xlab = "Cluster",
  ylab = "Nombre de stations",
  col = c(2, 4)
)

#### Avec 6 clusters

In [None]:
# 6-cluster cut of the complete-linkage tree, and the matching dendrogram with
# one rectangle per cluster
reshclust1 <- cutree(hclustcomplete, k = 6)
fviz_dend(
  hclustcomplete,
  k = 6,
  show_labels = FALSE,
  rect = TRUE,
  palette = 2:7
)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 6 CAH clusters
cluster1_cah_6 <- which(reshclust1 == 1)
cluster2_cah_6 <- which(reshclust1 == 2)
cluster3_cah_6 <- which(reshclust1 == 3)
cluster4_cah_6 <- which(reshclust1 == 4)
cluster5_cah_6 <- which(reshclust1 == 5)
cluster6_cah_6 <- which(reshclust1 == 6)

# Column-wise mean loading of each cluster. drop = FALSE keeps the row subset a
# matrix even when a cluster holds a single station (cutting a dendrogram can
# isolate one station); without it the subset collapses to a vector and
# colMeans() stops with an error.
mean1_cah_6 <- colMeans(loading[cluster1_cah_6, , drop = FALSE])
mean2_cah_6 <- colMeans(loading[cluster2_cah_6, , drop = FALSE])
mean3_cah_6 <- colMeans(loading[cluster3_cah_6, , drop = FALSE])
mean4_cah_6 <- colMeans(loading[cluster4_cah_6, , drop = FALSE])
mean5_cah_6 <- colMeans(loading[cluster5_cah_6, , drop = FALSE])
mean6_cah_6 <- colMeans(loading[cluster6_cah_6, , drop = FALSE])

x <- 1:168

# Mean curves of the 6 clusters on a single figure
plot(
  x, mean1_cah_6,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 6 clusters - CAH",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_cah_6, col = 3)
lines(x, mean3_cah_6, col = 4)
lines(x, mean4_cah_6, col = 5)
lines(x, mean5_cah_6, col = 6)
lines(x, mean6_cah_6, col = 7)

# Legend
legend("topright", legend = paste("Cluster", 1:6), col = c(2:7), lty = 1)

# Clusters on PCA axes 1 and 2
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust1), palette = c(2:7)) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Clusters on the station map (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_cah_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_cah_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_cah_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# Disabled guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One figure per cluster, in cluster order; cluster k is drawn in colour k + 1
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(
    mean1_cah_6, mean2_cah_6, mean3_cah_6,
    mean4_cah_6, mean5_cah_6, mean6_cah_6
  ),
  1:6
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Left panel: stations drawn per CAH cluster (6 clusters)
map_clusters <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_cah_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_cah_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_cah_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(
    name = "Legend",
    values = setNames(rep(18, 6), paste0("cluster", 1:6))
  ) +
  labs(title = "Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Middle panel: stations of indices_bas versus stations of indices_haut
map_altitude <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_bas, ], aes(shape = "Vallée"), color = "green", size = 2) +
  geom_point(data = coord[indices_haut, ], aes(shape = "Colline"), color = "red", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Vallée" = 18, "Colline" = 18)) +
  labs(title = "Altitude des stations") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Right panel: stations of indices_autres versus stations of indices_seine
map_seine <- ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[indices_autres, ], aes(shape = "Autres"), color = "yellow", size = 2) +
  geom_point(data = coord[indices_seine, ], aes(shape = "Proche de la Seine"), color = "blue", size = 2) +
  scale_shape_manual(name = "Legend", values = c("Autres" = 18, "Proche de la Seine" = 18)) +
  labs(title = "Stations proches ou non de la Seine") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# The three maps side by side
grid.arrange(map_clusters, map_altitude, map_seine, ncol = 3)

In [None]:
# Contingency tables of coord$localisation and coord$bonus against the
# 6 CAH clusters
tbl1 <- table(coord$localisation, reshclust1)
tbl2 <- table(coord$bonus, reshclust1)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() draws with base graphics and returns nothing that grid.arrange()
# can lay out, so the two mosaics are placed side by side with par(mfrow) and
# the previous graphical parameters are restored afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:7), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:7), main = "Mosaicplot clusters-altitude")
par(old_par)

# PCA individuals coloured by cluster, by coord$couleurs2 and by coord$couleurs1
options(repr.plot.width = 20, repr.plot.height = 6)
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust1), palette = c(2:7)) + ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) + ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) + ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 3 clusters

In [None]:
# 3-cluster cut of the complete-linkage tree, and the matching dendrogram with
# one rectangle per cluster
reshclust2 <- cutree(hclustcomplete, k = 3)
fviz_dend(
  hclustcomplete,
  k = 3,
  show_labels = FALSE,
  rect = TRUE,
  palette = 2:4
)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations assigned to each of the 3 CAH clusters
cluster1_cah_3 <- which(reshclust2 == 1)
cluster2_cah_3 <- which(reshclust2 == 2)
cluster3_cah_3 <- which(reshclust2 == 3)

# Column-wise mean loading of each cluster. drop = FALSE keeps the row subset a
# matrix even when a cluster holds a single station (cutting a dendrogram can
# isolate one station); without it the subset collapses to a vector and
# colMeans() stops with an error.
mean1_cah_3 <- colMeans(loading[cluster1_cah_3, , drop = FALSE])
mean2_cah_3 <- colMeans(loading[cluster2_cah_3, , drop = FALSE])
mean3_cah_3 <- colMeans(loading[cluster3_cah_3, , drop = FALSE])

x <- 1:168

# Mean curves of the 3 clusters on a single figure
plot(
  x, mean1_cah_3,
  type = "l", col = 2,
  xlab = "Heures de la semaine", ylab = "Valeur mesurée",
  main = "Valeur mesurée pour 3 clusters - CAH",
  xlim = c(1, 168), ylim = c(0, 1)
)
lines(x, mean2_cah_3, col = 3)
lines(x, mean3_cah_3, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:3), col = c(2:4), lty = 1)

# Clusters on PCA axes 1 and 2
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust2), palette = c(2:4)) +
  ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Clusters on the station map (every legend key uses point shape 18)
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
# Disabled guide lines, kept for reference:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

x <- 1:168

# One figure per cluster, in cluster order; cluster k is drawn in colour k + 1
invisible(Map(
  function(profile, k) {
    plot(
      x, profile,
      type = "l", col = k + 1,
      xlab = "Heures de la semaine", ylab = "Valeur mesurée",
      main = paste("Cluster", k),
      xlim = c(1, 168), ylim = c(0, 1)
    )
  },
  list(mean1_cah_3, mean2_cah_3, mean3_cah_3),
  1:3
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the 3 CAH groups, the indices_bas /
# indices_haut split and the indices_autres / indices_seine split.
grid.arrange(
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "cluster1"), data = coord[cluster1_cah_3, ], colour = 2, size = 2) +
    geom_point(aes(shape = "cluster2"), data = coord[cluster2_cah_3, ], colour = 3, size = 2) +
    geom_point(aes(shape = "cluster3"), data = coord[cluster3_cah_3, ], colour = 4, size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 3), paste0("cluster", 1:3))) +
    ggtitle("Stations loading - 3 Clusters - CAH") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Vallée"), data = coord[indices_bas, ], colour = "green", size = 2) +
    geom_point(aes(shape = "Colline"), data = coord[indices_haut, ], colour = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Vallée", "Colline"))) +
    ggtitle("Altitude des stations") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Autres"), data = coord[indices_autres, ], colour = "yellow", size = 2) +
    geom_point(aes(shape = "Proche de la Seine"), data = coord[indices_seine, ], colour = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Autres", "Proche de la Seine"))) +
    ggtitle("Stations proches ou non de la Seine") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ncol = 3
)

In [None]:
# Contingency tables of the 3 CAH groups against two station attributes.
tbl1 <- table(coord$localisation, reshclust2)
tbl2 <- table(coord$bonus, reshclust2)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is base graphics: it draws immediately and returns NULL, so it
# cannot be passed to grid.arrange(), which only lays out grobs / ggplots.
# Put the two panels side by side with par(mfrow) and restore par afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:4), main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# First PCA plane coloured by CAH group, then by the two colour columns of `coord`.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust2), palette = c(2:4)) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 2 clusters

In [None]:
# Cut the `hclustcomplete` tree into 2 groups, then draw the dendrogram with
# those 2 groups framed and coloured.
reshclust3 <- cutree(hclustcomplete, k = 2)
fviz_dend(hclustcomplete, k = 2, rect = TRUE, show_labels = FALSE, palette = 2:3)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations in each of the 2 CAH groups.
cluster1_cah_2 <- which(reshclust3 == 1)
cluster2_cah_2 <- which(reshclust3 == 2)

# Column means of `loading` per group. `drop = FALSE` keeps the subset a
# matrix, so colMeans() also works for a group reduced to a single station.
mean1_cah_2 <- colMeans(loading[cluster1_cah_2, , drop = FALSE])
mean2_cah_2 <- colMeans(loading[cluster2_cah_2, , drop = FALSE])

x <- 1:168

# Overlay the two mean profiles on one plot.
plot(x, mean1_cah_2, type = "l", col = 2,
     xlab = "Heures de la semaine", ylab = "Valeur mesurée",
     main = "Valeur mesurée pour 2 clusters - CAH",
     xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_cah_2, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:2), col = c(2, 4), lty = 1)

# Same groups on the first two PCA axes.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust3), palette = c(2, 4)) +
  ggtitle("Graphe des individus de l'ACP pour 2 clusters")

# Same groups on the longitude / latitude map of the stations.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_2, ], aes(shape = "cluster2"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One separate plot per CAH group (2-group cut), in colours 2 and 4.
invisible(Map(
  function(profile, k, colour) {
    plot(x, profile, type = "l", col = colour,
         xlab = "Heures de la semaine", ylab = "Valeur mesurée",
         main = paste("Cluster", k),
         xlim = c(1, 168), ylim = c(0, 1))
  },
  list(mean1_cah_2, mean2_cah_2),
  1:2,
  c(2, 4)
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the 2 CAH groups, the indices_bas /
# indices_haut split and the indices_autres / indices_seine split.
grid.arrange(
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "cluster1"), data = coord[cluster1_cah_2, ], colour = 2, size = 2) +
    geom_point(aes(shape = "cluster2"), data = coord[cluster2_cah_2, ], colour = 4, size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 2), paste0("cluster", 1:2))) +
    ggtitle("Stations loading - 2 Clusters - CAH") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Vallée"), data = coord[indices_bas, ], colour = "green", size = 2) +
    geom_point(aes(shape = "Colline"), data = coord[indices_haut, ], colour = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Vallée", "Colline"))) +
    ggtitle("Altitude des stations") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Autres"), data = coord[indices_autres, ], colour = "yellow", size = 2) +
    geom_point(aes(shape = "Proche de la Seine"), data = coord[indices_seine, ], colour = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Autres", "Proche de la Seine"))) +
    ggtitle("Stations proches ou non de la Seine") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ncol = 3
)

In [None]:
# Contingency tables of the 2 CAH groups against two station attributes.
tbl1 <- table(coord$localisation, reshclust3)
tbl2 <- table(coord$bonus, reshclust3)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is base graphics: it draws immediately and returns NULL, so it
# cannot be passed to grid.arrange(), which only lays out grobs / ggplots.
# Put the two panels side by side with par(mfrow) and restore par afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2, 4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2, 4), main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# First PCA plane coloured by CAH group, then by the two colour columns of `coord`.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(reshclust3), palette = c(2, 4)) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

### Clustering avec les modèles de mélange gaussien (GMM)

In [None]:
# BIC of every candidate mclust model for 2 to 20 components.
resBICall <- mclustBIC(loading_total, G = 2:20)
summary(resBICall)

# Fit the model selected by BIC. Passing the table above as `x` lets Mclust()
# reuse the BIC values already computed instead of fitting the whole grid of
# models a second time; `resBICall` ends up holding the Mclust fit, as before.
resBICall <- Mclust(loading_total, G = 2:20, x = resBICall)
summary(resBICall)

fviz_mclust(resBICall, what = "BIC")

In [None]:
# Gaussian mixture fits with a fixed number of components and a fixed
# covariance model: 6 and 3 components ("EEI"), 2 components ("EEE").
resBIC1 <- Mclust(loading_total, G = 6, modelNames = "EEI")
resBIC2 <- Mclust(loading_total, G = 3, modelNames = "EEI")
resBIC3 <- Mclust(loading_total, G = 2, modelNames = "EEE")

# Number of stations assigned to each component.
cluster_counts1 <- table(resBIC1$classification)
cluster_counts2 <- table(resBIC2$classification)
cluster_counts3 <- table(resBIC3$classification)

barplot(cluster_counts1, col = 2:7, xlab = "Cluster", ylab = "Nombre de stations",
        main = "Répartition des stations pour 6 clusters - GMM")
barplot(cluster_counts2, col = 2:4, xlab = "Cluster", ylab = "Nombre de stations",
        main = "Répartition des stations pour 3 clusters - GMM")
barplot(cluster_counts3, col = c(2, 4), xlab = "Cluster", ylab = "Nombre de stations",
        main = "Répartition des stations pour 2 clusters - GMM")

Ici, nous n'utilisons que le critère BIC et non le critère ICL, car les deux critères donnent des résultats identiques.

#### Avec 6 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations in each of the 6 GMM components.
cluster1_gmm_6 <- which(resBIC1$classification == 1)
cluster2_gmm_6 <- which(resBIC1$classification == 2)
cluster3_gmm_6 <- which(resBIC1$classification == 3)
cluster4_gmm_6 <- which(resBIC1$classification == 4)
cluster5_gmm_6 <- which(resBIC1$classification == 5)
cluster6_gmm_6 <- which(resBIC1$classification == 6)

# Column means of `loading` per component. `drop = FALSE` keeps the subset a
# matrix, so colMeans() also works for a component reduced to a single station.
mean1_gmm_6 <- colMeans(loading[cluster1_gmm_6, , drop = FALSE])
mean2_gmm_6 <- colMeans(loading[cluster2_gmm_6, , drop = FALSE])
mean3_gmm_6 <- colMeans(loading[cluster3_gmm_6, , drop = FALSE])
mean4_gmm_6 <- colMeans(loading[cluster4_gmm_6, , drop = FALSE])
mean5_gmm_6 <- colMeans(loading[cluster5_gmm_6, , drop = FALSE])
mean6_gmm_6 <- colMeans(loading[cluster6_gmm_6, , drop = FALSE])

x <- 1:168

# Overlay the six mean profiles on one plot.
plot(x, mean1_gmm_6, type = "l", col = 2,
     xlab = "Heures de la semaine", ylab = "Valeur mesurée",
     main = "Valeur mesurée pour 6 clusters - GMM",
     xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_gmm_6, col = 3)
lines(x, mean3_gmm_6, col = 4)
lines(x, mean4_gmm_6, col = 5)
lines(x, mean5_gmm_6, col = 6)
lines(x, mean6_gmm_6, col = 7)

# Legend
legend("topright", legend = paste("Cluster", 1:6), col = c(2:7), lty = 1)

# Same components on the first two PCA axes.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC1$classification), palette = c(2:7)) +
  ggtitle("Graphe des individus de l'ACP pour 6 clusters")

# Same components on the longitude / latitude map of the stations.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_6, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_6, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_6, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster4_gmm_6, ], aes(shape = "cluster4"), color = 5, size = 2) +
  geom_point(data = coord[cluster5_gmm_6, ], aes(shape = "cluster5"), color = 6, size = 2) +
  geom_point(data = coord[cluster6_gmm_6, ], aes(shape = "cluster6"), color = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "cluster4" = 18, "cluster5" = 18, "cluster6" = 18)) +
  labs(title = "Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

# One separate plot per GMM component (6-component fit); component k is drawn in colour k + 1.
invisible(Map(
  function(profile, k) {
    plot(x, profile, type = "l", col = k + 1,
         xlab = "Heures de la semaine", ylab = "Valeur mesurée",
         main = paste("Cluster", k),
         xlim = c(1, 168), ylim = c(0, 1))
  },
  list(mean1_gmm_6, mean2_gmm_6, mean3_gmm_6, mean4_gmm_6, mean5_gmm_6, mean6_gmm_6),
  1:6
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the 6 GMM components, the indices_bas /
# indices_haut split and the indices_autres / indices_seine split.
grid.arrange(
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "cluster1"), data = coord[cluster1_gmm_6, ], colour = 2, size = 2) +
    geom_point(aes(shape = "cluster2"), data = coord[cluster2_gmm_6, ], colour = 3, size = 2) +
    geom_point(aes(shape = "cluster3"), data = coord[cluster3_gmm_6, ], colour = 4, size = 2) +
    geom_point(aes(shape = "cluster4"), data = coord[cluster4_gmm_6, ], colour = 5, size = 2) +
    geom_point(aes(shape = "cluster5"), data = coord[cluster5_gmm_6, ], colour = 6, size = 2) +
    geom_point(aes(shape = "cluster6"), data = coord[cluster6_gmm_6, ], colour = 7, size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
    ggtitle("Stations loading - 6 Clusters - Méthode GMM") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Vallée"), data = coord[indices_bas, ], colour = "green", size = 2) +
    geom_point(aes(shape = "Colline"), data = coord[indices_haut, ], colour = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Vallée", "Colline"))) +
    ggtitle("Altitude des stations") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Autres"), data = coord[indices_autres, ], colour = "yellow", size = 2) +
    geom_point(aes(shape = "Proche de la Seine"), data = coord[indices_seine, ], colour = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Autres", "Proche de la Seine"))) +
    ggtitle("Stations proches ou non de la Seine") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ncol = 3
)

In [None]:
# Contingency tables of the 6 GMM components against two station attributes.
tbl1 <- table(coord$localisation, resBIC1$classification)
tbl2 <- table(coord$bonus, resBIC1$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is base graphics: it draws immediately and returns NULL, so it
# cannot be passed to grid.arrange(), which only lays out grobs / ggplots.
# Put the two panels side by side with par(mfrow) and restore par afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:7), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:7), main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# First PCA plane coloured by GMM component, then by the two colour columns of `coord`.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC1$classification), palette = c(2:7)) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 3 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations in each of the 3 GMM components.
cluster1_gmm_3 <- which(resBIC2$classification == 1)
cluster2_gmm_3 <- which(resBIC2$classification == 2)
cluster3_gmm_3 <- which(resBIC2$classification == 3)

# Column means of `loading` per component. `drop = FALSE` keeps the subset a
# matrix, so colMeans() also works for a component reduced to a single station.
mean1_gmm_3 <- colMeans(loading[cluster1_gmm_3, , drop = FALSE])
mean2_gmm_3 <- colMeans(loading[cluster2_gmm_3, , drop = FALSE])
mean3_gmm_3 <- colMeans(loading[cluster3_gmm_3, , drop = FALSE])

x <- 1:168

# Overlay the three mean profiles on one plot.
plot(x, mean1_gmm_3, type = "l", col = 2,
     xlab = "Heures de la semaine", ylab = "Valeur mesurée",
     main = "Valeur mesurée pour 3 clusters - GMM",
     xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_gmm_3, col = 3)
lines(x, mean3_gmm_3, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:3), col = c(2:4), lty = 1)

# Same components on the first two PCA axes.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC2$classification), palette = c(2:4)) +
  ggtitle("Graphe des individus de l'ACP pour 3 clusters")

# Same components on the longitude / latitude map of the stations.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
#timeTick = 1 + 24* (0:6)
#abline(v=timeTick, col="#048B9A") + abline(h=0, col="#A10684") + abline(v=timeTick+11, col='pink')

x <- 1:168

# One separate plot per GMM component (3-component fit); component k is drawn in colour k + 1.
invisible(Map(
  function(profile, k) {
    plot(x, profile, type = "l", col = k + 1,
         xlab = "Heures de la semaine", ylab = "Valeur mesurée",
         main = paste("Cluster", k),
         xlim = c(1, 168), ylim = c(0, 1))
  },
  list(mean1_gmm_3, mean2_gmm_3, mean3_gmm_3),
  1:3
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the 3 GMM components, the indices_bas /
# indices_haut split and the indices_autres / indices_seine split.
grid.arrange(
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "cluster1"), data = coord[cluster1_gmm_3, ], colour = 2, size = 2) +
    geom_point(aes(shape = "cluster2"), data = coord[cluster2_gmm_3, ], colour = 3, size = 2) +
    geom_point(aes(shape = "cluster3"), data = coord[cluster3_gmm_3, ], colour = 4, size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 3), paste0("cluster", 1:3))) +
    ggtitle("Stations loading - 3 Clusters - Méthode GMM") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Vallée"), data = coord[indices_bas, ], colour = "green", size = 2) +
    geom_point(aes(shape = "Colline"), data = coord[indices_haut, ], colour = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Vallée", "Colline"))) +
    ggtitle("Altitude des stations") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Autres"), data = coord[indices_autres, ], colour = "yellow", size = 2) +
    geom_point(aes(shape = "Proche de la Seine"), data = coord[indices_seine, ], colour = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Autres", "Proche de la Seine"))) +
    ggtitle("Stations proches ou non de la Seine") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ncol = 3
)

In [None]:
# Contingency tables of the 3 GMM components against two station attributes.
tbl1 <- table(coord$localisation, resBIC2$classification)
tbl2 <- table(coord$bonus, resBIC2$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is base graphics: it draws immediately and returns NULL, so it
# cannot be passed to grid.arrange(), which only lays out grobs / ggplots.
# Put the two panels side by side with par(mfrow) and restore par afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2:4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2:4), main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# First PCA plane coloured by GMM component, then by the two colour columns of `coord`.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC2$classification), palette = c(2:4)) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

#### Avec 2 clusters

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations in each of the 2 GMM components.
cluster1_gmm_2 <- which(resBIC3$classification == 1)
cluster2_gmm_2 <- which(resBIC3$classification == 2)

# Column means of `loading` per component. `drop = FALSE` keeps the subset a
# matrix, so colMeans() also works for a component reduced to a single station.
mean1_gmm_2 <- colMeans(loading[cluster1_gmm_2, , drop = FALSE])
mean2_gmm_2 <- colMeans(loading[cluster2_gmm_2, , drop = FALSE])

x <- 1:168

# Overlay the two mean profiles on one plot.
plot(x, mean1_gmm_2, type = "l", col = 2,
     xlab = "Heures de la semaine", ylab = "Valeur mesurée",
     main = "Valeur mesurée pour 2 clusters - GMM",
     xlim = c(1, 168), ylim = c(0, 1))
lines(x, mean2_gmm_2, col = 4)

# Legend
legend("topright", legend = paste("Cluster", 1:2), col = c(2, 4), lty = 1)

# Same components on the first two PCA axes.
fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC3$classification), palette = c(2, 4)) +
  ggtitle("Graphe des individus de l'ACP pour 2 clusters")

# Same components on the longitude / latitude map of the stations.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_2, ], aes(shape = "cluster2"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

In [None]:
x <- 1:168

# One separate plot per GMM component (2-component fit), in colours 2 and 4.
invisible(Map(
  function(profile, k, colour) {
    plot(x, profile, type = "l", col = colour,
         xlab = "Heures de la semaine", ylab = "Valeur mesurée",
         main = paste("Cluster", k),
         xlim = c(1, 168), ylim = c(0, 1))
  },
  list(mean1_gmm_2, mean2_gmm_2),
  1:2,
  c(2, 4)
))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# Three station maps side by side: the 2 GMM components, the indices_bas /
# indices_haut split and the indices_autres / indices_seine split.
grid.arrange(
  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "cluster1"), data = coord[cluster1_gmm_2, ], colour = 2, size = 2) +
    geom_point(aes(shape = "cluster2"), data = coord[cluster2_gmm_2, ], colour = 4, size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(rep(18, 2), paste0("cluster", 1:2))) +
    ggtitle("Stations loading - 2 Clusters - Méthode GMM") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Vallée"), data = coord[indices_bas, ], colour = "green", size = 2) +
    geom_point(aes(shape = "Colline"), data = coord[indices_haut, ], colour = "red", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Vallée", "Colline"))) +
    ggtitle("Altitude des stations") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ggplot(coord, aes(x = longitude, y = latitude)) +
    geom_point(aes(shape = "Autres"), data = coord[indices_autres, ], colour = "yellow", size = 2) +
    geom_point(aes(shape = "Proche de la Seine"), data = coord[indices_seine, ], colour = "blue", size = 2) +
    scale_shape_manual(name = "Legend", values = setNames(c(18, 18), c("Autres", "Proche de la Seine"))) +
    ggtitle("Stations proches ou non de la Seine") +
    theme(plot.title = element_text(hjust = 0.5, size = 15)),

  ncol = 3
)

In [None]:
# Contingency tables of the 2 GMM components against two station attributes.
tbl1 <- table(coord$localisation, resBIC3$classification)
tbl2 <- table(coord$bonus, resBIC3$classification)

options(repr.plot.width = 10, repr.plot.height = 6)
# mosaicplot() is base graphics: it draws immediately and returns NULL, so it
# cannot be passed to grid.arrange(), which only lays out grobs / ggplots.
# Put the two panels side by side with par(mfrow) and restore par afterwards.
old_par <- par(mfrow = c(1, 2))
mosaicplot(tbl1, color = c(2, 4), main = "Mosaicplot clusters-Seine")
mosaicplot(tbl2, color = c(2, 4), main = "Mosaicplot clusters-altitude")
par(old_par)

options(repr.plot.width = 20, repr.plot.height = 6)
# First PCA plane coloured by GMM component, then by the two colour columns of `coord`.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"), habillage = as.factor(resBIC3$classification), palette = c(2, 4)) +
    ggtitle("Individuals factor map - PCA "),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs2, palette = c("green", "red")) +
    ggtitle("Individuals factor map - PCA"),
  fviz_pca_ind(pca, geom = c("point"), col.ind = coord$couleurs1, palette = c("yellow", "blue")) +
    ggtitle("Individuals factor map - PCA "),
  ncol = 3
)

### Comparaison des méthodes

In [None]:
#' Relabel one partition so that it lines up with a reference partition.
#'
#' Every label of `classif2` is replaced by the label of `classif1` it shares
#' the most observations with (largest count in its column of the contingency
#' table). Ties go to the first reference label in table order. Several labels
#' of `classif2` may be sent to the same reference label.
#'
#' The two partitions do not need the same number of distinct labels, and the
#' labels do not need to be the integers 1..K.
#'
#' @param classif1 Reference cluster labels, one per observation.
#' @param classif2 Cluster labels to relabel, same length as `classif1`.
#' @return A vector as long as `classif2`, holding labels of `classif1` and
#'   carrying the names of `classif2` (if any).
matchClasses <- function(classif1, classif2) {
  stopifnot(length(classif1) == length(classif2))
  if (length(classif2) == 0) {
    return(classif2)
  }

  cm <- table(classif1, classif2)

  # Reference labels, in the same order as the rows of `cm`.
  row_values <- if (is.factor(classif1)) {
    levels(classif1)
  } else {
    sort(unique(classif1))
  }

  # For each column (label of classif2), the row with the largest count.
  # which.max() keeps the first maximum, like a strict `<` scan over the rows.
  best_row <- vapply(
    seq_len(ncol(cm)),
    function(j) unname(which.max(cm[, j])),
    integer(1)
  )

  # Look each observation's column up by label rather than by position, so
  # that labels other than 1..K are handled too.
  col_of_obs <- match(as.character(classif2), colnames(cm))
  clusters <- row_values[best_row[col_of_obs]]
  names(clusters) <- names(classif2)
  clusters
}

#### Avec 6 clusters

In [None]:
# First PCA plane coloured by the 6-cluster result of each method, side by side.
grid.arrange(
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point", palette = 2:7, habillage = as.factor(reskmeans1$cluster)) +
    ggtitle("K-means pour 6 clusters"),
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point", palette = 2:7, habillage = as.factor(reshclust1)) +
    ggtitle("CAH pour 6 clusters"),
  fviz_pca_ind(pca, axes = c(1, 2), geom = "point", palette = 2:7, habillage = as.factor(resBIC1$classification)) +
    ggtitle("GMM pour 6 clusters"),
  ncol = 3
)

##### k-means Vs GMM

In [None]:
# Relabel the K-means clusters with the GMM numbering, then cross-tabulate
# the two partitions (GMM as target, relabelled K-means as prediction).
new_cluster1 <- matchClasses(resBIC1$classification, reskmeans1$cluster)
conf_mat <- confusion_matrix(predictions = new_cluster1, targets = resBIC1$classification)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations per K-means cluster, after relabelling to the
# GMM numbering (new_cluster1).
cluster1_kmeans_62 <- which(new_cluster1 == 1)
cluster2_kmeans_62 <- which(new_cluster1 == 2)
cluster3_kmeans_62 <- which(new_cluster1 == 3)
cluster4_kmeans_62 <- which(new_cluster1 == 4)
cluster5_kmeans_62 <- which(new_cluster1 == 5)
cluster6_kmeans_62 <- which(new_cluster1 == 6)

# Map of the relabelled K-means clusters.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_kmeans_62, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_kmeans_62, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_kmeans_62, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_kmeans_62, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_kmeans_62, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_kmeans_62, ], colour = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  ggtitle("Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# Map of the GMM components.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_gmm_6, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_gmm_6, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_gmm_6, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_gmm_6, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_gmm_6, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_gmm_6, ], colour = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  ggtitle("Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))


# Stations whose relabelled K-means cluster differs from their GMM component
# are moved to an extra pseudo-cluster numbered 7.
idx <- (new_cluster1 != resBIC1$classification)
clusters <- replace(new_cluster1, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster1)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# K-means map again, with the disagreeing stations drawn last (colour 1) on top.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_kmeans_62, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_kmeans_62, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_kmeans_62, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_kmeans_62, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_kmeans_62, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_kmeans_62, ], colour = 7, size = 2) +
  geom_point(aes(shape = "différence"), data = coord[cluster7, ], colour = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 7), c(paste0("cluster", 1:6), "différence"))) +
  ggtitle("Stations loading - 6 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

##### k-means Vs CAH

In [None]:
# Relabel the K-means clusters with the CAH numbering, then cross-tabulate
# the two partitions (CAH as target, relabelled K-means as prediction).
new_cluster2 <- matchClasses(reshclust1, reskmeans1$cluster)
conf_mat <- confusion_matrix(predictions = new_cluster2, targets = reshclust1)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations per K-means cluster, after relabelling to the
# CAH numbering (new_cluster2).
# NOTE(review): label 6 is not extracted or drawn here — presumably no K-means
# cluster is matched to CAH group 6 on this data; confirm after any re-run.
cluster1_kmeans_63 <- which(new_cluster2 == 1)
cluster2_kmeans_63 <- which(new_cluster2 == 2)
cluster3_kmeans_63 <- which(new_cluster2 == 3)
cluster4_kmeans_63 <- which(new_cluster2 == 4)
cluster5_kmeans_63 <- which(new_cluster2 == 5)

# Map of the relabelled K-means clusters.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_kmeans_63, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_kmeans_63, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_kmeans_63, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_kmeans_63, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_kmeans_63, ], colour = 6, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 5), paste0("cluster", 1:5))) +
  ggtitle("Stations loading - 6 Clusters - Méthode K-means") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# Map of the CAH groups.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_cah_6, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_cah_6, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_cah_6, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_cah_6, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_cah_6, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_cah_6, ], colour = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  ggtitle("Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))


# Stations whose relabelled K-means cluster differs from their CAH group are
# moved to an extra pseudo-cluster numbered 7.
idx <- (new_cluster2 != reshclust1)
clusters <- replace(new_cluster2, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster2)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# K-means map again, with the disagreeing stations drawn last (colour 1) on top.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_kmeans_63, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_kmeans_63, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_kmeans_63, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_kmeans_63, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_kmeans_63, ], colour = 6, size = 2) +
  geom_point(aes(shape = "différence"), data = coord[cluster7, ], colour = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), c(paste0("cluster", 1:5), "différence"))) +
  ggtitle("Stations loading - 6 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

##### CAH Vs GMM

In [None]:
# Relabel the CAH groups with the GMM numbering, then cross-tabulate the two
# partitions (GMM as target, relabelled CAH as prediction).
new_cluster3 <- matchClasses(resBIC1$classification, reshclust1)
conf_mat <- confusion_matrix(predictions = new_cluster3, targets = resBIC1$classification)
plot_confusion_matrix(conf_mat)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations per CAH group, after relabelling to the GMM
# numbering (new_cluster3).
# NOTE(review): label 3 is not extracted or drawn here — presumably no CAH
# group is matched to GMM component 3 on this data; confirm after any re-run.
cluster1_cah_62 <- which(new_cluster3 == 1)
cluster2_cah_62 <- which(new_cluster3 == 2)
cluster4_cah_62 <- which(new_cluster3 == 4)
cluster5_cah_62 <- which(new_cluster3 == 5)
cluster6_cah_62 <- which(new_cluster3 == 6)

# Map of the relabelled CAH groups.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_cah_62, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_cah_62, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_cah_62, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_cah_62, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_cah_62, ], colour = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 5), paste0("cluster", c(1, 2, 4, 5, 6)))) +
  ggtitle("Stations loading - 6 Clusters - Méthode CAH") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# Map of the GMM components.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_gmm_6, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_gmm_6, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster3"), data = coord[cluster3_gmm_6, ], colour = 4, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_gmm_6, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_gmm_6, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_gmm_6, ], colour = 7, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), paste0("cluster", 1:6))) +
  ggtitle("Stations loading - 6 Clusters - Méthode GMM") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))


# Stations whose relabelled CAH group differs from their GMM component are
# moved to an extra pseudo-cluster numbered 7.
idx <- (new_cluster3 != resBIC1$classification)
clusters <- replace(new_cluster3, idx, 7)
cluster7 <- which(clusters == 7)
n <- length(new_cluster3)
nb_diff <- length(cluster7)
ratio <- nb_diff / n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# CAH map again, with the disagreeing stations drawn last (colour 1) on top.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(aes(shape = "cluster1"), data = coord[cluster1_cah_62, ], colour = 2, size = 2) +
  geom_point(aes(shape = "cluster2"), data = coord[cluster2_cah_62, ], colour = 3, size = 2) +
  geom_point(aes(shape = "cluster4"), data = coord[cluster4_cah_62, ], colour = 5, size = 2) +
  geom_point(aes(shape = "cluster5"), data = coord[cluster5_cah_62, ], colour = 6, size = 2) +
  geom_point(aes(shape = "cluster6"), data = coord[cluster6_cah_62, ], colour = 7, size = 2) +
  geom_point(aes(shape = "différence"), data = coord[cluster7, ], colour = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = setNames(rep(18, 6), c(paste0("cluster", c(1, 2, 4, 5, 6)), "différence"))) +
  ggtitle("Stations loading - 6 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

#### Avec 3 clusters

In [None]:
# First PCA plane (dimensions 1 and 2) coloured by the partition of each
# method, one panel per method, for the 3-cluster case.
# The panel titles state 3 clusters, matching this section and the partitions
# plotted (reskmeans2, reshclust2, resBIC2); they previously read "6 clusters".
grid.arrange(
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(reskmeans2$cluster), palette = c(2:7)) +
        ggtitle("K-means pour 3 clusters"),
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(reshclust2), palette = c(2:7)) +
        ggtitle("CAH pour 3 clusters"),
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(resBIC2$classification), palette = c(2:7)) +
        ggtitle("GMM pour 3 clusters"),
    ncol = 3
)

##### k-means Vs GMM

In [None]:
# K-means vs GMM, 3 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# k-means partition to line up with the GMM classes -- TODO confirm.
new_cluster1 <- matchClasses(
  resBIC2$classification,
  reskmeans2$cluster
)

# GMM classes are passed as targets, the matched k-means labels as predictions.
conf_mat <- confusion_matrix(
  targets = resBIC2$classification,
  predictions = new_cluster1
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 3-cluster case: k-means labels held in new_cluster1, the GMM
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster1.
# NOTE(review): no index vector is built for label 2, so a station with
# new_cluster1 == 2 would not be drawn below -- presumably no station carries
# that label in the authors' run; confirm.
cluster1_kmeans_32 <- which(new_cluster1 == 1)
cluster3_kmeans_32 <- which(new_cluster1 == 3)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_32, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster3_kmeans_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: GMM partition; the cluster*_gmm_3 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster1 differs from the GMM classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster1)
clusters <- new_cluster1
idx <- (new_cluster1 != resBIC2$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_32, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster3_kmeans_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### k-means Vs CAH

In [None]:
# K-means vs CAH, 3 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# k-means partition to line up with the CAH classes -- TODO confirm.
new_cluster2 <- matchClasses(
  reshclust2,
  reskmeans2$cluster
)

# CAH classes are passed as targets, the matched k-means labels as predictions.
conf_mat <- confusion_matrix(
  targets = reshclust2,
  predictions = new_cluster2
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 3-cluster case: k-means labels held in new_cluster2, the CAH
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster2.
cluster1_kmeans_33 <- which(new_cluster2 == 1)
cluster2_kmeans_33 <- which(new_cluster2 == 2)
cluster3_kmeans_33 <- which(new_cluster2 == 3)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_33, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_33, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_33, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: CAH partition; the cluster*_cah_3 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster2 differs from the CAH classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster2)
clusters <- new_cluster2
idx <- (new_cluster2 != reshclust2)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_33, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_33, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_kmeans_33, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### CAH Vs GMM

In [None]:
# CAH vs GMM, 3 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# CAH partition (reshclust2) to line up with the GMM classes -- TODO confirm.
new_cluster3 <- matchClasses(
  resBIC2$classification,
  reshclust2
)

# GMM classes are passed as targets, the matched CAH labels as predictions.
conf_mat <- confusion_matrix(
  targets = resBIC2$classification,
  predictions = new_cluster3
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 3-cluster case: CAH labels held in new_cluster3, the GMM
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster3.
cluster1_cah_32 <- which(new_cluster3 == 1)
cluster2_cah_32 <- which(new_cluster3 == 2)
cluster3_cah_32 <- which(new_cluster3 == 3)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_32, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: GMM partition; the cluster*_gmm_3 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_3, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_3, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_gmm_3, ], aes(shape = "cluster3"), color = 4, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster3 differs from the GMM classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster3)
clusters <- new_cluster3
idx <- (new_cluster3 != resBIC2$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_32, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_32, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster3_cah_32, ], aes(shape = "cluster3"), color = 4, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "cluster3" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 3 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

#### Avec 2 clusters

In [None]:
# First PCA plane (dimensions 1 and 2) coloured by the partition of each
# method, one panel per method, for the 2-cluster case.
# The panel titles state 2 clusters, matching this section and the partitions
# plotted (reskmeans3, reshclust3, resBIC3); they previously read "6 clusters".
grid.arrange(
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(reskmeans3$cluster), palette = c(2:7)) +
        ggtitle("K-means pour 2 clusters"),
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(reshclust3), palette = c(2:7)) +
        ggtitle("CAH pour 2 clusters"),
    fviz_pca_ind(pca, axes = c(1, 2), geom = c("point"),
                 habillage = as.factor(resBIC3$classification), palette = c(2:7)) +
        ggtitle("GMM pour 2 clusters"),
    ncol = 3
)

##### k-means Vs GMM

In [None]:
# K-means vs GMM, 2 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# k-means partition to line up with the GMM classes -- TODO confirm.
new_cluster1 <- matchClasses(
  resBIC3$classification,
  reskmeans3$cluster
)

# GMM classes are passed as targets, the matched k-means labels as predictions.
conf_mat <- confusion_matrix(
  targets = resBIC3$classification,
  predictions = new_cluster1
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 2-cluster case: k-means labels held in new_cluster1, the GMM
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster1.
cluster1_kmeans_22 <- which(new_cluster1 == 1)
cluster2_kmeans_22 <- which(new_cluster1 == 2)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_22, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_22, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: GMM partition; the cluster*_gmm_2 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_2, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster1 differs from the GMM classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster1)
clusters <- new_cluster1
idx <- (new_cluster1 != resBIC3$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_22, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_22, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Comparaison K-means vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### k-means Vs CAH

In [None]:
# K-means vs CAH, 2 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# k-means partition to line up with the CAH classes -- TODO confirm.
new_cluster2 <- matchClasses(
  reshclust3,
  reskmeans3$cluster
)

# CAH classes are passed as targets, the matched k-means labels as predictions.
conf_mat <- confusion_matrix(
  targets = reshclust3,
  predictions = new_cluster2
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 2-cluster case: k-means labels held in new_cluster2, the CAH
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster2.
cluster1_kmeans_23 <- which(new_cluster2 == 1)
cluster2_kmeans_23 <- which(new_cluster2 == 2)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_23, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_23, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode K-means") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: CAH partition; the cluster*_cah_2 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_2, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster2 differs from the CAH classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster2)
clusters <- new_cluster2
idx <- (new_cluster2 != reshclust3)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_kmeans_23, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_kmeans_23, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Comparaison K-means vs CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

##### CAH Vs GMM

In [None]:
# CAH vs GMM, 2 clusters: confusion matrix between the two labelings.
# matchClasses() is defined outside this excerpt; presumably it relabels the
# CAH partition (reshclust3) to line up with the GMM classes -- TODO confirm.
new_cluster3 <- matchClasses(
  resBIC3$classification,
  reshclust3
)

# GMM classes are passed as targets, the matched CAH labels as predictions.
conf_mat <- confusion_matrix(
  targets = resBIC3$classification,
  predictions = new_cluster3
)
plot_confusion_matrix(conf_mat)

In [None]:
# Maps for the 2-cluster case: CAH labels held in new_cluster3, the GMM
# partition, then the stations on which both labelings disagree.

# Plot size (repr options) for the figures of this cell.
options(repr.plot.width = 10, repr.plot.height = 7)

# Row indices of the stations carrying each label in new_cluster3.
cluster1_cah_22 <- which(new_cluster3 == 1)
cluster2_cah_22 <- which(new_cluster3 == 2)

# Map 1: one point layer per label; colour is a fixed layer parameter and
# every legend entry uses shape 18.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_22, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_22, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode CAH") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Map 2: GMM partition; the cluster*_gmm_2 index vectors are defined outside
# this cell.
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_gmm_2, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_gmm_2, ], aes(shape = "cluster2"), color = 3, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Méthode GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))


# Stations whose label in new_cluster3 differs from the GMM classification
# are recoded as 7 in a copy of the labels; cluster7 holds their indices.
n <- length(new_cluster3)
clusters <- new_cluster3
idx <- (new_cluster3 != resBIC3$classification)
clusters[idx] <- 7
cluster7 <- which(clusters == 7)
nb_diff <- length(cluster7)
ratio <- nb_diff/n

cat("Ratio du nb de points classés différemment:", ratio, "\n")
cat("Nb de stations classées différemment:", nb_diff, "\n")

# Map 3: same layers as map 1, with the disagreeing stations added as the
# last layer so they are drawn on top (color = 1).
ggplot(coord, aes(x = longitude, y = latitude)) +
  geom_point(data = coord[cluster1_cah_22, ], aes(shape = "cluster1"), color = 2, size = 2) +
  geom_point(data = coord[cluster2_cah_22, ], aes(shape = "cluster2"), color = 3, size = 2) +
  geom_point(data = coord[cluster7, ], aes(shape = "différence"), color = 1, size = 2) +
  scale_shape_manual(name = "Legend", values = c("cluster1" = 18, "cluster2" = 18, "différence" = 18)) +
  labs(title = "Stations loading - 2 Clusters - Comparaison CAH vs GMM") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))

# Analyse en correspondances multiples (ACM)

## Sélection des variables

In [None]:
# Mean loading per station and per time slot. Each list entry names a slot
# and gives the columns of `loading` averaged for it; do.call() hands the
# resulting named vectors to data.frame() as one column each.
# NOTE(review): only columns 1 to 24 of `loading` are used, although it holds
# 168 columns (7 days) -- presumably the first day only; confirm this is
# intended.
loading_heure <- do.call(
  data.frame,
  lapply(
    list(
      Moyenne_00h_03h = 1:4,
      Moyenne_04h_06h = 5:7,
      Moyenne_07h_09h = 8:10,
      Moyenne_10h_12h = 11:13,
      Moyenne_13h_16h = 14:17,
      Moyenne_17h_19h = 18:20,
      Moyenne_20h_23h = 21:24
    ),
    function(colonnes) rowMeans(loading[, colonnes])
  )
)

# Append the two station descriptors taken from coord.
loading_heure <- cbind(
  loading_heure,
  coord[c('bonus', 'localisation')]
)

head(loading_heure)

In [None]:
# Columns that must not be converted to loading classes
colonnes_non_numeriques <- c('localisation', 'bonus')

# Map loading values to the label of their class.
#
# Classes are cut at 0.25, 0.5 and 0.75; a value equal to a cut point belongs
# to the upper class (0.25 -> "[0.25;0.5]"), and anything >= 0.75 is labelled
# "[0.75;1]".
#
# value: numeric scalar or vector.
# Returns a character vector of the same length as `value`; NA inputs give NA
# instead of raising an error, and vector input is handled element-wise.
assign_class <- function(value) {
    class_labels <- c("[0;0.25]", "[0.25;0.5]", "[0.5;0.75]", "[0.75;1]")
    # findInterval() counts how many cut points are <= value (0 to 3), which
    # is the zero-based position of the class.
    class_labels[findInterval(value, c(0.25, 0.5, 0.75)) + 1L]
}

# Start from a copy of loading_heure, then overwrite every column that is not
# listed in colonnes_non_numeriques with the class label of each of its
# values. The values are read from loading_heure, which is identical to the
# fresh copy at this point.
loading_heure_factor <- loading_heure

loading_heure_factor[, !(names(loading_heure_factor) %in% colonnes_non_numeriques)] <- apply(
    X = loading_heure[, !(names(loading_heure) %in% colonnes_non_numeriques)],
    MARGIN = 2,
    FUN = function(colonne) sapply(colonne, assign_class)
)

print(head(loading_heure_factor))

In [None]:
# Convert every column to an unordered factor.
#
# For the seven time-slot columns the levels are given explicitly, from the
# emptiest to the fullest class. as.factor() would sort the labels with the
# collation of the current locale, which can put "[0;0.25]" after
# "[0.75;1]" (e.g. in the C locale) and so change the order of the categories
# in the plots and in the MCA output. droplevels() keeps only the classes
# actually present in a column, as as.factor() did.
niveaux_classes <- c("[0;0.25]", "[0.25;0.5]", "[0.5;0.75]", "[0.75;1]")
colonnes_classes <- c(
  "Moyenne_00h_03h", "Moyenne_04h_06h", "Moyenne_07h_09h", "Moyenne_10h_12h",
  "Moyenne_13h_16h", "Moyenne_17h_19h", "Moyenne_20h_23h"
)
loading_heure_factor[colonnes_classes] <- lapply(
  loading_heure_factor[colonnes_classes],
  function(colonne) droplevels(factor(colonne, levels = niveaux_classes))
)

# The two station descriptors keep the default level order.
loading_heure_factor$bonus <- as.factor(loading_heure_factor$bonus)
loading_heure_factor$localisation <- as.factor(loading_heure_factor$localisation)

# Display the class of every column
print(sapply(loading_heure_factor, class))

In [None]:
# Show the first rows of the table of classes.
head(loading_heure_factor)

In [None]:
# Class counts of the seven time-slot variables (columns 1 to 7), one plot
# per variable on a 2 x 4 grid.
par(mfrow = c(2, 4))
I <- 7
for (i in seq_len(I)) {
  plot(
    loading_heure_factor[, i],
    main = colnames(loading_heure_factor)[i],
    ylab = "Count",
    col = "steelblue",
    las = 2
  )
}

# Same plot for the two remaining columns (8 and 9), side by side.
par(mfrow = c(1, 2))
I <- 2
for (i in 7 + seq_len(I)) {
  plot(
    loading_heure_factor[, i],
    main = colnames(loading_heure_factor)[i],
    ylab = "Count",
    col = "steelblue",
    las = 2
  )
}

En moyenne et ce, quelle que soit l'heure de la journée, la plupart des stations sont vides ou contiennent peu de vélos. Ceci peut s'expliquer non seulement par le fait qu'il y a en permanence des usagers utilisant des Vélib', mais aussi par le fait qu'une partie des Vélib' peut être hors service (Vélib' non ramenés aux stations, en réparation...).

De plus, on remarque qu'entre 10h et 19h (et même un peu au delà visiblement), le nombre de stations peu remplies (remplissage entre 0 et 0.25) augmente. Ces horaires regroupent la fin de matinée, l'après-midi et le début de soirée. Ce sont des horaires d'activités. Il semble donc normal qu'un plus grand nombre d'usagers aient recours aux Vélib' pour se déplacer (pour le travail, pour aller d'une activité à une autre, changer de mode de transport....).

Enfin on remarque que les stations situées en altitude ou proches de la Seine sont une minorité par rapport à l'ensemble des stations.

## Représentation de l'ACM

In [None]:
# Multiple correspondence analysis on the seven time-slot variables
# (columns 1 to 7), without FactoMineR's automatic graphs.
res.mca <- MCA(
  loading_heure_factor[1:7],
  graph = FALSE
)
print(res.mca)

In [None]:
# First rows of the eigenvalue table of the MCA.
head(res.mca$eig)

# Scree plot of the MCA dimensions, with the value printed on each bar.
fviz_screeplot(res.mca, addlabels=TRUE)

La première dimension permet de conserver près de 19.6% de l'information et la seconde 10.6%. On obtient donc un peu plus de 30% de l'information initiale avec les deux premières dimensions seulement, ce qui est correct pour une Analyse en Correspondances Multiples. Si on voulait conserver près de 80% de l'information comme on a l'habitude de le faire avec une ACP, alors il faudrait travailler avec les dix premières dimensions. On se rend bien sûr compte que cela deviendrait trop contraignant. Nous allons dans la suite uniquement travailler avec les dimensions 1 et 2 de notre ACM.

In [None]:
# Plot of the variables (choice "mca.cor": their correlation with the
# dimensions); repel = TRUE keeps the labels from overlapping.
fviz_mca_var(res.mca, choice="mca.cor", repel=TRUE)

Excepté le créneau de 20h à 23h, toutes les plages horaires semblent très corrélées à la première dimension.

La seconde dimension semble quant à elle discriminer les heures en fonction du moment de la journée : les horaires "tôt" ou horaires "du matin" (de minuit à 9h) sont très corrélés à cette dimension, au contraire des autres tranches horaires.

In [None]:
# Plot of the categories, coloured by fill level: col.var receives the four
# fill-level labels repeated 7 times (one block per time-slot variable).
# NOTE(review): this assumes that every variable has all four classes and
# that its categories are ordered from emptiest to fullest -- confirm against
# the factor level order, otherwise the colours are attached to the wrong
# classes.
fviz_mca_var(res.mca, col.var = rep(c("vide","presque vide","presque pleine","pleine"),7),
             title = "Graph des loading par heure",
             repel = TRUE)

Graphiquement on constate une répartition parabolique de la densité de Vélib' (par valeurs croissantes du remplissage) :
- pour toutes les plages horaires, les stations très peu remplies [0;0.25] sont globalement situées dans le troisième quadrant du graphe
- les stations peu remplies [0.25;0.5] sont majoritairement dans le quatrième quadrant
- les stations remplies [0.5;0.75] appartiennent au premier quadrant
- les stations très remplies sont situées au niveau du second quadrant


## Qualité de représentation des variables et des individus

In [None]:
# First rows of the squared cosines (cos2) of the categories.
head(res.mca$var$cos2)

# Dimensions of the plane under study.
i <- 1:2

# Category with the largest cos2 summed over the two dimensions.
idx <- which.max(rowSums(res.mca$var$cos2[, i]))
loading_heure_f <- rownames(res.mca$var$cos2)[idx]

print(paste(
  "L'heure et le chargement des stations les mieux représentés dans le plan",
  i[1], '-', i[2], "sont", loading_heure_f
))

# Category with the smallest cos2 summed over the two dimensions.
idx <- which.min(rowSums(res.mca$var$cos2[, i]))
loading_heure_f <- rownames(res.mca$var$cos2)[idx]

print(paste(
  "L'heure et le chargement des stations les moins bien représentés dans le plan",
  i[1], '-', i[2], "sont", loading_heure_f
))

In [None]:
# Plot of the categories coloured by their cos2, on a blue-yellow-red
# gradient.
fviz_mca_var(res.mca, col.var = "cos2",
             gradient.cols = c("blue", "yellow", "red"),
             repel = TRUE )

# Bar plot of the cos2 of the categories over dimensions 1 and 2.
fviz_cos2(res.mca, choice = "var", axes = 1:2)

Les variables sont mieux représentées pour des valeurs extrêmes de la densité de Vélib', à savoir [0;0.25] ou [0.75;1]. En particulier, les horaires les mieux représentés sont ceux du "matin" (au sens défini plus tôt à savoir entre minuit et 9h), ce qui semble cohérent puisque nous avons vu précédemment qu'ils étaient corrélés à la fois à la dimension 1 et à la dimension 2.

In [None]:
# Plot of the individuals (stations) coloured by their cos2, on a
# blue-yellow-red gradient.
fviz_mca_ind(res.mca, col.ind = "cos2",
             gradient.cols = c("blue", "yellow", "red"))

Les individus les mieux représentés sont ceux situés "au bord" du graphe, en particulier en bas à gauche du troisième quadrant et en bas à droite du second quadrant (tout comme le sont les variables finalement).

## Contribution des variables à l'ACM

In [None]:
# Contributions of the categories to dimension 1 (top = 20 limits the plot to
# 20 categories).
fviz_contrib(res.mca, choice="var", axes=1, top=20)

# Same plot for dimension 2.
fviz_contrib(res.mca, choice="var", axes=2, top=20)

En analysant les deux graphes ci-dessus, on constate que pour la dimension 1, les variables qui contribuent le plus sont celles associées aux valeurs extrêmes de la densité de Vélib', et ce quels que soient les horaires. Pour la dimension 2, il s'agit des variables "du matin", là encore pour les valeurs extrêmes du remplissage. Ainsi, dans notre cas, les variables qui contribuent le plus à l'ACM sont celles qui sont le mieux représentées.

In [None]:
# Plot of the categories coloured by their contribution, on a
# blue-yellow-red gradient.
fviz_mca_var(res.mca, col.var = "contrib",
             gradient.cols = c("blue", "yellow", "red"),
             repel = TRUE)

Le graphe des variables colorié en fonction de la contribution des points permet d'avoir une confirmation visuelle des affirmations précédentes.

## Habillage en fonction des variables supplémentaires : bonus et localisation

In [None]:
# Run the MCA again on the whole table, with columns 8 and 9 (bonus and
# localisation) declared as supplementary qualitative variables; this
# replaces the previous res.mca.
res.mca <- MCA(
  loading_heure_factor,
  quali.sup = 8:9,
  graph = FALSE
)

In [None]:
# Plot of the categories for the MCA currently stored in res.mca.
fviz_mca_var(res.mca)

Le graphe des variables semble indiquer qu'un bonus de 1 est plutôt corrélé à de faibles valeurs de la densité de Vélib'. Concrètement, cela signifie que les stations situées en altitude ont tendance à être plus souvent vides que les autres. Ceci confirme ce que nous avons vu dans les méthodes de clustering : ces stations sont plus difficiles d'accès donc moins sollicitées par les usagers.

Quant à la variable localisation, il semblerait que les stations proches de la Seine soient plus souvent remplies. Là encore, cela corrobore nos propos de l'analyse exploratoire et des méthodes de clustering : la Seine regroupe des zones de travail, d'activités, de transitions dans les modes de transport... Ce qui nécessite régulièrement l'usage des Vélib', notamment pour de courts trajets.

In [None]:
# Plots of the individuals without labels, coloured by two columns of coord
# that are created outside this excerpt.
# NOTE(review): couleurs1 and couleurs2 presumably encode localisation and
# bonus (the text below discusses both) -- confirm which is which.
fviz_mca_ind(res.mca, label = "none", col.ind = coord$couleurs1, palette = c("yellow","blue"), title = "Graphe des individus - ACM")

fviz_mca_ind(res.mca,label = "none", col.ind = coord$couleurs2, palette = c("green","red"), title = "Graphe des individus - ACM")

Pour localisation, on constate que les stations proches de la Seine sont positivement corrélées à la première dimension qui, on le rappelle, est associée aux plus fortes valeurs de densité de Vélib' (>0.5). Ceci confirme donc ce que l'on voit avec le graphe des variables.

En revanche, il est difficile d'émettre la moindre conclusion pour la variable bonus (du fait notamment du faible effectif de la modalité "Colline").

In [None]:
# cos2 of the categories of the supplementary qualitative variables.
res.mca$quali.sup$cos2

In [None]:
# Plot of the variables (choice "mca.cor") for the MCA currently stored in
# res.mca; repel = TRUE keeps the labels from overlapping.
fviz_mca_var(res.mca, choice="mca.cor", repel=TRUE)

En affichant la qualité de représentation des variables supplémentaires, on constate que localisation et bonus sont assez mal représentées par notre ACM (elles sont très peu corrélées à la première dimension et ne le sont pas du tout avec la seconde). Ceci explique en partie la difficulté de tirer de l'information du graphe des individus avec l'habillage en fonction de l'altitude (bonus). La qualité de représentation n'est donc en théorie pas suffisante pour pouvoir s'assurer de la validité de nos hypothèses sur localisation et bonus. Malgré tout, puisque ces dernières vont dans le même sens que les observations des sections précédentes, cela laisse à penser qu'elles sont plutôt cohérentes et intéressantes.

# Conclusion

L'analyse exploratoire, les méthodes de clustering et l'Analyse en Correspondances Multiples ont toutes permis d'identifier les mêmes comportements récurrents chez les utilisateurs de Vélib' à Paris.

En effet, les habitudes des usagers sont d'une part influencées par les heures et les jours de la semaine avec principalement une discrimination heures d'activités/ heures de repos et une autre jours du lundi au vendredi / weekend.

D'autre part, la densité de Vélib' est dépendante de critères géographiques : stations proches de la Seine ou non, stations en altitude, etc.

En utilisant les données recueillies sur ces facteurs pour une station précise, il est possible d'anticiper sa densité de Vélib' à un moment spécifique de la semaine. Cela est particulièrement pratique pour la compagnie en charge de la maintenance de ces modes de transport : anticiper le comportement des usagers permet de réguler plus facilement la densité de Vélib' dans la ville (stations ayant besoin d'être réapprovisionnées à certains moments par exemple) et de fait d'avoir plus de contrôle.