# ANDO: NEW YORK AIRBNB Dataset

## Importation des bibliothèques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Scikit-learn pour la préparation, la réduction de dimension et le clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

import warnings	
warnings.filterwarnings('ignore')

## Lecture du dataset

In [None]:
# Load the raw listings from the CSV file expected in the working directory.
csv_path = 'AB_NYC_2019.csv'
df = pd.read_csv(csv_path)

# Preview the first rows; as the last expression of the cell it is rendered as output.
df.head()

## Tâche 1: Nettoyage et préparation

In [None]:
# Cleaning pipeline. Every step assigns its result explicitly instead of
# mutating a column selection in place, so the outcome does not depend on
# pandas' chained-assignment / Copy-on-Write behaviour.

# 1. Replace the missing values of reviews_per_month with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
print("Les valeurs manquantes de reviews_per_month ont été remplacées par 0.")

# 2. Drop the columns that are not used by the analysis.
to_drop = ['name', 'id', 'last_review']
df = df.drop(columns=to_drop)
print("Les colonnes name, id et last_review ont été supprimées.")

# 3. Trim extreme prices: keep the rows with 0 < price < 99th percentile.
#    .copy() makes the filtered frame independent, so the column assignments
#    below do not write into a slice of the unfiltered data.
price_cap = df['price'].quantile(0.99)
df = df[(df['price'] > 0) & (df['price'] < price_cap)].copy()
print(f"Les valeurs extrêmes de price au-dessus de {price_cap} ont été retirées.")

# 4. log_price = log(1 + price), rounded to two decimals.
df['log_price'] = np.log1p(df['price']).round(2)
print("La variable log_price a été créée à partir de price.")

# 5. Host type derived from the number of listings of the host:
#    0 = occasional (default value, which covers hosts with a single listing),
#    1 = multi (2 to 5 listings), 2 = operator (more than 5 listings).
listings_count = df['calculated_host_listings_count']
df['host_type'] = 0
df.loc[listings_count.between(2, 5), 'host_type'] = 1
df.loc[listings_count > 5, 'host_type'] = 2

print("La variable host_type a été créée selon le volume d'annonces.")

# 6. Independent copy that the following analysis cells work on.
df_cleaned = df.copy()
print("Nettoyage terminé. Le DataFrame propre est prêt pour l'analyse.")
df_cleaned.head()

## Tâche 2: Analyse exploratoire et tests d'hypothèses

In [None]:
# Overview of the cleaned frame: structure, missing values, summary statistics.

# 1. General info. DataFrame.info() prints its report itself and returns None,
#    so it is called directly; wrapping it in display() only adds a stray "None".
print("DataFrame Info:")
df_cleaned.info()

# 2. Share of missing values per column, expressed in percent.
print("\nMissing Values Percentage in Each Column:")
display(df_cleaned.isnull().sum() / df_cleaned.shape[0] * 100)

# 3. Summary statistics of the numerical columns.
print("\nStatistical Summary of Numerical Columns:")
display(df_cleaned.describe())

# 4. Summary statistics of the non-numerical columns, computed on the cleaned
#    frame like the other summaries of this cell.
display(df_cleaned.describe(exclude=['number']))

In [None]:
# Impute the missing host names with the placeholder 'Unknown'.
# df_cleaned was created with df.copy(), so it is an independent frame and has
# to be imputed explicitly; df is imputed as well, as before. The result is
# assigned back to the column because an in-place fillna on a column selection
# is a chained assignment that pandas Copy-on-Write does not propagate.
df_cleaned['host_name'] = df_cleaned['host_name'].fillna('Unknown')
df['host_name'] = df['host_name'].fillna('Unknown')

In [None]:
# 1-2. Side-by-side histograms: raw price (left) and log_price (right).
fig, (ax_price, ax_log) = plt.subplots(1, 2, figsize=(14, 6))
sns.histplot(df_cleaned['price'], kde=True, bins=50, ax=ax_price)
ax_price.set_title('Distribution des Prix (Brute)')
sns.histplot(df_cleaned['log_price'], kde=True, bins=50, ax=ax_log)
ax_log.set_title('Distribution du Log-Prix (plus proche de la loi Normale)')
plt.show()

# 3-4. Bivariate analysis: one boxplot of log_price per categorical variable.
boxplot_specs = [
    ('neighbourhood_group', 'Boxplot du Log-Prix par Arrondissement'),
    ('room_type', 'Boxplot du Log-Prix par type de logement'),
]
for category, title in boxplot_specs:
    plt.figure(figsize=(8, 5))
    sns.boxplot(data=df_cleaned, x=category, y='log_price')
    plt.title(title)
    plt.show()

# 5. Correlation matrix of the numerical features, drawn as an annotated heatmap.
numerical_cols = [
    'log_price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'host_type',
]
corr_matrix = df_cleaned[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features')
plt.show()

### Tests d'hypothèses

In [None]:
# Hypothesis tests on log_price and on the categorical variables.
# Every decision below compares the p-value with a 5% significance level.

# 1. One-way ANOVA: log_price across the neighbourhood groups.
groups = [grp['log_price'].values for _, grp in df_cleaned.groupby('neighbourhood_group')]
f_stat, p_value = stats.f_oneway(*groups)
print(f"ANOVA test results for log_price across neighbourhood_group: F-statistic = {f_stat}, p-value = {p_value}")
if p_value < 0.05:
    print("Reject the null hypothesis: Significant differences exist between groups.")
else:
    print("Fail to reject the null hypothesis: No significant differences between groups.")

# 2. t-test for log_price between 'Shared room' and 'Private room'.
#    equal_var=False: the two samples are not assumed to share one variance.
shared_room = df_cleaned[df_cleaned['room_type'] == 'Shared room']['log_price']
private_room = df_cleaned[df_cleaned['room_type'] == 'Private room']['log_price']
t_stat, p_value = stats.ttest_ind(shared_room, private_room, equal_var=False)
print(f"T-test results for log_price between 'Shared room' and 'Private room': T-statistic = {t_stat}, p-value = {p_value}")
if p_value < 0.05:
    print("Reject the null hypothesis: Significant difference in log_price between the two room types.")
else:
    print("Fail to reject the null hypothesis: No significant difference in log_price between the two room types.")

# 3. Chi-squared test of independence between neighbourhood_group and room_type.
contingency_table = pd.crosstab(df_cleaned['neighbourhood_group'], df_cleaned['room_type'])
chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-squared test results between neighbourhood_group and room_type: Chi2-statistic = {chi2_stat}, p-value = {p_value}")
if p_value < 0.05:
    print("Reject the null hypothesis: Variables are dependent.")
else:
    print("Fail to reject the null hypothesis: Variables are independent.")

# 4. t-test: does the mean log_price in Manhattan differ from the one in Brooklyn?
#    The first sample must be filtered on 'Manhattan' to match the question and
#    the printed header.
manhattan_prices = df_cleaned[df_cleaned['neighbourhood_group'] == 'Manhattan']['log_price']
brooklyn_prices = df_cleaned[df_cleaned['neighbourhood_group'] == 'Brooklyn']['log_price']
t_stat, p_value = stats.ttest_ind(manhattan_prices, brooklyn_prices, equal_var=False)
print(f"--- T-test (Manhattan vs Brooklyn) ---")
print(f"P-valeur: {p_value:.5f}")
if p_value < 0.05:
    print("Conclusion : On rejette l'hypothèse nulle. Les prix moyens sont significativement différents.")
else:
    print("Conclusion : On ne peut pas rejeter l'hypothèse nulle.")

# 5. Chi-squared test between neighbourhood group and room type (same test as
#    step 3), reported in French together with the contingency table.
contingency_table = pd.crosstab(df_cleaned['neighbourhood_group'], df_cleaned['room_type'])
chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
print(f"\n--- Test du Chi-carré (Arrondissement vs Type de chambre) ---")
print(f"P-valeur: {p_value:.5f}")
if p_value < 0.05:
    print("Conclusion : On rejette l'hypothèse nulle. Il y a une association significative entre les variables.")
else:
    print("Conclusion : On ne peut pas rejeter l'hypothèse nulle.")
display(contingency_table)

# Post-hoc analysis: Tukey's HSD pairwise comparisons of log_price between
# neighbourhood groups.
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(df_cleaned['log_price'], df_cleaned['neighbourhood_group'])
print(tukey)

# Post-hoc analysis: Tukey's HSD pairwise comparisons of log_price between room types.
tukey_room = pairwise_tukeyhsd(df_cleaned['log_price'], df_cleaned['room_type'])
print(tukey_room)

In [None]:
# Stacked bar chart: number of listings of each room type per neighbourhood group.
# The counts come from the cleaned frame, like the other analysis cells.
# fill_value=0 keeps a group/room-type pair without any listing at 0 instead of NaN.
df_room_type_neighbourhood = (
    df_cleaned.groupby('neighbourhood_group')['room_type']
    .value_counts()
    .unstack(fill_value=0)
)
df_room_type_neighbourhood.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Répartition des types de logement par arrondissement')
plt.xlabel('Arrondissement')
plt.ylabel('Nombre de logements')
plt.legend(title='Type de logement')
plt.show()

## Tâche 3: PCA et CCA

In [None]:
# Dimensionality reduction: PCA on the numerical variables, then preparation
# of the categorical input for a Multiple Correspondence Analysis (MCA).
print("\n--- Étape 4: Réduction de Dimension ---")

# 1. Principal Component Analysis (PCA) on the numerical features.
print("--- Lancement de l'ACP ---")
features_for_pca = ['log_price', 'minimum_nights', 'number_of_reviews',
                    'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

# Standardise the features first so that none of them dominates the
# components merely because of its scale.
X = df_cleaned[features_for_pca].values
X_scaled = StandardScaler().fit_transform(X)

# Project onto the first two principal components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Build the frame directly on the index of df_cleaned so that columns copied
# from df_cleaned (here host_type) align row by row.
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'], index=df_cleaned.index)
df_pca['host_type'] = df_cleaned['host_type']

# Listings projected on the first factorial plane, coloured by host type.
plt.figure(figsize=(12, 9))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='host_type', alpha=0.5)
plt.title('Projection des Annonces sur les 2 Premières Composantes Principales (ACP)')
plt.xlabel(f'PC1 - {pca.explained_variance_ratio_[0]*100:.2f}% variance')
plt.ylabel(f'PC2 - {pca.explained_variance_ratio_[1]*100:.2f}% variance')
plt.grid(True)
plt.show()

# Correlation circle (interpretation of the axes). Each row of `loadings` is a
# feature: its component weights scaled by the square root of the explained
# variance, which approximates the feature/component correlation on
# standardised data.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
fig, ax = plt.subplots(figsize=(8, 8))
for feature, (load_pc1, load_pc2) in zip(features_for_pca, loadings):
    ax.arrow(0, 0, load_pc1, load_pc2, color='r', alpha=0.8, head_width=0.02)
    # Push the label slightly beyond the arrow head so that they do not overlap.
    ax.text(load_pc1 * 1.15, load_pc2 * 1.15, feature, color='black')
circle = plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='--')
ax.add_artist(circle)
ax.axhline(0, color='black', linewidth=0.5)
ax.axvline(0, color='black', linewidth=0.5)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Cercle des Corrélations')
ax.set_aspect('equal', adjustable='box')
ax.grid(True)
plt.show()

# 2. Multiple Correspondence Analysis (MCA) on the categorical variables.
print("\n--- Lancement de l'ACM ---")
# Discretise price into three equal-frequency classes to use it as a category.
df_cleaned['price_category'] = pd.qcut(df_cleaned['price'], q=3, labels=['Bas', 'Moyen', 'Élevé'])
features_for_mca = ['neighbourhood_group', 'room_type', 'price_category']
df_mca_input = df_cleaned[features_for_mca]

# TODO: the MCA itself is disabled. It relies on the `prince` package, which is
# not imported in the visible part of this file. The sketch is kept as comments
# rather than as a bare string literal, which a notebook echoes as cell output
# when it is the last expression of the cell.
#
#   mca = prince.MCA(n_components=2, n_iter=3, random_state=42)
#   mca = mca.fit(df_mca_input)
#   # prince plots with Altair, so the title is set through chart.properties().
#   chart = mca.plot(df_mca_input)
#   chart = chart.properties(
#       title='Projection des Modalités sur le Premier Plan Factoriel (ACM)'
#   )

## Tâche 4: Clustering et segmentation des hôtes

In [None]:
# Clustering: K-Means on the PCA plane, then DBSCAN on the listing coordinates.
print("\n--- Étape 5: Clustering ---")

# 1. K-Means on the two principal components computed by the PCA.
print("--- Lancement du K-Means ---")
X_for_kmeans = df_pca[['PC1', 'PC2']].values

# Elbow method: inertia of a K-Means fit for each candidate number of clusters.
K_range = range(2, 10)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_for_kmeans).inertia_
    for k in K_range
]

plt.figure(figsize=(10, 6))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Nombre de clusters (k)')
plt.ylabel('Inertie')
plt.title('Méthode du Coude pour déterminer k optimal')
plt.show()

# Final model with k = 3.
# NOTE(review): the previous comment announced k=4 while the code has always
# used 3 - confirm the intended value against the elbow plot.
k_optimal = 3
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
df_pca['kmeans_cluster'] = kmeans.fit_predict(X_for_kmeans)
print(f"Clustering K-Means avec k={k_optimal} terminé.")

# K-Means clusters drawn on the PCA plane.
plt.figure(figsize=(12, 9))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='kmeans_cluster', palette='viridis', alpha=0.7)
plt.title('Clusters K-Means visualisés sur le plan de l\'ACP')
plt.show()

# Cluster interpretation: mean of the numerical columns per cluster. The join
# aligns on the index that df_pca and df_cleaned have in common.
cluster_profile = df_cleaned.join(df_pca['kmeans_cluster'])
print("Profil moyen des clusters (sur données non-standardisées) :")
print(cluster_profile.groupby('kmeans_cluster')[numerical_cols].mean())


# 2. DBSCAN for the spatial clustering.
print("\n--- Lancement du DBSCAN ---")
# Work on a sample for performance reasons; the size is capped at the number of
# available rows because sample() raises when n exceeds it (without replacement).
sample_size = min(20000, len(df_cleaned))
df_sample_spatial = df_cleaned.sample(n=sample_size, random_state=42)
coords = df_sample_spatial[['latitude', 'longitude']].values

# eps and min_samples drive the result and need tuning.
# eps is expressed in degrees because the distance is computed on the raw
# latitude/longitude values (0.01 degree is roughly 1 km in NYC).
dbscan = DBSCAN(eps=0.01, min_samples=50)
df_sample_spatial['dbscan_cluster'] = dbscan.fit_predict(coords)
print("Clustering spatial DBSCAN terminé.")

# DBSCAN labels noise points with -1. Count the clusters over the other labels
# only, so that the number stays right when no point is labelled as noise.
cluster_labels = df_sample_spatial['dbscan_cluster']
n_clusters = cluster_labels[cluster_labels != -1].nunique()
n_noise = (cluster_labels == -1).sum()
print(f"Nombre de clusters trouvés : {n_clusters}")
print(f"Nombre de points 'bruit' (outliers) : {n_noise}")

In [None]:
import geopandas as gpd
import contextily as ctx

# Scatter plot of the sampled listings coloured by their DBSCAN cluster label,
# in agreement with the title and the legend of the figure (-1 is the label
# DBSCAN gives to noise points).
plt.figure(figsize=(12, 12))
sns.scatterplot(data=df_sample_spatial, x='longitude', y='latitude',
                hue='dbscan_cluster', palette='deep', s=10,
                legend='full')
plt.title('Clusters Spatiaux identifiés par DBSCAN')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Cluster ID')
plt.show()

# Same clusters on a basemap: build point geometries from longitude/latitude
# (EPSG:4326) and reproject them to EPSG:3857 before adding the tiles.
gdf_spatial = gpd.GeoDataFrame(
    df_sample_spatial,
    geometry=gpd.points_from_xy(df_sample_spatial.longitude, df_sample_spatial.latitude),
    crs="EPSG:4326"
).to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(12, 12))
gdf_spatial.plot(
    ax=ax,
    column='dbscan_cluster',
    categorical=True,
    cmap='viridis',
    markersize=5,
    alpha=0.7,
    legend=True
)

ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
ax.set_axis_off()
plt.title('DBSCAN Spatial Clusters on NYC Map')
plt.show()

## Tâche 5. Analyse spatiale et modèle explicatif
