In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [113]:
# load training data 
data_x_train = pd.read_csv("../data/raw/X_train_J01Z4CN.csv")
data_y_train = pd.read_csv("../data/raw/y_train_OXxrJt1.csv")


# check shape (matrix dimension)
print(f"shape xtrain : {data_x_train.shape}")
print(f"shape ytrain : {data_y_train.shape}")

# glance at data for x train
data_x_train.head()

shape xtrain : (37368, 27)
shape ytrain : (37368, 2)


Unnamed: 0,id_annonce,property_type,approximate_latitude,approximate_longitude,city,postal_code,size,floor,land_size,energy_performance_value,...,nb_parking_places,nb_boxes,nb_photos,has_a_balcony,nb_terraces,has_a_cellar,has_a_garage,has_air_conditioning,last_floor,upper_floors
0,35996577,appartement,43.64388,7.117183,villeneuve-loubet,6270,63.0,,,,...,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,35811033,appartement,45.695757,4.89561,venissieux,69200,90.0,3.0,,223.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,35731841,maison,47.966791,-1.220451,moutiers,35130,61.0,,370.0,,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35886765,maison,47.289292,-1.878805,cordemais,44360,142.0,,764.0,217.0,...,0.0,1.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,35781137,appartement,45.718992,4.844234,lyon-7eme,69007,88.0,3.0,,,...,0.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# glance data for y train
data_y_train.head()

Unnamed: 0,id_annonce,price
0,35996577,355000.0
1,35811033,190000.0
2,35731841,39000.0
3,35886765,299000.0
4,35781137,478000.0


In [117]:
# summary of dataframe
data_x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37368 entries, 0 to 37367
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id_annonce                   37368 non-null  int64  
 1   property_type                37368 non-null  object 
 2   approximate_latitude         37368 non-null  float64
 3   approximate_longitude        37368 non-null  float64
 4   city                         37368 non-null  object 
 5   postal_code                  37368 non-null  int64  
 6   size                         36856 non-null  float64
 7   floor                        9743 non-null   float64
 8   land_size                    15581 non-null  float64
 9   energy_performance_value     19068 non-null  float64
 10  energy_performance_category  19068 non-null  object 
 11  ghg_value                    18530 non-null  float64
 12  ghg_category                 18530 non-null  object 
 13  exposition      

In [118]:
# summary of dataframe 
data_y_train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37368 entries, 0 to 37367
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id_annonce  37368 non-null  int64  
 1   price       37368 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 584.0 KB


In [120]:
# check for duplicated values
print(f"Duplicated values in training set: {data_x_train.duplicated().sum()}")

Duplicated values in training set: 0


## Descriptive statistics for numerical variables

In [121]:
# select relevant features
numerical_features = ["size", "floor", "land_size", "energy_performance_value", "ghg_value", "nb_rooms", "nb_bedrooms", "nb_bathrooms", "nb_parking_places", "nb_boxes", "nb_photos"]
# compute statistics
summary_numerical_features = data_x_train[numerical_features].describe().T

summary_numerical_features

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
size,36856.0,1088.831615,5555.320867,1.0,74.0,115.0,235.25,411311.0
floor,9743.0,3.479524,6.725577,1.0,1.0,2.0,4.0,55.0
land_size,15581.0,3995.665362,54415.954149,1.0,362.0,795.0,1841.0,6203700.0
energy_performance_value,19068.0,205.385148,790.501769,0.0,125.0,180.0,239.0,100000.0
ghg_value,18530.0,31.845548,310.576209,0.0,8.0,16.0,36.0,17024.0
nb_rooms,35802.0,4.245405,2.987782,0.0,3.0,4.0,5.0,130.0
nb_bedrooms,34635.0,2.864877,2.156517,0.0,2.0,3.0,4.0,118.0
nb_bathrooms,24095.0,0.92073,0.270778,0.0,1.0,1.0,1.0,3.0
nb_parking_places,37368.0,0.293486,0.455365,0.0,0.0,0.0,1.0,1.0
nb_boxes,37368.0,0.179699,0.383942,0.0,0.0,0.0,0.0,1.0


## Descriptive statistics for categorical variables

In [124]:
# too many
#data_x_train["city"].value_counts()
data_x_train["city"].nunique()

8643

In [125]:
# select relevant object type features 
categorical_features = ["property_type", "energy_performance_category", "ghg_category", "exposition"]

# compute occurrences
for val in categorical_features:
    print(data_x_train[val].value_counts(),"\n\n")

property_type
appartement          15733
maison               15675
divers                2137
terrain               1534
villa                  781
propriété              408
terrain à bâtir        298
duplex                 220
viager                 177
ferme                  130
parking                 96
loft                    54
chalet                  48
château                 21
moulin                  17
manoir                  15
péniche                  8
hôtel particulier        6
chambre                  4
gîte                     3
atelier                  2
hôtel                    1
Name: count, dtype: int64 


energy_performance_category
D    7023
C    4113
E    3813
B    2138
F    1049
A     616
G     316
Name: count, dtype: int64 


ghg_category
B    4531
C    3421
D    3247
E    3008
A    2547
F    1271
G     505
Name: count, dtype: int64 


exposition
Sud           3239
Sud-Ouest     1395
Sud-Est        956
Est-Ouest      820
Ouest          807
Est            692

## Descriptive statistics for binary variables

In [126]:
# select boolean type features 

binary_features = ["has_a_balcony", "nb_terraces", "has_a_cellar", "has_a_garage", "has_air_conditioning", "last_floor", "upper_floors"]

# compute proportions
for val in binary_features:
    print(data_x_train[val].value_counts(normalize=True), "\n\n")


has_a_balcony
0.0    0.851718
1.0    0.148282
Name: proportion, dtype: float64 


nb_terraces
0.0    0.690404
1.0    0.309596
Name: proportion, dtype: float64 


has_a_cellar
0.0    0.799855
1.0    0.200145
Name: proportion, dtype: float64 


has_a_garage
0.0    0.94613
1.0    0.05387
Name: proportion, dtype: float64 


has_air_conditioning
0.0    0.960153
1.0    0.039847
Name: proportion, dtype: float64 


last_floor
0.0    0.996173
1.0    0.003827
Name: proportion, dtype: float64 


upper_floors
0.0    0.999759
1.0    0.000241
Name: proportion, dtype: float64 




# Data visualization