In [18]:
import pandas as pd
import seaborn as sns
import numpy as np
from ydata_profiling import ProfileReport
import plotly.graph_objs as go
import plotly.subplots as sp

In [19]:
# figure settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set()
sns.set_theme(style="whitegrid")

# set seed
np.random.seed(7)

## Import data
---
Import and transform the dataset to pandas.

In [22]:
# read the raw feature table and the raw label table, in that order
x_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/raw/train_values.csv",
        "../../data/raw/train_labels.csv",
    )
)

In [23]:
# attach the labels to the features by matching rows on building_id
train = x_train.merge(y_train, on="building_id")

In [24]:
# check data types: prints every column with its non-null count and dtype
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 40 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [25]:
# data preview: display the first rows of the merged features + labels frame
train.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


**Findings:**
* The data set consists only of features of type int64 or object, with object representing categorical data. This has to be considered in later computations. Therefore, the data set is split into numeric features and categorical features.
* Referring to the data [documentation](https://www.drivendata.org/competitions/57/nepal-earthquake/page/136/), the features of dtype object also include boolean features labeled via 0 and 1. These are also included in the categorical analysis in this notebook.

In [27]:
# partition the merged frame by dtype: numeric columns vs. object (string) columns
train_numeric = train.select_dtypes(include=np.number)
train_cat = train.select_dtypes(include='object')

# keep the column names of each partition as plain lists
numeric_cols = list(train_numeric.columns)
cat_cols = list(train_cat.columns)

## Missing values
---
Check the data for missing values. If any are present, identify the affected features.

In [28]:
# True as soon as a single cell anywhere in the frame is missing
train.isna().to_numpy().any()

False

In [29]:
# per-column missing-value statistics: raw count and share of all rows in percent
missing_per_column = train.isna().sum()
temp = pd.DataFrame(
    {
        "absolute": missing_per_column,
        "relative (%)": missing_per_column / len(train) * 100,
    }
)

In [30]:
# summarize the range of the per-column missing share, then show the full table
print(f"Column with lowest amount of missings contains {temp['relative (%)'].min()} % missings.")
print(f"Column with highest amount of missings contains {temp['relative (%)'].max()} % missings.")
# rich display of the table with the absolute and relative missing counts per column
display(temp)

Column with lowest amount of missings contains 0.0 % missings.
Column with highest amount of missings contains 0.0 % missings.


Unnamed: 0,absolute,relative (%)
building_id,0,0.0
geo_level_1_id,0,0.0
geo_level_2_id,0,0.0
geo_level_3_id,0,0.0
count_floors_pre_eq,0,0.0
age,0,0.0
area_percentage,0,0.0
height_percentage,0,0.0
land_surface_condition,0,0.0
foundation_type,0,0.0


In [31]:
# names of the columns that hold at least one missing value
train.columns[train.isna().any().to_numpy()].tolist()

[]

**Findings:** There are no missing values! Good for us, no further adjustment needs to be done.

## Distribution
---
Use data visualization techniques to inspect the data’s distribution and verify the presence of imbalance, outliers or default values. This is done separately for all numeric and categorical features.

In [None]:
# summary statistics without the count and quartile rows
train.describe().drop(index=["count", "25%", "50%", "75%"])

In [None]:
%%time
# build a profiling report for the merged training frame (the cell is timed by %%time)
profile = ProfileReport(train, title="Profiling Report")
# write the report to an HTML file, which can be opened in a separate browser tab
profile.to_file(output_file="train_pandas_profile.html")

### Distribution I: numeric features

In [None]:
# Remove the building_id column before plotting distributions.
# `columns=` is required: drop() with positional labels targets the row index
# (axis=0), so the original call raised a KeyError instead of dropping the column.
df = train_numeric.drop(columns=["building_id"])

In [None]:
# Define the number of columns and rows for the subplot grid
ncols = 2
# ceiling division: a trailing, partly filled row still gets its own grid row
nrows = -(-df.shape[1] // ncols)

# Create a list of subplot titles
subplot_titles = list(df.columns)

# Create the subplot figure. make_subplots is reached through the `sp` alias because
# the notebook imports the module as `import plotly.subplots as sp`; the bare name
# `make_subplots` is never defined.
fig = sp.make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)

# Loop through each column in the dataframe and add a histogram trace to the corresponding subplot
for i, column in enumerate(df.columns):
    # 1-based grid position, filled row by row. The column name is kept in its own
    # variable (`column`) so the grid index `col` does not overwrite it.
    row = (i // ncols) + 1
    col = (i % ncols) + 1
    fig.add_trace(go.Histogram(x=df[column], name=column), row=row, col=col)
    fig.update_xaxes(title_text=column, row=row, col=col)
    fig.update_yaxes(title_text="Count", row=row, col=col)

# Update the layout and show the plot.
# The height grows with the number of rows (never below the previous 800 px) so the
# individual histograms stay readable when there are many columns.
fig.update_layout(title="Distribution of Columns", height=max(800, 300 * nrows))
fig.show()

In [None]:
# calculate outliers

### Distribution II: categorical features

**Findings:**