In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the plotting calls further down —
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

In [None]:
# Read the bank dataset from disk and print its column / dtype / non-null summary.
# NOTE(review): the publicly distributed bank-full.csv is ';'-delimited — confirm
# this local copy is comma-separated, otherwise the column lookups below will fail.
bank_csv_path = 'data/bank-full.csv'
df = pd.read_csv(bank_csv_path)
df.info()

In [None]:
# total number of rows; reused later as the denominator for the outlier percentage
df_count = df.shape[0]
# preview the first few rows
df.head()

In [None]:
# Column under study: `name` is the DataFrame key, `display_name` the label
# used in plot titles and axis labels further down.
name = 'age'
display_name = 'Age'
series = df[name]

# range: smallest and largest observed value
min_value, max_value = series.min(), series.max()
print('Min age: ', min_value)
print('Max age: ', max_value)

# True when at least one entry of the column is missing
has_nulls = series.isnull().any()
print('Null Values: ', has_nulls)

In [None]:
# Bar chart with one bar per distinct 'age' value (count of rows per value).
fig, ax = plt.subplots(figsize=(20, 8))
# Pass the column as `x=` explicitly: positional data arguments are deprecated
# in seaborn 0.11 and, from 0.12 on, the first positional parameter is `data`,
# not `x`. Binding `ax=ax` draws on the axes created above rather than relying
# on matplotlib's implicit "current axes".
sns.countplot(x=series, ax=ax)
ax.set_title(display_name + ' Distribution', fontsize=15)
sns.despine(ax=ax)

In [None]:
# distinct categories present in the 'job' column (the "range" of a categorical variable)
job_categories = df['job'].unique()
print(job_categories)

In [None]:
# Bar chart of row counts per 'job' category.
fig, ax = plt.subplots(figsize=(13, 5))
# Use the `x=` keyword: positional data arguments are deprecated in seaborn 0.11
# and, from 0.12 on, the first positional parameter is `data`, not `x`.
sns.countplot(x=df['job'], ax=ax)
ax.set_title('Job Distribution', fontsize=15)
# despine last: it returns None, so the cell no longer echoes a Text(...) repr
sns.despine(ax=ax)

In [None]:
# quartiles of the selected column: 25th, 50th (median) and 75th percentile
Q1, Q2, Q3 = (np.quantile(series, q) for q in (.25, .50, .75))
# bare expression so the notebook displays the formatted summary string
'Quantiles : 25 % : {0}, 50 % : {1}, 75 % : {2}'.format(Q1, Q2, Q3)

In [None]:
# Side-by-side view of the column: boxplot (left) and histogram with a
# kernel density estimate (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))

# Vertical boxplot. With seaborn's default whis=1.5 the whiskers reach the most
# extreme data points that still lie within Q1 - 1.5*IQR and Q3 + 1.5*IQR;
# anything beyond is drawn as an individual outlier marker.
# `y=` makes the orientation explicit instead of relying on a positional argument.
sns.boxplot(y=series, ax=ax1)
ax1.set_ylabel(display_name, fontsize=15)
ax1.tick_params(labelsize=15)

# `sns.distplot` is deprecated (seaborn 0.11) and scheduled for removal;
# `histplot(kde=True, stat='density')` is the documented replacement and keeps
# the y-axis on a density scale. Requires seaborn >= 0.11.
sns.histplot(x=series, kde=True, stat='density', ax=ax2)
sns.despine(ax=ax2)
ax2.set_xlabel(display_name, fontsize=15)
# the axis shows probability density, not a percentage
ax2.set_ylabel('Density', fontsize=15)
ax2.tick_params(labelsize=15)

# extra horizontal room so the two panels' labels do not collide
plt.subplots_adjust(wspace=0.5)

In [None]:
# outliers via the 1.5 * IQR rule
IQR = Q3 - Q1
# fences 1.5 IQR beyond the quartiles, clamped to the observed data range;
# this mirrors the whisker rule used by the boxplot above
lower_limit = max(Q1 - 1.5*IQR, min_value)
upper_limit = min(Q3 + 1.5*IQR, max_value)
print('Outlier Range : [{0},{1}]'.format(lower_limit, upper_limit))

# entries strictly outside [lower_limit, upper_limit]; .count() ignores missing values
is_outlier = ~series.between(lower_limit, upper_limit)
outlier_count = series[is_outlier].count()
print('Number of outliers: {0}'.format(outlier_count))

# share of outliers relative to the total number of rows, in percent
outlier_pct = round(outlier_count*100/df_count, 2)
print('PercentageOutliers are: {0} %'.format(outlier_pct))

In [None]:
# One-panel pair plot: 'age' on the x-axis against 'job' on the y-axis,
# with points coloured by the 'y' column.
sns.pairplot(
    data=df,
    x_vars=['age'],
    y_vars=['job'],
    hue="y",
    height=10,
)