# Pandasをインポート

In [14]:
import pandas as pd

# 銀行のマーケティングデータをインポート

In [15]:
pip install ucimlrepo



In [16]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset registered under id 222
# ("Bank Marketing" according to the metadata printed below).
bank_marketing = fetch_ucirepo(id=222)

# Feature table and target table (pandas DataFrames).
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Show the dataset metadata first, then the per-variable information.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# X（特徴量）とy（ターゲットデータ）を統合

In [17]:
# Join the feature columns and the target column side by side.
# axis="columns" (same as axis=1): concatenate along columns -> horizontal join
# axis="index"   (same as axis=0): concatenate along rows    -> vertical join
df = pd.concat(objs=[X, y], axis="columns")

In [25]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


# GROUP BYで集計

In [29]:
# Number of rows in each education group.
# groupby drops rows whose key is NaN by default (dropna=True), so rows with a
# missing education value are not counted here.
df_count = df.groupby(by="education").size()
display(df_count)

Unnamed: 0_level_0,0
education,Unnamed: 1_level_1
primary,6851
secondary,23202
tertiary,13301


In [30]:
# Mean of every numeric column within each education group;
# numeric_only=True leaves the non-numeric columns out of the result.
df_grl = (
    df
    .groupby(by="education")
    .mean(numeric_only=True)
)
display(df_grl)

Unnamed: 0_level_0,age,balance,day_of_week,duration,campaign,pdays,previous
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
primary,45.865567,1250.949934,15.421398,255.933002,2.834331,36.08174,0.489272
secondary,39.96427,1154.880786,15.759159,258.68576,2.700802,42.353504,0.567753
tertiary,39.59364,1758.416435,16.086535,258.518532,2.825577,39.024134,0.66198


In [31]:
# Per-education summary with a different set of statistics per column:
#   age      -> mean, min, max
#   balance  -> mean, std
#   duration -> mean
# The result has a two-level column index (column name, statistic).
df_stats = df.groupby(by="education").agg(
    dict(
        age=["mean", "min", "max"],
        balance=["mean", "std"],
        duration="mean",
    )
)
display(df_stats)

Unnamed: 0_level_0,age,age,age,balance,balance,duration
Unnamed: 0_level_1,mean,min,max,mean,std,mean
education,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
primary,45.865567,18,95,1250.949934,2690.743991,255.933002
secondary,39.96427,18,95,1154.880786,2558.256739,258.68576
tertiary,39.59364,20,89,1758.416435,3839.088305,258.518532
