
# Pandasをインポート

In [1]:
import pandas as pd

# 銀行のマーケティングデータをインポート

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Bank Marketing" dataset (repository id 222).
bank_marketing = fetch_ucirepo(id=222)

# Feature columns and target column(s), each exposed as a pandas DataFrame.
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Dump the dataset-level metadata first, then the per-variable description,
# one after the other on separate lines.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# X（特徴量）とy（ターゲットデータ）を統合

In [4]:
# Put the features (X) and the target (y) side by side in a single frame.
#   axis=0 / "index"   -> concatenate along rows    (stack vertically)
#   axis=1 / "columns" -> concatenate along columns (join horizontally)
df = pd.concat([X, y], axis="columns")

In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


# GROUP BYで集計

In [12]:
# Number of rows in each `job` group.
df.groupby(by="job").size()

Unnamed: 0_level_0,0
job,Unnamed: 1_level_1
admin.,5171
blue-collar,9732
entrepreneur,1487
housemaid,1240
management,9458
retired,2264
self-employed,1579
services,4154
student,938
technician,7597


In [14]:
# Frequency of each `job` value, most common first (defaults spelled out explicitly).
df["job"].value_counts(sort=True, ascending=False, dropna=True)

Unnamed: 0_level_0,count
job,Unnamed: 1_level_1
blue-collar,9732
management,9458
technician,7597
admin.,5171
services,4154
retired,2264
self-employed,1579
entrepreneur,1487
unemployed,1303
housemaid,1240


In [15]:
# Demonstration of a common mistake: on a pandas Series, `size` is a property that
# holds a plain int (the number of elements), not a method. Writing `.size()` therefore
# tries to call that int and raises a TypeError. The correct spelling is `df['job'].size`.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    df['job'].size()
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: 'int' object is not callable

In [17]:
# Contingency table of counts: one row per `job`, one column per value of the target `y`.
df_crosstab = pd.crosstab(index=df["job"], columns=df["y"])
df_crosstab

y,no,yes
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,4540,631
blue-collar,9024,708
entrepreneur,1364,123
housemaid,1131,109
management,8157,1301
retired,1748,516
self-employed,1392,187
services,3785,369
student,669,269
technician,6757,840


# 特定の項目値をクロス集計

In [19]:
# Cross-tabulate two specific columns: count the rows for every (job, education) pair,
# with `job` down the rows and `education` across the columns.
df_pivot = pd.pivot_table(
    df,
    index="job",
    columns="education",
    aggfunc="size",
)
df_pivot

education,primary,secondary,tertiary
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
admin.,209,4219,572
blue-collar,3758,5371,149
entrepreneur,183,542,686
housemaid,627,395,173
management,294,1121,7801
retired,795,984,366
self-employed,130,577,833
services,345,3457,202
student,44,508,223
technician,158,5229,1968
