
# Pandasをインポート

In [14]:
import pandas as pd

# 銀行のマーケティングデータをインポート

In [15]:
pip install ucimlrepo



In [16]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 222 ("Bank Marketing") from the UCI ML Repository.
bank_marketing = fetch_ucirepo(id=222)

# Split the payload into the feature table (X) and the target table (y).
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Print the dataset metadata, then the per-variable information.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# X（特徴量）とy（ターゲットデータ）を統合

In [17]:
# Join the features and the target side by side into a single frame.
# axis="columns" (same as axis=1) appends columns, i.e. horizontal concatenation;
# axis="index" (same as axis=0) would append rows, i.e. vertical concatenation.

df = pd.concat(objs=[X, y], axis="columns")

In [21]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


# GROUP BYで集計

In [22]:
# Number of rows per education level.
# NOTE(review): rows with a missing education appear to be left out of these
# counts (groupby drops NaN keys by default) — confirm this is intended.
(
    df
    .groupby("education")
    .size()
)

Unnamed: 0_level_0,0
education,Unnamed: 1_level_1
primary,6851
secondary,23202
tertiary,13301


In [23]:
# Attempt a per-education mean over every column.
# This fails on purpose: the frame mixes string (object) and integer columns, and
# mean() cannot aggregate the string ones. The TypeError is caught and printed so
# the demonstration survives "Restart Kernel -> Run All".
try:
    df.groupby('education').mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: agg function failed [how->mean,dtype->object]

In [24]:
# Restrict the mean to numeric columns so the string columns are skipped.
(
    df
    .groupby("education")
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,age,balance,day_of_week,duration,campaign,pdays,previous
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
primary,45.865567,1250.949934,15.421398,255.933002,2.834331,36.08174,0.489272
secondary,39.96427,1154.880786,15.759159,258.68576,2.700802,42.353504,0.567753
tertiary,39.59364,1758.416435,16.086535,258.518532,2.825577,39.024134,0.66198


# 職業ごとに集計

In [25]:
# Number of rows per job.
(
    df
    .groupby("job")
    .size()
)

Unnamed: 0_level_0,0
job,Unnamed: 1_level_1
admin.,5171
blue-collar,9732
entrepreneur,1487
housemaid,1240
management,9458
retired,2264
self-employed,1579
services,4154
student,938
technician,7597


In [31]:
# Per-job mean of every numeric column.
df.groupby(by="job").mean(numeric_only=True)

Unnamed: 0_level_0,age,balance,day_of_week,duration,campaign,pdays,previous
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
admin.,39.289886,1135.838909,15.564301,246.896732,2.575324,47.859021,0.67163
blue-collar,40.044081,1078.826654,15.442561,262.901562,2.816995,44.033498,0.505138
entrepreneur,42.190989,1521.470074,15.702085,256.309348,2.799597,32.486214,0.478144
housemaid,46.415323,1392.395161,16.002419,245.825,2.820968,21.505645,0.371774
management,40.449567,1763.616832,16.114189,253.995771,2.864348,38.665468,0.668006
retired,61.626767,1984.215106,15.439488,287.361307,2.346731,37.443905,0.638693
self-employed,40.484484,1647.970868,16.027866,268.157061,2.853072,34.747308,0.551615
services,38.74025,997.088108,15.635532,259.318729,2.718344,41.995185,0.501204
student,26.542644,1388.060768,14.897655,246.656716,2.299574,57.041578,0.953092
technician,39.314598,1252.632092,16.408582,252.904962,2.906805,37.195077,0.574569


In [34]:
# Goal: mean age per job. This is NOT the way to get it — the argument of
# mean() configures how the mean is computed; it does not select a column.
# NOTE(review): the output matches mean(numeric_only=True), presumably because
# 'age' is taken as the first positional parameter and is truthy — confirm.

df.groupby('job').mean('age')

Unnamed: 0_level_0,age,balance,day_of_week,duration,campaign,pdays,previous
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
admin.,39.289886,1135.838909,15.564301,246.896732,2.575324,47.859021,0.67163
blue-collar,40.044081,1078.826654,15.442561,262.901562,2.816995,44.033498,0.505138
entrepreneur,42.190989,1521.470074,15.702085,256.309348,2.799597,32.486214,0.478144
housemaid,46.415323,1392.395161,16.002419,245.825,2.820968,21.505645,0.371774
management,40.449567,1763.616832,16.114189,253.995771,2.864348,38.665468,0.668006
retired,61.626767,1984.215106,15.439488,287.361307,2.346731,37.443905,0.638693
self-employed,40.484484,1647.970868,16.027866,268.157061,2.853072,34.747308,0.551615
services,38.74025,997.088108,15.635532,259.318729,2.718344,41.995185,0.501204
student,26.542644,1388.060768,14.897655,246.656716,2.299574,57.041578,0.953092
technician,39.314598,1252.632092,16.408582,252.904962,2.906805,37.195077,0.574569


In [36]:
# Select the column first, then aggregate: mean age per job.

(
    df
    .groupby("job")["age"]
    .mean()
)

Unnamed: 0_level_0,age
job,Unnamed: 1_level_1
admin.,39.289886
blue-collar,40.044081
entrepreneur,42.190989
housemaid,46.415323
management,40.449567
retired,61.626767
self-employed,40.484484
services,38.74025
student,26.542644
technician,39.314598


In [39]:
# Attempt the campaign outcome rate per job.
# Two problems are demonstrated here: 'job' is already the grouping key, so it
# does not need to be selected again, and mean() cannot aggregate string columns.
# The resulting TypeError is caught and printed so "Run All" keeps going.
try:
    df.groupby('job')[['job', 'y']].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: agg function failed [how->mean,dtype->object]

In [41]:
# Selecting only 'y' is the right shape for the query, but it still fails:
# y holds the strings 'yes'/'no' (object dtype), so mean() raises TypeError.
# The error is caught and printed so "Restart Kernel -> Run All" keeps going.
try:
    df.groupby('job')['y'].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: agg function failed [how->mean,dtype->object]

In [42]:
# Check the dtype of the target column.
df.dtypes["y"]

dtype('O')

In [44]:
# List the dtype of every column.
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
default,object
balance,int64
housing,object
loan,object
contact,object
day_of_week,int64


In [45]:
# Subscription rate per job: encode y as 1 ('yes') / 0 ('no'), then average
# within each job.
# map() is used instead of replace() because replacing strings with integers on
# an object column triggers pandas' downcasting FutureWarning. Unlike replace(),
# map() turns any value other than 'yes'/'no' into NaN, which mean() ignores.
df['y'].map({'yes': 1, 'no': 0}).groupby(df['job']).mean()

  df['y'].replace({'yes': 1, 'no': 0}).groupby(df['job']).mean()


Unnamed: 0_level_0,y
job,Unnamed: 1_level_1
admin.,0.122027
blue-collar,0.07275
entrepreneur,0.082717
housemaid,0.087903
management,0.137556
retired,0.227915
self-employed,0.118429
services,0.08883
student,0.28678
technician,0.11057
