# Pandasをインポート

In [24]:
import pandas as pd

# 銀行のマーケティングデータをインポート

In [25]:
pip install ucimlrepo



In [26]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 222 through the ucimlrepo client.
bank_marketing = fetch_ucirepo(id=222)

# Bind the feature table to X and the target table to y in one statement.
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Print the dataset-level metadata, then the per-variable information,
# each on its own line.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# 特徴量をPandasのDataFrameに取り込む

In [27]:
# The features are already delivered as a pandas DataFrame, so take an
# explicit copy instead of re-wrapping them: pd.DataFrame(X) does not copy a
# DataFrame input by default and may share X's underlying data, whereas
# .copy() gives the notebook an independent working frame.
df = X.copy()

# 基本の統計量を確認

In [28]:
# Preview the first five rows (n=5 is head()'s default, spelled out here).
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [29]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
# Only the numeric columns appear in the resulting table.
df.describe()

Unnamed: 0,age,balance,day_of_week,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [30]:
# Boolean mask with the same shape as df: True wherever a value is missing.
# isna() is the method that isnull() aliases, so the result is identical.
df.isna()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True
3,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True
4,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
45207,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
45208,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
45209,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [31]:
# Number of missing values in each column: summing the boolean mask counts
# every True as 1.
df.isna().sum()

Unnamed: 0,0
age,0
job,288
marital,0
education,1857
default,0
balance,0
housing,0
loan,0
contact,13020
day_of_week,0


# X（特徴量）とy（ターゲットデータ）を統合

In [32]:
# pd.concat joins the given frames along the requested axis:
#   axis=0 / "index"   -> row direction, frames are stacked vertically
#   axis=1 / "columns" -> column direction, frames are placed side by side
# Here the target y is attached to the right of the features X.
df = pd.concat([X, y], axis="columns")

# 項目名を差し替える

In [33]:
# Japanese display labels, listed in the same order as df's columns because
# they are assigned positionally below. The trailing comment on each entry
# names the source column it replaces.
columns = [
    '年齢',  # age
    '職業',  # job
    '婚姻',  # marital
    '学歴',  # education
    '債務不履行',  # default
    '平均残高',  # balance
    '住宅ローン',  # housing
    '個人ローン',  # loan
    '連絡手段',  # contact
    '最終通話日',  # day_of_week
    '最終通話月',  # month
    '最終通話秒数',  # duration
    '通話回数_販促中',  # campaign
    '前回販促後_経過日数',  # pdays
    '通話回数_販促前',  # previous
    '前回販促結果',  # poutcome
    '今回販促結果',  # y (target)
]

# Relabel a deep copy so that df itself keeps its original column names.
df2 = df.copy(deep=True)
df2.columns = columns

# 列を入れ替える

In [34]:
# Snapshot df2's current column labels as a plain Python list and show it.
columns1 = [*df2.columns]
print(columns1)

['年齢', '職業', '婚姻', '学歴', '債務不履行', '平均残高', '住宅ローン', '個人ローン', '連絡手段', '最終通話日', '最終通話月', '最終通話秒数', '通話回数_販促中', '前回販促後_経過日数', '通話回数_販促前', '前回販促結果', '今回販促結果']


In [35]:
# Move the target column (the last label) to the front and keep only the
# leading feature columns: with the 17 labels in columns1, [:-9] drops the
# final nine and leaves the first eight.
# Both slices are taken from columns1 so the result does not depend on the
# separate `columns` variable, which a later cell rebinds to other values.
columns2 = columns1[-1:] + columns1[:-9]
print(columns2)

['今回販促結果', '年齢', '職業', '婚姻', '学歴', '債務不履行', '平均残高', '住宅ローン', '個人ローン']


In [39]:
# Small sample list used by the slicing examples in the following cells.
# NOTE: this rebinds `columns`, which until now held the DataFrame column labels.
columns = list('ABCDE')

In [40]:
# A slice with both bounds omitted selects every element of the list.
print(columns[:])

['A', 'B', 'C', 'D', 'E']


In [41]:
# From index 2 through the end of the list.
columns[2:]

['C', 'D', 'E']

In [42]:
# Everything before index 2, i.e. the first two elements.
columns[:2]

['A', 'B']

In [43]:
# A negative start counts from the end: the last two elements.
columns[-2:]

['D', 'E']

In [44]:
# A negative stop counts from the end: everything except the last two elements.
columns[:-2]

['A', 'B', 'C']

# 並べ替えた列（columns2）だけを選んでDataFrameをまとめ直す

In [45]:
# Keep only the columns named in columns2, in that order.
# NOTE: this rebinds df2, so df2 no longer carries the remaining columns.
df2 = df2[columns2]

In [46]:
# Preview the first five rows of the reordered frame (n=5 is the default).
df2.head(n=5)

Unnamed: 0,今回販促結果,年齢,職業,婚姻,学歴,債務不履行,平均残高,住宅ローン,個人ローン
0,no,58,management,married,tertiary,no,2143,yes,no
1,no,44,technician,single,secondary,no,29,yes,no
2,no,33,entrepreneur,married,secondary,no,2,yes,yes
3,no,47,blue-collar,married,,no,1506,yes,no
4,no,33,,single,,no,1,no,no
