In [1]:
import pandas as pd
import numpy as np
from pomegranate import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from matplotlib import pylab as plt
from IPython.display import Image

In [102]:
df = pd.read_parquet('merged_data')

In [103]:
data = df[[col for col in df.columns if col not in ['Description', 'Name', 'PetID', 'RescuerID']]]

In [104]:
data.columns

Index(['AdoptionSpeed', 'Age', 'Dewormed', 'Fee', 'FurLength', 'Gender',
       'Health', 'MaturitySize', 'PhotoAmt', 'Quantity', 'Sterilized', 'Type',
       'Vaccinated', 'VideoAmt', 'is_train', 'ColorName1', 'ColorName2',
       'ColorName3', 'BreedName1', 'BreedName2', 'StateName'],
      dtype='object')

In [105]:
#bayesian net은 이산형 변수만 다루어서 연속형 변수를 binning 하는 작업이 필요하다
#quantile 정보를 활용하는게 쉬운 방법

In [106]:
data.Age.describe()

count    18941.000000
mean        10.683966
std         18.247672
min          0.000000
25%          2.000000
50%          3.000000
75%         12.000000
max        255.000000
Name: Age, dtype: float64

In [107]:
data.Fee.describe()

count    18941.000000
mean        22.528589
std         80.637133
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       3000.000000
Name: Fee, dtype: float64

In [108]:
data.PhotoAmt.describe()

count    18941.000000
mean         3.872604
std          3.517535
min          0.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         30.000000
Name: PhotoAmt, dtype: float64

In [109]:
data.VideoAmt.describe()

count    18941.000000
mean         0.058022
std          0.356063
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          9.000000
Name: VideoAmt, dtype: float64

In [110]:
bins_age = [-np.Inf, 2,3,12,255]
bins_photoAmt = [-np.Inf, 2, 3, 5, 10, 20, 30]
bins_videoAmt = [-np.Inf, 2, 3, 5, 10, 20, 30]

In [111]:
data = data.assign(Age=pd.cut(data.Age, bins_age, labels=['infancy', 'young', 'old', 'very old']))

In [112]:
data = data.assign(PhotoAmt = pd.cut(data.PhotoAmt, bins_photoAmt))


In [113]:
data = data.assign(Fee = data.Fee > 0)

In [114]:
data.head()

Unnamed: 0_level_0,AdoptionSpeed,Age,Dewormed,Fee,FurLength,Gender,Health,MaturitySize,PhotoAmt,Quantity,...,Type,Vaccinated,VideoAmt,is_train,ColorName1,ColorName2,ColorName3,BreedName1,BreedName2,StateName
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,young,No,True,1,Male,1,1,"(-inf, 2.0]",1,...,Cat,No,0,True,Black,White,,Tabby,,Selangor
1,0,infancy,Not Sure,False,2,Male,1,2,"(-inf, 2.0]",1,...,Cat,Not Sure,0,True,Black,Brown,,Domestic Medium Hair,,Kuala Lumpur
2,3,infancy,Yes,False,2,Male,1,2,"(5.0, 10.0]",1,...,Dog,Yes,0,True,Brown,White,,Mixed Breed,,Selangor
3,2,old,Yes,True,1,Female,1,2,"(5.0, 10.0]",1,...,Dog,Yes,0,True,Black,Brown,,Mixed Breed,,Kuala Lumpur
4,2,infancy,No,False,1,Male,1,2,"(2.0, 3.0]",1,...,Dog,No,0,True,Black,,,Mixed Breed,,Selangor


In [115]:
from collections import defaultdict
d = defaultdict(LabelEncoder)

In [116]:
data = data.assign(ColorName1 = data.ColorName1.astype(np.str))
data = data.assign(ColorName2 = data.ColorName2.astype(np.str))
data = data.assign(ColorName3 = data.ColorName3.astype(np.str))
data = data.assign(BreedName1 = data.BreedName1.astype(np.str))
data = data.assign(BreedName2 = data.BreedName2.astype(np.str))

In [117]:
fit = data.select_dtypes(exclude=['int']).apply(lambda x: d[x.name].fit_transform(x))

In [118]:
data_prepared = pd.concat([fit, data.select_dtypes(include=['int'])], axis=1)

In [119]:
data_prepared.head()

Unnamed: 0_level_0,Age,Dewormed,Fee,Gender,PhotoAmt,Sterilized,Type,Vaccinated,is_train,ColorName1,...,ColorName3,BreedName1,BreedName2,StateName,AdoptionSpeed,FurLength,Health,MaturitySize,Quantity,VideoAmt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,0,1,1,0,0,0,0,1,0,...,3,168,89,12,2,1,1,1,1,0
1,0,1,0,1,0,1,0,1,1,0,...,3,61,89,3,0,2,1,2,1,0
2,0,2,0,1,3,0,1,2,1,1,...,3,114,89,12,3,2,1,2,1,0
3,1,2,1,0,3,0,1,2,1,0,...,3,114,89,3,2,1,1,2,1,0
4,0,0,0,1,1,0,1,0,1,0,...,3,114,89,12,2,1,1,2,1,0


In [120]:
data_train = data_prepared.loc[data_prepared.is_train == 1]
data_test = data_prepared.loc[~(data_prepared.is_train == 1)]


In [121]:
y = data_train.AdoptionSpeed
X = data_train.drop(columns=['AdoptionSpeed'])

In [122]:
data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [123]:
data_train = pd.concat([data_train, y_train], axis=1)

In [124]:
data_test = pd.concat([data_test, y_test], axis=1)

In [125]:
data_train.drop(columns=['is_train'], inplace=True)
data_test.drop(columns=['is_train'], inplace=True)

In [126]:
X = data_train.values

In [127]:
import pygraphviz

In [128]:
#model = BayesianNetwork.from_samples(X)
model = BayesianNetwork.from_samples(X, algorithm='chow-liu')


In [129]:
#model.plot("BNmodel.png")
model.structure

((),
 (7,),
 (11,),
 (17,),
 (11,),
 (0,),
 (11,),
 (5,),
 (9,),
 (11,),
 (9,),
 (0,),
 (11,),
 (11,),
 (11,),
 (11,),
 (11,),
 (10,),
 (4,),
 (11,))

In [141]:
model.plot("BNmodel.png")

ValueError: must have pygraphviz installed for visualization

In [44]:
Image("BNmodel.png")


TypeError: a bytes-like object is required, not 'str'

TypeError: a bytes-like object is required, not 'str'

<IPython.core.display.Image object>

In [131]:
data_train.columns[4]

'PhotoAmt'

In [132]:
data_test = data_test.assign(AdoptionSpeed = None)

In [139]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4948 entries, 13408 to 655
Data columns (total 20 columns):
Age              4948 non-null int64
Dewormed         4948 non-null int64
Fee              4948 non-null int64
Gender           4948 non-null int64
PhotoAmt         4948 non-null int64
Sterilized       4948 non-null int64
Type             4948 non-null int64
Vaccinated       4948 non-null int64
ColorName1       4948 non-null int64
ColorName2       4948 non-null int64
ColorName3       4948 non-null int64
BreedName1       4948 non-null int64
BreedName2       4948 non-null int64
StateName        4948 non-null int64
FurLength        4948 non-null int64
Health           4948 non-null int64
MaturitySize     4948 non-null int64
Quantity         4948 non-null int64
VideoAmt         4948 non-null int64
AdoptionSpeed    0 non-null object
dtypes: int64(19), object(1)
memory usage: 971.8+ KB


In [135]:
data_test.iloc[0,:].values


array([2, 0, 0, 0, 0, 2, 1, 2, 1, 5, 3, 114, 89, 12, 2, 1, 2, 1, 0, None],
      dtype=object)

In [140]:
result = model.predict(data_test.iloc[0,:].values)

TypeError: object of type 'numpy.int64' has no len()