# IRIS 데이터를 이용한 예측 모델 실습

# 학습 데이터 세팅하기

In [4]:
import pandas as pd

In [5]:
iris = pd.read_csv('iris_nan_sample.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,,1.4,0.2,setosa


In [6]:
iris.isnull().sum()

sepal_length     0
sepal_width     28
petal_length     0
petal_width      0
species          0
dtype: int64

In [None]:
# 종속변수 : sepal_width
# 독립변수 : species와 sepal_width를 제외한 나머지 변수

In [7]:
iris_x = iris[['sepal_length', 'petal_length', 'petal_width']].copy()
iris_y = iris['sepal_width'].copy()

In [14]:
# 학습용 / 예측용
nan_idx = iris.index[iris['sepal_width'].isnull()]

In [15]:
not_nan_idx = iris.index[~iris['sepal_width'].isnull()]

In [22]:
train_x = iris_x.iloc[not_nan_idx]
train_y = iris_y.iloc[not_nan_idx]

pred_x = iris_x.iloc[nan_idx]

In [23]:
iris_y.iloc[nan_idx]

4     NaN
5     NaN
7     NaN
10    NaN
17    NaN
20    NaN
26    NaN
30    NaN
34    NaN
38    NaN
45    NaN
46    NaN
78    NaN
82    NaN
83    NaN
85    NaN
87    NaN
91    NaN
92    NaN
93    NaN
96    NaN
104   NaN
117   NaN
122   NaN
123   NaN
129   NaN
131   NaN
133   NaN
Name: sepal_width, dtype: float64

In [18]:
train_x = iris.iloc[not_nan_idx][['sepal_length', 'petal_length', 'petal_width']].copy()
train_y = iris.iloc[not_nan_idx]['sepal_width'].copy()

In [20]:
pred_x = iris.iloc[nan_idx][['sepal_length', 'petal_length', 'petal_width']].copy()
pred_x.shape

(28, 3)

## 특정 속성의 결측치 예측

- 사용데이터 : train_x, train_y, pred_x
- 예측 모델 : 특정 속성의 결측치 예측
    - LinearRegression
    - Decision Tree Regressor
    - RandomForestRegressor
    - XGBoostRegressor

- 결정계수, MSE 값 비교하여 가장 성능이 좋은 모델 확인
- {'linear_reg':[0000, 0000]}

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [26]:
linear_reg = LinearRegression()
dt_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
xgb_reg = XGBRegressor()

In [27]:
from sklearn.metrics import mean_squared_error

In [29]:
answer = pd.read_csv('iris_answer.csv')
answer.drop('Unnamed: 0', axis=1, inplace=True)

In [30]:
answer.head()

Unnamed: 0,answer
0,3.6
1,3.9
2,3.4
3,3.7
4,3.5


In [31]:
train_y

0      3.5
1      3.0
2      3.2
3      3.1
6      3.4
8      2.9
9      3.1
11     3.4
12     3.0
13     3.0
14     4.0
15     4.4
16     3.9
18     3.8
19     3.8
21     3.7
22     3.6
23     3.3
24     3.4
25     3.0
27     3.5
28     3.4
29     3.2
31     3.4
32     4.1
33     4.2
35     3.2
36     3.5
37     3.6
39     3.4
      ... 
114    2.8
115    3.2
116    3.0
118    2.6
119    2.2
120    3.2
121    2.8
124    3.3
125    3.2
126    2.8
127    3.0
128    2.8
130    2.8
132    2.8
134    2.6
135    3.0
136    3.4
137    3.1
138    3.0
139    3.1
140    3.1
141    3.1
142    2.7
143    3.2
144    3.3
145    3.0
146    2.5
147    3.0
148    3.4
149    3.0
Name: sepal_width, Length: 122, dtype: float64

In [33]:
true_y = answer['answer']

In [34]:
models = [linear_reg,dt_reg,rf_reg,xgb_reg]
result = {}
for m, l in zip(models, ['linear_reg','dt_reg','rf_reg','xgb_reg']):
    tmp = []
    m.fit(train_x, train_y)
    r2_score = m.score(train_x, train_y)
#     result[l] = [r2_score]
    pred_y = m.predict(pred_x)
    mse = mean_squared_error(true_y, pred_y)
#     result[l].append(mse)
    tmp.extend([r2_score, mse])
    result[l] = tmp



  if getattr(data, 'base', None) is not None and \


In [36]:
df = pd.DataFrame(result, index=['R2','MSE'])

In [40]:
df = df.T

In [42]:
df[df['R2'] >= 0.7].sort_values('MSE')

Unnamed: 0,R2,MSE
xgb_reg,0.870814,0.098475
rf_reg,0.900534,0.103697
dt_reg,0.999115,0.127857
