# Insurance_Starter (Tutorial)
---


In [53]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("./Insurance/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='charges')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 7), (268, 7), (1070, 2), (268, 2))

In [54]:
# 제출용
id = y_test['id']

In [55]:
X_train.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [56]:
X_test.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [57]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 209 to 1140
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1070 non-null   int64  
 1   age       1070 non-null   int64  
 2   sex       1070 non-null   object 
 3   bmi       1070 non-null   float64
 4   children  1070 non-null   int64  
 5   smoker    1070 non-null   object 
 6   region    1070 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 66.9+ KB


In [58]:
col_cat = ['sex','smoker','region']
col_num = ['age','bmi','children']


In [59]:
X_train_cat = pd.get_dummies(X_train[col_cat])
X_train_cat

X_test_cat = pd.get_dummies(X_test[col_cat])
X_test_cat


Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1088,0,1,1,0,0,0,1,0
1157,1,0,1,0,0,1,0,0
1267,0,1,0,1,1,0,0,0
506,0,1,1,0,0,1,0,0
659,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
1275,0,1,1,0,0,0,0,1
88,1,0,1,0,0,1,0,0
646,0,1,1,0,0,1,0,0
654,1,0,1,0,0,0,1,0


In [60]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train['age'] = scaler.fit_transform(X_train[['age']])
X_train['bmi'] = scaler.fit_transform(X_train[['bmi']])
X_train['children'] = scaler.fit_transform(X_train[['children']])

X_test['age'] = scaler.fit_transform(X_test[['age']])
X_test['bmi'] = scaler.fit_transform(X_test[['bmi']])
X_test['children'] = scaler.fit_transform(X_test[['children']])


In [61]:
X_train = pd.concat([X_train[col_num],X_train_cat],axis=1)
X_train

X_test = pd.concat([X_test[col_num],X_test_cat],axis=1)
X_test

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1088,0.739130,0.958451,0.2,0,1,1,0,0,0,1,0
1157,0.108696,0.185395,0.4,1,0,1,0,0,1,0,0
1267,0.130435,0.433585,0.0,0,1,0,1,1,0,0,0
506,0.086957,0.442556,0.2,0,1,1,0,0,1,0,0
659,0.847826,0.361819,0.8,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1275,0.847826,0.201763,0.0,0,1,1,0,0,0,0,1
88,0.608696,0.328927,0.0,1,0,1,0,0,1,0,0
646,0.456522,0.281083,0.2,0,1,1,0,0,1,0,0
654,0.891304,0.563739,0.0,1,0,1,0,0,0,1,0


In [62]:
y_train = y_train['charges']
y_test = y_test['charges']

In [67]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)

AttributeError: 'RandomForestRegressor' object has no attribute 'mean_squared_error'

In [68]:
y_pred = model.predict(X_test)
np.sqrt(mean_squared_error(y_test,y_pred))

5140.464029794943

In [65]:
# 제출
submission = pd.DataFrame(data={'id':id,'charges':y_pred})
submission

Unnamed: 0,id,charges
1088,1088,12074.875868
1157,1157,5579.477750
1267,1267,34339.235870
506,506,2690.565747
659,659,16915.479243
...,...,...
1275,1275,11115.600209
88,88,13169.176725
646,646,6658.150912
654,654,16126.467049


In [66]:
submission.to_csv('submission_charges.csv',index=False)