In [1]:
import pandas as pd
train = pd.read_csv("train.csv",index_col=0)
test = pd.read_csv("test.csv",index_col=0)
sample_submit = pd.read_csv("sample_submit.csv",index_col=0,header=None)

In [2]:
train.head()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,region,charges
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26,male,32.665465,3,no,southeast,0
1,41,male,29.798725,1,no,southwest,0
2,28,male,32.722029,0,yes,northwest,1
3,20,female,38.429831,2,no,southeast,0
4,45,female,29.641854,1,no,northwest,0


In [3]:
test.head()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,region
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,19,male,24.365178,1,no,northeast
23,59,male,33.997763,0,yes,northeast
27,42,female,29.28345,0,no,southwest
28,30,male,24.903725,1,no,northwest
29,41,female,29.644536,0,no,southwest


In [4]:
print(train.shape)
print(test.shape)

(1600, 7)
(400, 6)


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1600 non-null   int64  
 1   sex       1600 non-null   object 
 2   bmi       1600 non-null   float64
 3   children  1600 non-null   int64  
 4   smoker    1600 non-null   object 
 5   region    1600 non-null   object 
 6   charges   1600 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 100.0+ KB


In [6]:
train['charges'].value_counts()

0    1256
1     198
2     146
Name: charges, dtype: int64

In [7]:
train.describe()

Unnamed: 0,age,bmi,children,charges
count,1600.0,1600.0,1600.0,1600.0
mean,38.985,32.424376,1.014375,0.30625
std,13.555012,5.766915,1.259031,0.628656
min,18.0,20.627626,0.0,0.0
25%,27.0,28.634267,0.0,0.0
50%,40.0,32.268786,1.0,0.0
75%,50.0,37.069581,2.0,0.0
max,64.0,47.290644,5.0,2.0


In [8]:
train[['sex','charges']].groupby(['sex']).mean()

Unnamed: 0_level_0,charges
sex,Unnamed: 1_level_1
female,0.227931
male,0.376932


In [9]:
train[['smoker','charges']].groupby(['smoker']).mean()

Unnamed: 0_level_0,charges
smoker,Unnamed: 1_level_1
no,0.070579
yes,1.182891


In [10]:
train[['region','charges']].groupby(['region']).mean()

Unnamed: 0_level_0,charges
region,Unnamed: 1_level_1
northeast,0.277778
northwest,0.230769
southeast,0.465686
southwest,0.245333


In [11]:
data = pd.concat([train,test], sort=False)

In [12]:
data['sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['smoker'].replace(['no', 'yes'], [0, 1], inplace=True)
data['region'] = data['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}).astype(int)

In [13]:
data.head(10)

Unnamed: 0_level_0,age,sex,bmi,children,smoker,region,charges
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26,0,32.665465,3,0,2,0.0
1,41,0,29.798725,1,0,3,0.0
2,28,0,32.722029,0,1,1,1.0
3,20,1,38.429831,2,0,2,0.0
4,45,1,29.641854,1,0,1,0.0
5,20,0,37.785937,2,1,2,1.0
6,37,0,43.393374,2,0,0,0.0
7,41,1,32.280407,1,0,0,0.0
8,40,0,28.865828,1,0,0,0.0
9,54,1,33.870887,2,0,0,0.0


In [14]:
data.shape

(2000, 7)

In [15]:
data.reset_index(drop=True, inplace=True)
train = data.loc[:(len(train)-1)]
test = data.loc[len(train):]

In [16]:
y_train = train['charges']
X_train = train.drop('charges', axis=1)
X_test = test.drop('charges', axis=1)

In [17]:
X_train.shape

(1600, 6)

In [18]:
X_test.shape

(400, 6)

In [19]:
from sklearn.model_selection import train_test_split


X_train, X_valid, y_train, y_valid = \
    train_test_split(X_train, y_train, test_size=0.25,
                                 random_state=0, stratify=y_train)

In [20]:
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

max_score = 0
RFC_grid = {RandomForestClassifier(): {"n_estimators": [i for i in range(1, 101)],
                                       "criterion": ["gini", "entropy","log_loss"],
                                       "max_depth":[i for i in range(1, 10)],
                                      }}
for model,param in tqdm(RFC_grid.items()):
    clf = GridSearchCV(model,param)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    score = f1_score(y_valid,y_pred,average='macro')
    if max_score < score:
        max_score = score
        best_param = clf.best_params_
        best_model = model.__class__.__name__

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [15:40<00:00, 940.35s/it]


In [21]:
print("ベストスコア:{}".format(max_score))
print("モデル:{}".format(best_model))
print("パラメーター:{}".format(best_param))

ベストスコア:0.6969487030974834
モデル:RandomForestClassifier
パラメーター:{'criterion': 'log_loss', 'max_depth': 8, 'n_estimators': 96}


In [22]:
y_pred = clf.predict(X_test)
y_pred.shape

(400,)

In [23]:
sample_submit = pd.read_csv("sample_submit.csv",header=None, names=['id','charges'])

In [24]:
sample_submit['charges'] = y_pred

In [25]:
sample_submit['charges'] = sample_submit['charges'].astype('int')


In [26]:
pd.DataFrame(sample_submit).to_csv('randomforest3.csv', index=False, header=False)