In [51]:
# 16回目 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

def one_hot_encoding(data):
    '''
    ArithmeticErrorカテゴリーをOne-Hotエンコーディング
    '''
    data_categories = data[['sex','smoker','region']]
    ohe = OneHotEncoder(sparse=False, categories='auto',dtype=int)
    sl_np = ohe.fit_transform(data_categories)
    # データを作り替える
    sl_df = pd.DataFrame(data = sl_np, columns = ["sex_female","sex_male","smoker_no","smoker_yes","region_southeast","region_southwest","region_northwest","region_northeast"])
    return pd.concat([data.drop(['sex','smoker','region'], axis=1), sl_df], axis=1)

# trainを変換
train_feature = one_hot_encoding(train)
# testの方も変換
test_feature = one_hot_encoding(test)

target_columns = ['age', 'bmi', 'sex_female', 'sex_male', 'smoker_no', 'smoker_yes','region_northwest']
# 特徴量と目的変数の分割
X = train_feature[target_columns]
y = train_feature['charges']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# モデルの学習
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# 評価用データの予測
X_test = test_feature[target_columns]
y_pred = model.predict(X_test)

# 予測結果の表示
# print(y_pred)

# 正解率の計算
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
# クラスごとのF1-scoreの平均
print("macro-F1:", f1_score(y_val, y_pred, average='macro'))

# マクロ平均のF1スコアを取得
'''
precision: 適合率
モデルが ポジティブ(真)と予測した数を分母，その中で実際に正解した数を分子にした値
高い適合率は、予測が正確であることを示す

recall: 再現率
正解データ中の真の数を分母，その中でモデルが正解した数を分子にした値
高い再現率は、モデルがポジティブ(Positive)なデータを見逃さないことを示す

f1-score: F値
precisionとrecallの調和平均

support: 再現率
正解データに含まれている個数

accuracy: 正解率
全体の予測の正確さを示す
'''
report = classification_report(y_val, y_pred, digits=3)
print(report)

# 結果をCSVファイルとして保存
sample[1] = y_pred
sample.to_csv("submission16.csv",index=None,header=None)

Accuracy: 0.69
macro-F1: 0.3505198012060428
              precision    recall  f1-score   support

           0      0.802     0.837     0.819       319
           1      0.206     0.149     0.173        47
           2      0.061     0.059     0.060        34

    accuracy                          0.690       400
   macro avg      0.356     0.348     0.351       400
weighted avg      0.669     0.690     0.679       400



In [45]:
# 17回目 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

def one_hot_encoding(data):
    '''
    ArithmeticErrorカテゴリーをOne-Hotエンコーディング
    '''
    data_categories = data[['sex','smoker','region']]
    ohe = OneHotEncoder(sparse=False, categories='auto',dtype=int)
    sl_np = ohe.fit_transform(data_categories)
    # データを作り替える
    sl_df = pd.DataFrame(data = sl_np, columns = ["sex_female","sex_male","smoker_no","smoker_yes","region_southeast","region_southwest","region_northwest","region_northeast"])
    return pd.concat([data.drop(['sex','smoker','region'], axis=1), sl_df], axis=1)

# trainを変換
train_feature = one_hot_encoding(train)
# testの方も変換
test_feature = one_hot_encoding(test)

# BMIに非線形変換を適用
train_feature['bmi_transformed'] = np.sqrt(train_feature['bmi'])
test_feature['bmi_transformed'] = np.sqrt(test_feature['bmi'])

target_columns = ['age', 'bmi_transformed', 'sex_female', 'sex_male', 'smoker_no', 'smoker_yes','region_northwest']
# 特徴量と目的変数の分割
X = train_feature[target_columns]
y = train_feature['charges']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# モデルの学習
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# 評価用データの予測
X_test = test_feature[target_columns]
y_pred = model.predict(X_test)

# 予測結果の表示
# print(y_pred)

# 正解率の計算
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
# クラスごとのF1-scoreの平均
print("macro-F1:", f1_score(y_val, y_pred, average='macro'))

# マクロ平均のF1スコアを取得
report = classification_report(y_val, y_pred, digits=3)
print(report)

# 結果をCSVファイルとして保存
sample[1] = y_pred
sample.to_csv("submission17.csv",index=None,header=None)

Accuracy: 0.6925
macro-F1: 0.360450432662113
              precision    recall  f1-score   support

           0      0.804     0.837     0.820       319
           1      0.206     0.149     0.173        47
           2      0.088     0.088     0.088        34

    accuracy                          0.693       400
   macro avg      0.366     0.358     0.360       400
weighted avg      0.673     0.693     0.682       400



In [50]:
# 18回目 
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
import numpy as np

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

def one_hot_encoding(data):
    '''
    ArithmeticErrorカテゴリーをOne-Hotエンコーディング
    '''
    data_categories = data[['sex','smoker','region']]
    ohe = OneHotEncoder(sparse=False, categories='auto',dtype=int)
    sl_np = ohe.fit_transform(data_categories)
    # データを作り替える
    sl_df = pd.DataFrame(data = sl_np, columns = ["sex_female","sex_male","smoker_no","smoker_yes","region_southeast","region_southwest","region_northwest","region_northeast"])
    return pd.concat([data.drop(['sex','smoker','region'], axis=1), sl_df], axis=1)

# trainを変換
train_feature = one_hot_encoding(train)
# testの方も変換
test_feature = one_hot_encoding(test)

# BMIに非線形変換を適用
train_feature['bmi_transformed'] = np.sqrt(train_feature['bmi'])
test_feature['bmi_transformed'] = np.sqrt(test_feature['bmi'])

target_columns = ['age', 'bmi_transformed', 'sex_female', 'sex_male', 'smoker_no', 'children', 'smoker_yes','region_northwest']
# 特徴量と目的変数の分割
X = train_feature[target_columns]
y = train_feature['charges']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# 各分類器を定義
gb_model = GradientBoostingClassifier(n_estimators=1000, random_state=42)
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# アンサンブル学習モデルの定義
ensemble_model = VotingClassifier(estimators=[('gb', gb_model), ('lr', lr_model)], voting='hard')

# アンサンブル学習モデルの訓練
ensemble_model.fit(X_train, y_train)

# アンサンブル学習モデルの予測
X_test = test_feature[target_columns]
ensemble_predictions = ensemble_model.predict(X_test)
# print(ensemble_predictions)

# 正解率の計算
accuracy = accuracy_score(y_val, ensemble_predictions)
print("Accuracy:", accuracy)
# クラスごとのF1-scoreの平均 ※多分こっちが高い方がよさそう
print("macro-F1:", f1_score(y_val, ensemble_predictions, average='macro'))

# マクロ平均のF1スコアを取得
report = classification_report(y_val, ensemble_predictions, digits=3)
print(report)

# 結果をCSVファイルとして保存
sample[1] = ensemble_predictions
sample.to_csv("submission18.csv", index=None, header=None)

Accuracy: 0.7125
macro-F1: 0.3554155593603708
              precision    recall  f1-score   support

           0      0.803     0.868     0.834       319
           1      0.185     0.106     0.135        47
           2      0.107     0.088     0.097        34

    accuracy                          0.713       400
   macro avg      0.365     0.354     0.355       400
weighted avg      0.671     0.713     0.689       400



In [3]:
# 19回目 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

def one_hot_encoding(data):
    '''
    ArithmeticErrorカテゴリーをOne-Hotエンコーディング
    '''
    data_categories = data[['sex','smoker','region']]
    ohe = OneHotEncoder(sparse=False, categories='auto',dtype=int)
    sl_np = ohe.fit_transform(data_categories)
    # データを作り替える
    sl_df = pd.DataFrame(data = sl_np, columns = ["sex_female","sex_male","smoker_no","smoker_yes","region_southeast","region_southwest","region_northwest","region_northeast"])
    return pd.concat([data.drop(['sex','smoker','region'], axis=1), sl_df], axis=1)

# trainを変換
train_feature = one_hot_encoding(train)
# testの方も変換
test_feature = one_hot_encoding(test)

# BMIに非線形変換を適用
train_feature['bmi_transformed'] = np.sqrt(train_feature['bmi'])
test_feature['bmi_transformed'] = np.sqrt(test_feature['bmi'])

target_columns = ['age', 'bmi_transformed', 'sex_female', 'sex_male', 'smoker_no', 'children', 'smoker_yes','region_northwest']
# 特徴量と目的変数の分割
X = train_feature[target_columns]
y = train_feature['charges']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# モデルの学習
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# 評価用データの予測
X_test = test_feature[target_columns]
y_pred = model.predict(X_test)

# 予測結果の表示
# print(y_pred)

# 正解率の計算
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
# クラスごとのF1-scoreの平均
print("macro-F1:", f1_score(y_val, y_pred, average='macro'))

# マクロ平均のF1スコアを取得
report = classification_report(y_val, y_pred, digits=3)
print(report)

# 結果をCSVファイルとして保存
sample[1] = y_pred
sample.to_csv("submission19.csv",index=None,header=None)

Accuracy: 0.6925
macro-F1: 0.360450432662113
              precision    recall  f1-score   support

           0      0.804     0.837     0.820       319
           1      0.206     0.149     0.173        47
           2      0.088     0.088     0.088        34

    accuracy                          0.693       400
   macro avg      0.366     0.358     0.360       400
weighted avg      0.673     0.693     0.682       400



In [None]:
# 20回目 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

def one_hot_encoding(data):
    '''
    ArithmeticErrorカテゴリーをOne-Hotエンコーディング
    '''
    data_categories = data[['sex','smoker','region']]
    ohe = OneHotEncoder(sparse=False, categories='auto',dtype=int)
    sl_np = ohe.fit_transform(data_categories)
    # データを作り替える
    sl_df = pd.DataFrame(data = sl_np, columns = ["sex_female","sex_male","smoker_no","smoker_yes","region_southeast","region_southwest","region_northwest","region_northeast"])
    return pd.concat([data.drop(['sex','smoker','region'], axis=1), sl_df], axis=1)

# trainを変換
train_feature = one_hot_encoding(train)
# testの方も変換
test_feature = one_hot_encoding(test)

# BMIに非線形変換を適用
train_feature['bmi_transformed'] = np.sqrt(train_feature['bmi'])
test_feature['bmi_transformed'] = np.sqrt(test_feature['bmi'])

target_columns = ['age', 'bmi_transformed', 'sex_female', 'sex_male', 'smoker_no', 'children', 'smoker_yes','region_northwest']
# 特徴量と目的変数の分割
X = train_feature[target_columns]
y = train_feature['charges']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# モデルの学習
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# 評価用データの予測
X_test = test_feature[target_columns]
y_pred = model.predict(X_test)

# 予測結果の表示
# print(y_pred)

# 正解率の計算
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
# クラスごとのF1-scoreの平均
print("macro-F1:", f1_score(y_val, y_pred, average='macro'))

# マクロ平均のF1スコアを取得
report = classification_report(y_val, y_pred, digits=3)
print(report)

# 結果をCSVファイルとして保存
sample[1] = y_pred
sample.to_csv("submission20.csv",index=None,header=None)