In [272]:
# Logistic Regression plus Ensemble
# https://www.kaggle.com/code/itasps/0-89702-logistic-regression-plus-ensemble

# *rainfall.csvは別のコンペからもってきたもの

# EDA
import numpy as np
import pandas as pd

# model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter('ignore')

import os
# List every file under the data directory as a quick sanity check of the
# available inputs. The walk uses the relative './data' location, the same
# directory the CSV files are read from, rather than the absolute '/data'
# path, which silently lists nothing when that directory does not exist.
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [273]:
# Read the competition train/test CSVs (indexed by their 'id' column) and the
# additional rainfall.csv, which comes from a different competition.
DATA_PATH = './data/'
train_df, test_df = (
    pd.read_csv(DATA_PATH + csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)
train_extra_df = pd.read_csv(DATA_PATH + 'rainfall.csv')

In [274]:
# Show the first rows of each of the three frames, one rich table per frame.
display(train_df.head(), test_df.head(), train_extra_df.head())

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7


In [275]:
# <Preprocessing of the extra dataset>
# - strip the spaces embedded in the column names
# - encode the categorical 'rainfall' label as an integer (no -> 0, yes -> 1)
# - cast 'humidity' and 'cloud' to float
train_extra_df = (
    train_extra_df
    .rename(columns=lambda name: name.replace(' ', ''))
    .assign(rainfall=lambda frame: frame['rainfall'].map({'no': 0, 'yes':1}))
    .astype({'humidity': float, 'cloud': float})
)

# <Append the extra rows to the training data>
train_features = list(train_df.columns)
# Keep only the columns that train_df has, in train_df's column order.
train_extra_df = train_extra_df.loc[:, train_features]
# Stack both frames under a fresh 0..n-1 index, then drop exact duplicate rows.
train_df = pd.concat(
    [train_df, train_extra_df], axis=0, ignore_index=True
).drop_duplicates()
train_df.shape

(2556, 12)

In [276]:
# Summary statistics for both frames; used to check the min/max of
# cloud, sunshine and day.
display(train_df.describe(), test_df.describe())

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2556.0,2556.0,2556.0,2556.0,2556.0,2556.0,2556.0,2556.0,2556.0,2555.0,2555.0,2556.0
mean,156.437402,1013.622261,26.340806,23.923592,22.130634,20.387911,81.770344,75.064163,3.84108,104.383679,21.766458,0.742958
std,113.147285,5.769101,5.700976,5.282473,5.138897,5.396788,8.18672,18.678173,3.678826,80.242979,9.921727,0.437088
min,1.0,998.5,7.1,4.9,3.1,-0.4,36.0,0.0,0.0,10.0,4.4,0.0
25%,44.0,1008.6,21.3,19.3,17.6,16.8,77.0,68.0,0.4,40.0,14.1,0.0
50%,148.0,1013.0,27.8,25.5,23.8,22.1,81.0,83.0,2.4,70.0,20.5,1.0
75%,255.0,1017.8,31.2,28.4,26.5,25.0,87.0,88.0,7.0,200.0,27.9,1.0
max,365.0,1034.6,36.3,32.4,30.0,26.7,98.0,100.0,12.1,350.0,59.5,1.0


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,729.0,730.0
mean,183.0,1013.503014,26.372466,23.963288,22.110274,20.460137,82.669863,76.360274,3.664384,103.923182,22.484247
std,105.438271,5.505871,5.672521,5.278098,5.170744,5.391169,7.818714,17.934121,3.639272,81.695458,9.954779
min,1.0,1000.0,7.4,5.9,4.2,-0.0,39.0,0.0,0.0,10.0,4.5
25%,92.0,1008.725,21.6,19.825,17.825,16.8,77.25,69.0,0.325,40.0,14.5
50%,183.0,1012.7,27.8,25.65,23.9,22.3,82.0,83.0,2.2,70.0,21.3
75%,274.0,1017.6,31.0,28.375,26.4,25.0,89.0,88.0,6.675,200.0,28.4
max,365.0,1032.2,35.8,31.8,29.1,26.7,98.0,100.0,11.8,300.0,59.5


In [277]:
# Reorder the columns of train_df so they follow test_df's column order,
# with the target column 'rainfall' placed last.
features = [*test_df.columns, 'rainfall']
train_df = train_df.loc[:, features]

In [278]:
# Spot-check five randomly drawn training rows (unseeded, so the rows differ per run).
train_df.sample(n=5)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
1499,40,1021.2,16.5,15.9,13.1,12.6,92.0,90.0,0.0,20.0,29.8,1
648,284,1010.3,26.6,24.1,23.3,19.9,80.0,58.0,5.7,30.0,23.7,0
1233,139,1008.2,32.3,29.0,27.8,25.3,86.0,72.0,7.6,20.0,9.2,1
125,126,1012.5,27.9,25.8,24.4,23.1,82.0,88.0,1.6,80.0,39.3,1
1097,3,1016.4,16.7,15.6,14.7,11.4,79.0,91.0,0.0,60.0,55.5,1


In [279]:
# Spot-check five randomly drawn test rows (unseeded, so the rows differ per run).
test_df.sample(n=5)

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2359,170,1005.0,35.6,31.0,28.5,25.4,79.0,43.0,10.1,230.0,11.3
2315,126,1009.8,29.2,25.8,24.5,22.2,79.0,61.0,6.0,70.0,27.6
2641,87,1008.1,26.2,24.9,23.6,23.9,82.0,89.0,0.1,70.0,15.5
2582,28,1019.6,25.3,21.8,20.7,18.0,79.0,78.0,0.9,140.0,15.3
2840,286,1013.2,27.1,25.3,24.9,16.3,61.0,20.0,8.8,60.0,9.1


In [280]:
# Check how the target variable is distributed (number of rows per class).
train_df.loc[:, 'rainfall'].value_counts()

rainfall
1    1899
0     657
Name: count, dtype: int64

In [281]:
# Count the missing values in every column of both frames.
# Tip: appending .sort_values(ascending=False) to each count lists the
# columns with the most missing values first.
display(train_df.isna().sum(), test_df.isna().sum())

day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        1
rainfall         0
dtype: int64

day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [282]:
# Fill missing values with the mean (not the median) of the affected column.
# Each frame uses its own statistics: test_df is filled with the test-set mean
# of 'winddirection', train_df with the training-set means of 'winddirection'
# and 'windspeed'. Columns not named here are left untouched.
test_df = test_df.fillna({'winddirection': test_df['winddirection'].mean()})
train_df = train_df.fillna(train_df[['winddirection', 'windspeed']].mean())

In [283]:
# Show the number of samples per rainfall class and derive the class_weight
# mapping intended for LogisticRegression from those counts.
rainfall_counts = train_df['rainfall'].value_counts()
display(rainfall_counts)

# Each class is weighted by its share of the training rows. The shares are
# computed from the data instead of hard-coded counts, so they stay correct
# if the training set changes. sort_index() keeps the keys in 0, 1 order.
# NOTE(review): this gives the majority class the larger weight, and the
# LogisticRegression models in the visible cells are created without
# class_weight - confirm whether these weights are meant to be used.
class_weight = (rainfall_counts / rainfall_counts.sum()).sort_index().to_dict()
class_weight

rainfall
1    1899
0     657
Name: count, dtype: int64

{0: 0.25704225352112675, 1: 0.7429577464788732}

In [284]:
# Build model 1: logistic regression on every feature except cloud and humidity.

y = train_df['rainfall']  # target variable

# Columns excluded from the feature matrix (the target itself included).
drop_features = ['cloud','humidity','rainfall']
X = train_df.drop(columns=drop_features)  # training features
print(X.head())

# Hold out 20% of the rows for validation; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

# LogisticRegression settings
#   solver='liblinear' : Liblinear, a solver suited to small datasets
#   penalty='l1'       : L1 regularization, which learns a sparse model
#   max_iter=10000     : raised iteration cap to help convergence
#   random_state=88    : fixed seed so the result is reproducible
#   C=1.0              : regularization strength (smaller = stronger, larger = weaker)
clf_1 = LogisticRegression(
    C=1.0,
    penalty='l1',
    solver='liblinear',
    max_iter=10000,
    random_state=88,
).fit(X_train, y_train)

# Hard 0/1 class predictions on the hold-out split, scored as accuracy in percent.
y_pred_1 = clf_1.predict(X_test)
acc = 100 * accuracy_score(y_test, y_pred_1)
print(f"Logistic Regression model accuracy: {acc:.2f}%")

   day  pressure  maxtemp  temparature  mintemp  dewpoint  sunshine  \
0    1    1017.4     21.2         20.6     19.9      19.4       1.1   
1    2    1019.5     16.2         16.9     15.8      15.4       0.0   
2    3    1024.1     19.4         16.1     14.6       9.3       8.3   
3    4    1013.4     18.1         17.8     16.9      16.8       0.0   
4    5    1021.8     21.3         18.4     15.2       9.6       3.6   

   winddirection  windspeed  
0           60.0       17.2  
1           50.0       21.9  
2           70.0       18.1  
3           60.0       35.6  
4           40.0       24.8  
Logistic Regression model accuracy: 81.05%


In [285]:
# Predict on the test data with model 1.
# The test columns are selected from the feature names the fitted model
# recorded during fit, instead of removing 'rainfall' from the shared
# drop_features list. This keeps the cell re-runnable (list.remove raised
# ValueError on a second run) and independent of whichever cell last
# assigned drop_features, and it guarantees the training column order.
_test = test_df[list(clf_1.feature_names_in_)]
test_preds_1 = clf_1.predict_proba(_test)[:,1]  # probability of class 1 (rain)

In [286]:
# Build model 2: logistic regression on humidity and cloud only.
y = train_df['rainfall']  # target variable

# Everything except humidity and cloud is excluded (the target included).
drop_features=['day', 'mintemp', 'pressure', 'sunshine', 'winddirection', 'windspeed', 'maxtemp', 'dewpoint', 'temparature', 'rainfall']
X = train_df.drop(columns=drop_features)
print(X.head())

# Same 80/20 seeded hold-out split as used for model 1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=23
)

# LogisticRegression settings
#   solver='newton-cg' : newton-cg solver
#   penalty=None       : no regularization
#   max_iter=10000     : raised iteration cap to help convergence
#   random_state=23    : fixed seed so the result is reproducible
#   C=1.0              : regularization strength (smaller = stronger, larger = weaker);
#                        presumably irrelevant here because penalty=None - confirm
clf_2 = LogisticRegression(
    C=1.0,
    penalty=None,
    solver='newton-cg',
    max_iter=10000,
    random_state=23,
).fit(X_train, y_train)

# Hard 0/1 class predictions on the hold-out split, scored as accuracy in percent.
y_pred_2 = clf_2.predict(X_test)
acc = 100 * accuracy_score(y_test, y_pred_2)
print(f"Logistic Regression model accuracy: {acc:.2f}%")

   humidity  cloud
0      87.0   88.0
1      95.0   91.0
2      75.0   47.0
3      95.0   95.0
4      52.0   45.0
Logistic Regression model accuracy: 83.20%


In [287]:
# Predict on the test data with model 2.
# The test columns are selected from the feature names the fitted model
# recorded during fit, instead of removing 'rainfall' from the shared
# drop_features list. This keeps the cell re-runnable (list.remove raised
# ValueError on a second run) and independent of whichever cell last
# assigned drop_features, and it guarantees the training column order.
_test = test_df[list(clf_2.feature_names_in_)]
test_preds_2 = clf_2.predict_proba(_test)[:,1]  # probability of class 1 (rain)

In [288]:
# Write model 1's predicted rain probabilities as a submission file.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs("./output", exist_ok=True)
sub = pd.DataFrame({"id": test_df.index, "rainfall": test_preds_1})
sub.to_csv("./output/submission_pred1.csv", index=False)
sub.head()

Unnamed: 0,id,rainfall
0,2190,0.957779
1,2191,0.953586
2,2192,0.863109
3,2193,0.216346
4,2194,0.042856


In [289]:
# Write model 2's predicted rain probabilities as a submission file.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs("./output", exist_ok=True)
sub = pd.DataFrame({"id": test_df.index, "rainfall": test_preds_2})
sub.to_csv("./output/submission_pred2.csv", index=False)
sub.head()

Unnamed: 0,id,rainfall
0,2190,0.987413
1,2191,0.988197
2,2192,0.969929
3,2193,0.21749
4,2194,0.194735


In [293]:
# Final submission: equal-weight average of the two models' probabilities.
# The frame is rebuilt from test_df and the prediction arrays so this cell
# does not depend on a `sub` left over from an earlier cell, and the output
# folder is created if missing because to_csv does not create directories.
os.makedirs("./output", exist_ok=True)
sub = pd.DataFrame({
    "id": test_df.index,
    "rainfall": 0.5 * test_preds_1 + 0.5 * test_preds_2,
})
sub.to_csv("./output/submission.csv", index=False)
display(sub.head())

Unnamed: 0,id,rainfall
0,2190,0.972596
1,2191,0.970891
2,2192,0.916519
3,2193,0.216918
4,2194,0.118795
