In [1]:
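# EDA: short for Exploratory Data Analysis.
# AUC: Area Under the Curve. It measures how well the model separates the positive class from the negative class;
#      higher is better (1.0 is a perfect model, 0.5 is equivalent to random guessing).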

# importするライブラリ
import warnings
warnings.filterwarnings("ignore")

import optuna # ハイパーパラメータの自動最適化フレームワーク
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from sklearn.model_selection import GroupKFold, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# ここまでは大体テンプレ




In [24]:
# Always the first step: load the train and test splits plus the original Rainfall dataset from ./dataset.
train_data, test_data, original_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/train.csv",
        "./dataset/test.csv",
        "./dataset/Rainfall.csv",
    )
)

In [25]:
# Peek at 5 rows of the training data; sample() picks the rows at random,
# and no random_state is given, so the displayed rows differ between runs.
train_data.sample(5)

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
1311,1311,217,1008.3,29.1,27.9,25.8,23.2,72.0,69.0,7.1,70.0,25.9,1
1255,1255,161,1009.3,31.7,29.4,27.6,25.6,84.0,86.0,1.6,190.0,21.6,1
1926,1926,102,1006.9,27.5,25.0,24.9,22.1,89.0,88.0,0.5,240.0,10.2,0
1528,1528,69,1023.8,17.2,14.8,11.7,8.4,69.0,22.0,9.0,20.0,12.5,0
144,144,145,1008.7,33.0,30.2,28.4,25.7,78.0,53.0,10.0,220.0,10.9,0


In [26]:
# Peek at 5 randomly chosen rows of the original dataset.
original_data.sample(5)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
214,2,1011.6,32.3,28.9,26.8,24.5,78,54,no,11.4,100.0,9.8
167,16,1006.7,31.4,29.4,28.2,25.9,82,88,yes,1.6,230.0,23.2
54,24,1027.5,15.5,13.7,12.3,10.0,78,88,yes,0.0,20.0,28.0
93,3,1014.6,26.5,23.0,21.0,20.7,87,88,no,2.3,50.0,9.2
15,16,1013.5,17.1,16.4,15.5,15.6,95,93,yes,0.0,60.0,40.0


In [27]:
# Inspect the column labels exactly as loaded, e.g. to spot stray whitespace around names.
original_data.columns

Index(['day', 'pressure ', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity ', 'cloud ', 'rainfall', 'sunshine', '         winddirection',
       'windspeed'],
      dtype='object')

In [28]:
# Remove leading/trailing whitespace from every column name.
original_data.columns = original_data.columns.map(str.strip)

In [29]:
# Encode the rainfall label as 1 ("yes") / 0 ("no").
# The guard makes the cell safe to re-run: mapping an already-encoded (numeric)
# column through the yes/no dictionary would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(original_data["rainfall"]):
    original_data["rainfall"] = original_data["rainfall"].map({"yes": 1, "no": 0})

In [30]:
# Record the (rows, columns) shape of each dataset.
train_rows, train_columns = train_data.shape
test_rows, test_columns = test_data.shape
original_rows, original_columns = original_data.shape

# Print each shape under its own dataset label (previously all three
# sections were headed "train_data:", mislabelling the test and original shapes).
for label, n_rows, n_columns in (
    ("train_data", train_rows, train_columns),
    ("test_data", test_rows, test_columns),
    ("original_data", original_rows, original_columns),
):
    print(f"{label}:")
    print(f"Number of Rows: {n_rows}")
    print(f"Number of Columns: {n_columns}")

train_data:
Number of Rows: 2190
Number of Columns: 13
train_data:
Number of Rows: 730
Number of Columns: 12
train_data:
Number of Rows: 366
Number of Columns: 12


In [31]:
# Explore missing values, unique-value counts and data types per feature.
def _missing_value_table(frame, tag):
    """Return one row per column of ``frame`` with its null count and null percentage.

    ``tag`` is embedded in the two value-column labels, e.g. "[TRAIN] No. of Missing Values".
    """
    null_counts = frame.isnull().sum().values
    return pd.DataFrame({
        "Feature": frame.columns,
        f"[{tag}] No. of Missing Values": null_counts,
        f"[{tag}] % of Missing Values": null_counts / len(frame) * 100,
    })


missing_values_train = _missing_value_table(train_data, "TRAIN")
missing_values_test = _missing_value_table(test_data, "TEST")
missing_values_original = _missing_value_table(original_data, "ORIGINAL")

# Unique-value counts and dtypes are taken from the training data only.
unique_values = pd.DataFrame({
    "Feature": train_data.columns,
    "No. of Unique Values[FROM TRAIN]": train_data.nunique().values,
})
feature_types = pd.DataFrame({
    "Feature": train_data.columns,
    "Datatype": train_data.dtypes,  # passed as a Series here, unlike the .values arrays above
})

# Left-join everything onto the training features, so the result has one row per train column.
merged_df = missing_values_train
for extra_table in (missing_values_test, missing_values_original, unique_values, feature_types):
    merged_df = merged_df.merge(extra_table, on="Feature", how="left")

merged_df.style.background_gradient(cmap="viridis")

Unnamed: 0,Feature,[TRAIN] No. of Missing Values,[TRAIN] % of Missing Values,[TEST] No. of Missing Values,[TEST] % of Missing Values,[ORIGINAL] No. of Missing Values,[ORIGINAL] % of Missing Values,No. of Unique Values[FROM TRAIN],Datatype
0,id,0,0.0,0.0,0.0,,,2190,int64
1,day,0,0.0,0.0,0.0,0.0,0.0,365,int64
2,pressure,0,0.0,0.0,0.0,0.0,0.0,236,float64
3,maxtemp,0,0.0,0.0,0.0,0.0,0.0,219,float64
4,temparature,0,0.0,0.0,0.0,0.0,0.0,198,float64
5,mintemp,0,0.0,0.0,0.0,0.0,0.0,199,float64
6,dewpoint,0,0.0,0.0,0.0,0.0,0.0,218,float64
7,humidity,0,0.0,0.0,0.0,0.0,0.0,49,float64
8,cloud,0,0.0,0.0,0.0,0.0,0.0,78,float64
9,sunshine,0,0.0,0.0,0.0,0.0,0.0,120,float64


In [33]:
# Show the first 5 rows of the original dataset.
original_data.head(5)

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,1,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,1,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,1,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,1,0.0,40.0,13.7


In [37]:
# Check each dataset for fully duplicated rows.
train_duplicates, test_duplicates, original_duplicates = (
    frame.duplicated().sum() for frame in (train_data, test_data, original_data)
)

for dataset_name, duplicate_count in (
    ("train_data", train_duplicates),
    ("test_data", test_duplicates),
    ("original_data", original_duplicates),
):
    print(f"Number of duplicate rows in {dataset_name}: {duplicate_count}")

Number of duplicate rows in train_data: 0
Number of duplicate rows in test_data: 0
Number of duplicate rows in original_data: 0


In [40]:
# Inspect the dataset in detail: describe() statistics, transposed so each described
# column becomes a row, shaded with the viridis colormap.
train_summary = train_data.describe().T
train_summary.style.background_gradient(cmap="viridis")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,2190.0,1094.5,632.342866,0.0,547.25,1094.5,1641.75,2189.0
day,2190.0,179.948402,105.203592,1.0,89.0,178.5,270.0,365.0
pressure,2190.0,1013.602146,5.655366,999.0,1008.6,1013.0,1017.775,1034.6
maxtemp,2190.0,26.365799,5.65433,10.4,21.3,27.8,31.2,36.0
temparature,2190.0,23.953059,5.22241,7.4,19.3,25.5,28.4,31.5
mintemp,2190.0,22.170091,5.05912,4.0,17.7,23.85,26.4,29.8
dewpoint,2190.0,20.454566,5.288406,-0.3,16.8,22.15,25.0,26.7
humidity,2190.0,82.03653,7.800654,39.0,77.0,82.0,88.0,98.0
cloud,2190.0,75.721918,18.026498,2.0,69.0,83.0,88.0,100.0
sunshine,2190.0,3.744429,3.626327,0.0,0.4,2.4,6.8,12.1


In [45]:
# describe() statistics for the original dataset, transposed so each described
# column becomes a row, shaded with the viridis colormap.
original_summary = original_data.describe().T
original_summary.style.background_gradient(cmap="viridis")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,366.0,183.5,105.799338,1.0,92.25,183.5,274.75,366.0
pressure,366.0,1013.742623,6.414776,998.5,1008.5,1013.0,1018.1,1034.6
maxtemp,366.0,26.191257,5.978343,7.1,21.2,27.75,31.2,36.3
temparature,366.0,23.747268,5.632813,4.9,18.825,25.45,28.6,32.4
mintemp,366.0,21.894536,5.594153,3.1,17.125,23.7,26.575,30.0
dewpoint,366.0,19.989071,5.997021,-0.4,16.125,21.95,25.0,26.7
humidity,366.0,80.177596,10.06247,36.0,75.0,80.5,87.0,98.0
cloud,366.0,71.128415,21.798012,0.0,58.0,80.0,88.0,100.0
rainfall,366.0,0.680328,0.466988,0.0,0.0,1.0,1.0,1.0
sunshine,366.0,4.419399,3.934398,0.0,0.5,3.5,8.2,12.1


In [46]:
# Overwrite "day" with a 1-based running row number (1 .. number of rows).
# NOTE(review): presumably done to put it on the same scale as train_data["day"] — confirm.
n_original_rows = len(original_data)
original_data["day"] = range(1, n_original_rows + 1)
original_data["day"].describe()

366