# Prepare

## Module

In [None]:
import re
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import math
from glob import glob
from sklearn.model_selection import train_test_split
import gc

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data

In [None]:
# glob() returns files in arbitrary (file-system dependent) order. Sorting makes the
# concatenated row order -- and every positional split derived from it -- reproducible.
paths = sorted(glob('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/train/*'))
train_dfs = []
for path in paths:
  train_dfs.append(pd.read_csv(path))
# ignore_index=True yields a fresh 0..N-1 index (same result as reset_index(drop=True)).
train_df = pd.concat(train_dfs, ignore_index=True)
test_df = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/test.csv')

  train_df = pd.read_csv(path)
  train_df = pd.read_csv(path)
  train_df = pd.read_csv(path)


In [None]:
del [path, paths, train_dfs]

In [None]:
path = '/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/'

# Preprocessing

## Drop unnecessary data

In [None]:
TARGET = '取引価格（総額）_log'

In [None]:
test_df[TARGET] = np.nan

In [None]:
# ignore_index=True: otherwise test_df's own 0..N-1 labels duplicate train labels, and any
# later label-aligned operation (e.g. pd.concat(axis=1)) can misalign the test rows.
df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
train_len = len(train_df)
#test_id = test_df['ID']

In [None]:
train_len

751560

In [None]:
del [train_df, test_df]

In [None]:
# Columns to drop: '市区町村コード' plus every column with at most one distinct value.
rm_cols = ['市区町村コード']
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent on all versions.
for i, v in df.nunique().items():
    if v <= 1:
        rm_cols.append(i)

rm_cols

['市区町村コード',
 '種類',
 '地域',
 '土地の形状',
 '間口',
 '延床面積（㎡）',
 '前面道路：方位',
 '前面道路：種類',
 '前面道路：幅員（ｍ）']

In [None]:
df.drop(rm_cols, axis=1, inplace=True)

In [None]:
del [i, rm_cols, v]

## Simple one

In [None]:
def normalize_moyori(moyori):
    """Convert a nearest-station distance value to an integer number of minutes.

    Banded labels are mapped to fixed minute values; any other non-NaN value is
    passed through int(). NaN is returned unchanged.
    """
    # NaN is the only value that is not equal to itself.
    if moyori != moyori:
        return moyori
    band_minutes = {
        '30分?60分': 45,
        '1H?1H30': 75,
        '1H30?2H': 105,
        '2H?': 120,
    }
    return int(band_minutes.get(moyori, moyori))

In [None]:
def normalize_area(area):
    """Convert an area value to int, stripping the 'm^2未満' / '㎡以上' suffixes.

    NaN is returned unchanged.
    """
    if area == area:  # False only for NaN
        # Raw string: '\^' in a normal literal is an invalid escape sequence
        # (DeprecationWarning / SyntaxWarning on recent Python versions).
        area = int(re.sub(r'm\^2未満|㎡以上', '', str(area)))
    return area

In [None]:
def convert_wareki_to_seireki(wareki):
    """Convert a Japanese-era year string (e.g. '平成5年') to a Western calendar year.

    '戦前' is treated as '昭和20年'; '元' as year 1. NaN is returned unchanged.

    Raises:
        ValueError: if the year part is not an integer, or the string contains none
            of the supported eras (昭和 / 平成 / 令和). Previously an unknown era hit
            an unbound local variable instead of reporting the bad value.
    """
    if wareki != wareki:  # NaN
        return wareki
    if wareki == '戦前':
        wareki = '昭和20年'

    # Characters between the two-character era name and the trailing '年'.
    value = wareki[2:-1]
    value = 1 if value == '元' else int(value)

    # Offset = the Western year immediately before year 1 of each era.
    for era, offset in (('昭和', 1925), ('平成', 1988), ('令和', 2018)):
        if era in wareki:
            return offset + value
    raise ValueError('unsupported era in wareki value: {!r}'.format(wareki))

In [None]:
# Apply the three converters element-wise (the functions are passed directly).
df['建築年'] = df['建築年'].apply(convert_wareki_to_seireki)
df['面積（㎡）'] = df['面積（㎡）'].apply(normalize_area)
df['最寄駅：距離（分）'] = df['最寄駅：距離（分）'].apply(normalize_moyori)

In [None]:
df['改装'] = df['改装'].map(lambda x: 1 if x == '改装済' else 0)

In [None]:
enc_dic = {}
for i, e in enumerate(sorted(list(set(df['取引時点'].values)))):
    enc_dic[e] = i
df['取引時点_enc'] = df['取引時点'].map(enc_dic)

In [None]:
df['取引の年'] = df['取引時点'].apply(lambda x: int(x[:4]))

In [None]:
df['築年数'] =  df['取引の年']- df['建築年']

In [None]:
del [convert_wareki_to_seireki, e, enc_dic, i, normalize_area, normalize_moyori]

## 情報

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 772094 entries, 0 to 20533
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ID            772094 non-null  int64  
 1   都道府県名         772094 non-null  object 
 2   市区町村名         772094 non-null  object 
 3   地区名           771422 non-null  object 
 4   最寄駅：名称        769315 non-null  object 
 5   最寄駅：距離（分）     748889 non-null  float64
 6   間取り           743386 non-null  object 
 7   面積（㎡）         772094 non-null  int64  
 8   建築年           749595 non-null  float64
 9   建物の構造         750263 non-null  object 
 10  用途            688288 non-null  object 
 11  今後の利用目的       404084 non-null  object 
 12  都市計画          751568 non-null  object 
 13  建ぺい率（％）       746970 non-null  float64
 14  容積率（％）        746970 non-null  float64
 15  取引時点          772094 non-null  object 
 16  改装            772094 non-null  int64  
 17  取引の事情等        19335 non-null   object 
 18  取引価格（

## 特徴量生成
https://github.com/Anguschang582/Probspace---Re_estate---1st-place-solution

を参考

In [None]:
df_base = df.copy()

### Group method (numeric2cate) :

Apply statistics of numeric features in different categorical features group. For example, applying "mean" on "面積（㎡）" group by "市区町村コード". The statistics functions I used :

- Mean, max, min, std, sum, skewness, kurtosis
- Bayes mean
- IQR : q75 - q25
- IQR_ratio : q75 / q25
- Median absolute deviation : median( abs(x - median(x)) )
- Mean variance : std(x) / mean(x)
- hl_ratio : The ratio of the number of samples higher than the mean to the number of samples lower than the mean (Ref, Table 2).
- MAD : Median Absolute Deviation : median( |x - median(x)| )(かぶってるため未実施)
- Beyond1std : Calculating the ratio beyond 1 std(1std不明のため未実施)
- Range : max - min
- Range_ratio : max / min
- Shapiro-Wilk Statistic
- diff and ratio : "x - mean(x)" or "x / mean(x)"(かぶってるためmeanは未実施)
- Z-score : ( x-mean(x) ) / std(x)

#### 処理自体

In [None]:
df = df_base.copy()

In [None]:
# Collect the categorical and numeric column names
## categorical
def find_categorical_columns(df):
    """Return the names of columns whose dtype is categorical or object, in column order."""
    categorical_cols = []
    for col in df.columns:
        dtype = df[col].dtype
        # isinstance check replaces pd.api.types.is_categorical_dtype, which is deprecated.
        if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
            categorical_cols.append(col)
    return categorical_cols

## numeric
def find_numeric_columns(df):
    """Return the names of columns with a numeric dtype, in column order."""
    return [name for name in df.columns
            if pd.api.types.is_numeric_dtype(df[name])]

## Build the column lists
# Numeric feature columns to aggregate: all numeric columns except the ID and the target.
targetcols = find_numeric_columns(df)
targetcols.remove('ID')
targetcols.remove(TARGET)

# Columns used as grouping keys: all categorical/object columns.
groupcols = find_categorical_columns(df)

In [None]:
from scipy.stats import skew, kurtosis, shapiro
pd.options.mode.chained_assignment = None

In [None]:
# functionの用意
## 複雑なやつ
def bayes_mean(x):
    """Blend the sample mean of x with a fixed prior (mean 0.5, sd 1).

    Returns (mu0*sigma0^2 + n*mean*var) / (sigma0^2 + n*var), where n, mean and
    var are the length, np.mean and squared np.std of x.
    """
    prior_mean, prior_sd = 0.5, 1
    size = len(x)
    sample_mean = np.mean(x)
    sample_sd = np.std(x)
    numerator = prior_mean * prior_sd**2 + size * sample_mean * sample_sd**2
    denominator = prior_sd**2 + size * sample_sd**2
    return numerator / denominator

def shapiro(x):
  """Return the Shapiro-Wilk test statistic of x, or NaN when x has fewer than 3 values."""
  # Guard clause: inputs with fewer than three observations yield NaN.
  if len(x) < 3:
    return np.nan
  result = scipy.stats.shapiro(x)
  return result[0]

## Dictionary: statistic name -> callable that reduces one group's values to a scalar
func_dict = {'mean': np.mean,
             'max': np.max,
             'min': np.min,
             'std': np.std,
             'sum': np.sum,
             'skewness': skew,
             'kurtosis': kurtosis,
             'bayes_mean': bayes_mean,
             'per_25': lambda x: x.quantile(.25),  # 25th percentile
             'per_75': lambda x: x.quantile(.75),  # 75th percentile
             'iqr': lambda x: x.quantile(.75) - x.quantile(.25),  # interquartile range
             'iqr_ratio': lambda x: x.quantile(.75) / x.quantile(.25),  # q75 / q25
             'mad': lambda x: np.median(np.abs(x - np.median(x))),  # median absolute deviation
             'mean_variance': lambda x: np.std(x) / np.mean(x),  # std / mean
             'beyond_1std': lambda x: len(x[np.abs(x - np.mean(x)) > np.std(x)]) / len(x),  # share of values more than one std from the mean
             'range': lambda x: np.max(x) - np.min(x),
             'range_ratio': lambda x: np.max(x) / np.min(x),
             'shapiro': shapiro
             }

In [None]:
for targetcol in targetcols:

  for groupcol in groupcols:

    # One column per (group column, numeric column, statistic): the group-level
    # statistic broadcast back to every row of that group.
    for (n,i) in func_dict.items():
      en_dic = df.groupby(groupcol)[targetcol].agg(i).to_dict()
      df[groupcol + '_' + targetcol + '_' + n] = df[groupcol].map(en_dic)

    # Row value relative to its own group's mean/std. The group column is part of the
    # feature name: without it every group column wrote to the same three columns and
    # only the values for the last group column survived.
    group_mean = df[groupcol + '_' + targetcol + '_mean']
    group_std = df[groupcol + '_' + targetcol + '_std']
    df['hl_diff_' + groupcol + '_' + targetcol] = df[targetcol] - group_mean
    df['hl_ratio_' + groupcol + '_' + targetcol] = df[targetcol] / group_mean
    df['zscore_' + groupcol + '_' + targetcol] = (df[targetcol] - group_mean) / group_std

  # Group-wise features done for this numeric column.
  # Not requested by the reference write-up, but also add the same three features
  # relative to the global mean/std of the column.
  overall_mean = df[targetcol].mean()
  df[targetcol + '_diff'] = df[targetcol] - overall_mean
  df[targetcol + '_ratio']= df[targetcol] / overall_mean
  df[targetcol + '_zscore'] = (df[targetcol] - overall_mean) / df[targetcol].std()

  'range_ratio': lambda x: np.max(x) / np.min(x),
  f = lambda x: func(x, *args, **kwargs)
  f = lambda x: func(x, *args, **kwargs)
  'range_ratio': lambda x: np.max(x) / np.min(x),
  f = lambda x: func(x, *args, **kwargs)
  f = lambda x: func(x, *args, **kwargs)
  'iqr_ratio': lambda x: x.quantile(.75) / x.quantile(.25),
  'iqr_ratio': lambda x: x.quantile(.75) / x.quantile(.25),
  'mean_variance': lambda x: np.std(x) / np.mean(x),
  'range_ratio': lambda x: np.max(x) / np.min(x),
  'range_ratio': lambda x: np.max(x) / np.min(x),
  f = lambda x: func(x, *args, **kwargs)
  f = lambda x: func(x, *args, **kwargs)
  'iqr_ratio': lambda x: x.quantile(.75) / x.quantile(.25),
  'iqr_ratio': lambda x: x.quantile(.75) / x.quantile(.25),
  'mean_variance': lambda x: np.std(x) / np.mean(x),
  'range_ratio': lambda x: np.max(x) / np.min(x),
  'range_ratio': lambda x: np.max(x) / np.min(x),
  f = lambda x: func(x, *args, **kwargs)
  f = lambda x: func(x, *args, **kwargs)
  'range_ratio': lambda x:

In [None]:
del [bayes_mean, en_dic, find_categorical_columns, find_numeric_columns, func_dict, groupcol, i, kurtosis, shapiro, skew, targetcol]

In [None]:
# Downcast 64-bit integer/float columns to their 32-bit counterparts; other dtypes are untouched.
for col in df.columns:
    current_dtype = df[col].dtype
    if current_dtype == 'int64':
        df[col] = df[col].astype('int32')
    elif current_dtype == 'float64':
        df[col] = df[col].astype('float32')
del col

#### 特徴量重要度で切ります。多すぎる。(X_train, X_testに保存)
(1836→961個)

In [None]:
# Preparation
## Drop the original columns (the target is kept until the final split)
df.drop(targetcols+groupcols, axis=1, inplace=True)
#df.drop(groupcols, axis=1, inplace=True)

## Split into train and test by position: the first train_len rows are the train rows
traindf = df[:train_len]
testdf = df[train_len:]
del df

## Separate X and y
### train
X_train = traindf.drop([TARGET, 'ID'], axis=1)
y_train = traindf[TARGET]
### test
X_test = testdf.drop([TARGET, 'ID'], axis=1)

del[testdf, traindf]
gc.collect()

0

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# First, fit on everything (all rows, all generated features)
## Model setup
params = {
    'iterations': 1500,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'early_stopping_rounds': 10,
    'random_seed': 0,
    'verbose': 20
    }
model = CatBoostRegressor(**params)

## fit
# NOTE(review): no eval_set is passed to fit(), so 'early_stopping_rounds' presumably
# has no effect here and all iterations run -- confirm against the CatBoost docs.
model.fit(X_train, y_train)

## Feature importance
importances = model.get_feature_importance()

## Column selection
### Table of (feature name, importance)
feature_names = X_train.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
### Keep only the features whose importance is non-zero
importance_df = importance_df[importance_df['importance'] != 0]
cols = importance_df['feature']

0:	learn: 0.3343390	total: 3.46s	remaining: 1h 26m 28s
20:	learn: 0.1851880	total: 45.2s	remaining: 53m 2s
40:	learn: 0.1619045	total: 1m 18s	remaining: 46m 40s
60:	learn: 0.1544926	total: 1m 51s	remaining: 44m 1s
80:	learn: 0.1502906	total: 2m 23s	remaining: 41m 57s
100:	learn: 0.1472927	total: 2m 56s	remaining: 40m 40s
120:	learn: 0.1449843	total: 3m 30s	remaining: 39m 56s
140:	learn: 0.1431233	total: 4m 2s	remaining: 39m
160:	learn: 0.1417126	total: 4m 34s	remaining: 38m 3s
180:	learn: 0.1403935	total: 5m 6s	remaining: 37m 14s
200:	learn: 0.1393720	total: 5m 35s	remaining: 36m 7s
220:	learn: 0.1383613	total: 6m 8s	remaining: 35m 30s
240:	learn: 0.1375185	total: 6m 38s	remaining: 34m 42s
260:	learn: 0.1367896	total: 7m 9s	remaining: 33m 58s
280:	learn: 0.1361630	total: 7m 38s	remaining: 33m 9s
300:	learn: 0.1355240	total: 8m 10s	remaining: 32m 34s
320:	learn: 0.1349994	total: 8m 39s	remaining: 31m 47s
340:	learn: 0.1344946	total: 9m 8s	remaining: 31m 4s
360:	learn: 0.1340215	total: 9

In [None]:
# 適用
X_train = X_train[cols]
X_test = X_test[cols]

In [None]:
del[cols, importance_df, importances, model]

In [None]:
gc.collect()

0

### Group method (cate2cate)

Apply statistics of categorical features in different categorical features group. For example, applying "entropy" on the frequency table of "最寄駅：名称" group by "市区町村コード". The statistics functions I used :

- n_distinct : number of unique
- Entropy : apply entropy on frequency table
- freq1name : the number of most frequently appeared category
- freq1ratio : the number of most frequently appeared category / group size

In [None]:
df = df_base.copy()

In [None]:
# Collect the categorical column names
## categorical
def find_categorical_columns(df):
    """Return the names of columns whose dtype is categorical or object, in column order."""
    categorical_cols = []
    for col in df.columns:
        dtype = df[col].dtype
        # isinstance check replaces pd.api.types.is_categorical_dtype, which is deprecated.
        if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
            categorical_cols.append(col)
    return categorical_cols


groupcols = find_categorical_columns(df)

In [None]:
from scipy.stats import entropy

# For every ordered pair of distinct categorical columns (groupcol, targetcol), describe
# how targetcol's values are distributed inside each groupcol group.
for groupcol in groupcols:
  for targetcol in groupcols:
    if groupcol == targetcol:
      continue
    else:
      # Group the dataframe by group_col
      grouped = df.groupby(groupcol)

      # Calculate the statistics
      # n_distinct: number of distinct targetcol values per group
      # ent: scipy.stats.entropy applied to each group's frequency-table row
      # freq1: count of the most frequent targetcol value per group
      # freq1ratio: share of the most frequent targetcol value per group
      n_distinct = grouped[targetcol].nunique().reset_index().rename(columns={targetcol: f'{targetcol}_n_distinct_{groupcol}'})
      freq_table = df.groupby([groupcol, targetcol]).size().unstack(fill_value=0)
      ent = freq_table.apply(entropy, axis=1).reset_index().rename(columns={0: f'{targetcol}_entropy_{groupcol}'})
      freq1 = grouped[targetcol].agg(lambda x: x.value_counts().max()).reset_index().rename(columns={targetcol: f'{targetcol}_freq1_{groupcol}'})
      freq1ratio = grouped[targetcol].agg(lambda x: x.value_counts(normalize=True).max()).reset_index().rename(columns={targetcol: f'{targetcol}_freq1ratio_{groupcol}'})

      # Merge the calculated statistics back to the original dataframe
      # (df is rebound to the merge result on every step)
      df = df.merge(n_distinct, on=groupcol, how='left')
      df = df.merge(ent, on=groupcol, how='left')
      df = df.merge(freq1, on=groupcol, how='left')
      df = df.merge(freq1ratio, on=groupcol, how='left')


### Target encoding


In [None]:
def lambdai(x) :
  """Logistic weight 1 / (1 + e^-((n - 5) / 1)) for category x, where n = dicn[x].

  Reads the module-level dict `dicn`, which must map x to its count.
  """
  count = dicn[x]
  midpoint, steepness = 5, 1
  return 1/ (1 + math.e ** (- (count-midpoint)/steepness))

In [None]:
Xtotal = df[TARGET].count()
Ymean = df[TARGET] .mean()

In [None]:
df[groupcols] = df[groupcols].fillna('その他')

# These two columns are encoded separately (hierarchically), so take them out of the list.
groupcols.remove('市区町村名')
groupcols.remove('地区名')

for i in groupcols:

  Xcol = i

  # Per-category target mean and number of labelled rows (count() skips NaN targets).
  df1 = pd.DataFrame(df.groupby(Xcol)[TARGET].mean())
  df1['count'] = df.groupby(Xcol)[TARGET].count()

  # lambdai() looks the count up in the module-level dict `dicn`.
  dicn = df1['count'].to_dict()
  df1['lamb'] = df1.index.map(lambdai)

  # Smoothed target encoding: lamb * category mean + (1 - lamb) * global mean.
  # df1[TARGET] is already a mean, so it must not be divided by the count again, and the
  # prior is the global mean itself (not Ymean / Xtotal). Categories without any labelled
  # row have a NaN mean and fall back to the prior.
  category_mean = df1[TARGET].fillna(Ymean)
  df1['result']= df1['lamb'] * category_mean + (1 - df1['lamb']) * Ymean

  dic = dict(zip(df1.index, df1['result']))
  df[Xcol + '_encode'] = df[Xcol].map(dic)

  # Count
  dic = dict(zip(df1.index, df1['count']))
  df[Xcol + '_count'] = df[Xcol].map(dic)

In [None]:
# Hierarchical target encoding: each column is smoothed towards the encoding of its
# parent column (市区町村名 -> 都道府県名, then 地区名 -> 市区町村名). The order matters because
# the second pass reads the '市区町村名_encode' column written by the first.
for X, U in zip(['市区町村名', '地区名'], ['都道府県名', '市区町村名']):
    Xcol = X
    Upper = U

    # Per-category target mean and number of labelled rows.
    df1 = pd.DataFrame(df.groupby(Xcol)[TARGET].mean()).reset_index()
    df1['count'] = df.groupby(Xcol, as_index=False)[TARGET].count()[TARGET]

    # lambdai() looks the count up in the module-level dict `dicn`.
    dicn = df.groupby(Xcol)[TARGET].count().to_dict()
    df1['lamb'] = df1[Xcol].map(lambdai)

    # Parent encoding per category. NOTE(review): if one category name occurs under
    # several parents, only one parent's encoding is kept by to_dict() -- confirm
    # whether that is acceptable for this data.
    upper_encode_dict = df[[Xcol, Upper + '_encode']].drop_duplicates().set_index(Xcol).to_dict()[Upper + '_encode']
    df1[Upper + 'en'] = df1[Xcol].map(upper_encode_dict)

    # lamb * category mean + (1 - lamb) * parent encoding. df1[TARGET] is already a mean,
    # so it must not be divided by the count again. Categories without any labelled row
    # have a NaN mean and fall back to the parent encoding.
    category_mean = df1[TARGET].fillna(df1[Upper + 'en'])
    df1['result'] = df1['lamb'] * category_mean + (1 - df1['lamb']) * (df1[Upper + 'en'])

    dic = dict(zip(df1[Xcol], df1['result']))
    df[Xcol + '_encode'] = df[Xcol].map(dic)

    # Count
    dic = dict(zip(df1[Xcol], df1['count']))
    df[Xcol + '_count'] = df[Xcol].map(dic)

In [None]:
del [Xcol, dicn, df1, X, U, Upper, Xtotal, Ymean, lambdai]
del [df_base]
del [groupcol, i,targetcol, targetcols, upper_encode_dict, freq1ratio, freq_table, freq1,ent ]

In [None]:
gc.collect()

0

#### 後半の処理
→357
そんな減ってねーなあ

In [None]:
# Downcast 64-bit integer/float columns to their 32-bit counterparts.
for col in df.columns:
    if df[col].dtype == 'int64':
        df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')
del col

# Drop the raw categorical columns: the two removed from groupcols earlier, then the rest.
df = df.drop(['地区名','市区町村名'], axis=1)
df = df.drop(groupcols, axis=1)

# Split into train and test by position: the first train_len rows are the train rows.
traindf = df[:train_len]
testdf = df[train_len:]

del df
gc.collect()

0

In [None]:
# Xとyの分離
## train
X_train2 = traindf.drop([TARGET, 'ID'], axis=1)

## test
X_test2 = testdf.drop([TARGET, 'ID'], axis=1)

del[testdf, traindf]
gc.collect()

0

In [None]:
# First, fit on everything (all rows, all features of the second feature set)
## Model setup
params = {
    'iterations': 1500,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'early_stopping_rounds': 10,
    'random_seed': 0,
    'verbose': 20
    }
model = CatBoostRegressor(**params)

## fit
# NOTE(review): no eval_set is passed to fit(), so 'early_stopping_rounds' presumably
# has no effect here and all iterations run -- confirm against the CatBoost docs.
model.fit(X_train2, y_train)

## Feature importance
importances = model.get_feature_importance()

## Column selection
### Table of (feature name, importance)
feature_names = X_train2.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
### Keep only the features whose importance is non-zero
importance_df = importance_df[importance_df['importance'] != 0]
cols = importance_df['feature']

0:	learn: 0.3344884	total: 801ms	remaining: 20m
20:	learn: 0.1882683	total: 12.5s	remaining: 14m 39s
40:	learn: 0.1652394	total: 23.5s	remaining: 13m 55s
60:	learn: 0.1567938	total: 34s	remaining: 13m 22s
80:	learn: 0.1519558	total: 43.3s	remaining: 12m 38s
100:	learn: 0.1487110	total: 53.9s	remaining: 12m 26s
120:	learn: 0.1463340	total: 1m 3s	remaining: 12m
140:	learn: 0.1443799	total: 1m 13s	remaining: 11m 46s
160:	learn: 0.1428715	total: 1m 24s	remaining: 11m 41s
180:	learn: 0.1414826	total: 1m 33s	remaining: 11m 24s
200:	learn: 0.1403729	total: 1m 44s	remaining: 11m 13s
220:	learn: 0.1395066	total: 1m 53s	remaining: 10m 56s
240:	learn: 0.1386881	total: 2m 3s	remaining: 10m 45s
260:	learn: 0.1379543	total: 2m 13s	remaining: 10m 33s
280:	learn: 0.1373257	total: 2m 22s	remaining: 10m 19s
300:	learn: 0.1367246	total: 2m 33s	remaining: 10m 9s
320:	learn: 0.1361748	total: 2m 42s	remaining: 9m 57s
340:	learn: 0.1357016	total: 2m 52s	remaining: 9m 45s
360:	learn: 0.1352154	total: 3m 2s	re

In [None]:
# 適用
X_train2 = X_train2[cols]
X_test2 = X_test2[cols]

### まとめと処理

In [None]:
# pd.concat(axis=1) aligns rows on index labels, not on position. The second feature set
# was rebuilt through DataFrame.merge, which returns a fresh 0..N-1 index, so its labels
# need not match those of the first set. Reset both sides so rows are paired positionally.
assert len(X_train) == len(X_train2), 'train feature sets differ in row count'
assert len(X_test) == len(X_test2), 'test feature sets differ in row count'
X_train = pd.concat([X_train.reset_index(drop=True), X_train2.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test2.reset_index(drop=True)], axis=1)

del [X_train2, X_test2]

In [None]:
# Required before using scikit-learn
## Column means of the training data, computed over finite values only. Computing them
## while +/-inf is still present makes the mean of such a column inf (or NaN), so its
## missing values would be filled with inf and then zeroed instead of getting the mean.
train_means = X_train.replace([np.inf, -np.inf], np.nan).mean()

## NaN handling: training-set mean first, then 0 for columns that have no finite mean
X_train = X_train.fillna(train_means).fillna(0)
X_test = X_test.fillna(train_means).fillna(0)

## inf handling: remaining +/-inf values become 0
X_train.replace([np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
def change_dtype(df):
  """Downcast int64 columns to int32 and float64 columns to float32, in place.

  Other dtypes are left untouched. Returns None; the passed frame is modified.
  A frame without columns is a no-op (the former trailing `del col` raised
  UnboundLocalError in that case and had no other effect).
  """
  downcast = {'int64': 'int32', 'float64': 'float32'}
  for col in df.columns:
      for wide, narrow in downcast.items():
          if df[col].dtype == wide:
              df[col] = df[col].astype(narrow)
              break

change_dtype(X_train)
change_dtype(X_test)

In [None]:
#X_train.to_csv(path+ 'X_train.csv', index = False)
#X_test.to_csv(path+ 'X_test.csv', index=False)
#y_train.to_csv(path + 'y_train.csv', index=False)

KeyboardInterrupt: ignored

In [None]:
gc.collect()

636

# 分析

### 準備

#### pip
ランタイム再起動するといいことがあるかも。


In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 368, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 368 (delta 76), reused 51 (delta 51), pack-reused 269[K
Receiving objects: 100% (368/368), 101.61 KiB | 4.84 MiB/s, done.
Resolving deltas: 100% (178/178), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 KB 2.2 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has the right kind of GPU, a Tesla T4!
We will now install RAPIDS cuDF, cuML, and cuGraph via pip! 
Please stand by, should be quick...
***********************************************************************

Looking in indexe

In [None]:
!pip install dask==2023.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dask==2023.1.1
  Using cached dask-2023.1.1-py3-none-any.whl (1.1 MB)
Installing collected packages: dask
  Attempting uninstall: dask
    Found existing installation: dask 2023.1.0
    Uninstalling dask-2023.1.0:
      Successfully uninstalled dask-2023.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
raft-dask-cu11 23.2.0 requires distributed==2023.1.1, but you have distributed 2023.3.2 which is incompatible.
distributed 2023.3.2 requires dask==2023.3.2, but you have dask 2023.1.1 which is incompatible.
dask-cudf-cu11 23.2.0 requires distributed==2023.1.1, but you have distributed 2023.3.2 which is incompatible.
dask-cuda 23.2.1 requires distributed==2023.1.1, but you have distributed 2023.3.2 which is incompatible.[0m[31m
[0mSuccessfully i

In [None]:
# Quote the requirement: unquoted, the shell treats '>' as output redirection, so pip
# installs an unconstrained dask-ml and its output is written to a file named '2023.1'.
!pip install "dask-ml>2023.1"

In [None]:
pip install --upgrade --force-reinstall distributed

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting distributed
  Downloading distributed-2023.3.2-py3-none-any.whl (956 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m956.9/956.9 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tornado>=6.0.3
  Downloading tornado-6.2-cp37-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (423 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m424.0/424.0 KB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sortedcontainers>=2.0.5
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting locket>=1.0.0
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Collecting dask==2023.3.2
  Downloading dask-2023.3.2-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting msgpack

In [None]:
pip install --upgrade dask

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### よみこみ


In [None]:
import cudf
import cuml

In [None]:
print(dask_ml.__version__)

NameError: ignored

In [None]:
# The class is spelled StandardScaler ('StandardScalar' does not exist in dask_ml.preprocessing).
from dask_ml.preprocessing import StandardScaler

UnknownExtra: ignored

In [None]:
path = '/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/'

In [None]:
# Read the CSV file with cuDF
csv_file_path = path+'y_train.csv'
y_train = cudf.read_csv(csv_file_path)
X_test = cudf.read_csv(path+'X_test.csv')
X_train = dask_cudf.read_csv(path+'X_train.csv', blocksize='2048MB')

In [None]:
# Standardize the data using Dask-ML's StandardScaler
# Import the submodule here: a `from dask_ml.preprocessing import ...` statement does not
# bind the name `dask_ml`, so the attribute access below fails on a fresh kernel without it.
import dask_ml.preprocessing

scaler = dask_ml.preprocessing.StandardScaler()
scaler.fit(X_train)

# Transform the data
X_train_scaled = scaler.transform(X_train)

MemoryError: ignored

In [None]:
gc.collect()

290

In [None]:
y_train = pd.Series(y_train[TARGET])

### 標準化

#### sk

In [None]:
# ndarrayで消されるので保存
X_columns = X_train.columns

In [None]:
# 標準化
## インポート
from sklearn.preprocessing import StandardScaler

## fit
sc = StandardScaler()
sc.fit(X_train)

## 適用
X_train= sc.transform(X_train)
X_test = sc.transform(X_test)

del sc

In [None]:
X_train.dtype

dtype('float32')

In [None]:
# Reshape
## Wrap the arrays in DataFrames again, restoring the saved column names in one step
X_train = pd.DataFrame(X_train, columns=X_columns)
X_test = pd.DataFrame(X_test, columns=X_columns)

del X_columns

#### cudf

In [None]:
import cuml

# Standardize the data using cuML's StandardScaler
scaler = cuml.preprocessing.StandardScaler()
scaler.fit(X_train)

AttributeError: ignored

## 関数

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
def lgbmmodel(Xtrain, ytrain, params, rs, num_boost_round=1500, early_stopping_rounds=10):
  """Train one LightGBM booster per split of `rs` and print the hold-out RMSE of each.

  Example arguments:
      Xtrain: DataFrame
      ytrain: Series
      params = {"objective": "**",
                "metric": "rmse",
                "verbosity": -1,   # do not print warnings
                "random_state": 0}
      rs = ShuffleSplit(n_splits=**, train_size=**, test_size=**, random_state=**)

  Args:
      Xtrain: feature frame, indexed positionally with .iloc.
      ytrain: target series, indexed positionally with .iloc.
      params: parameter dict passed to lgbm.train.
      rs: splitter exposing split(X) that yields (train_index, test_index) pairs.
      num_boost_round: maximum number of boosting rounds (default 1500, as before).
      early_stopping_rounds: patience passed to the early-stopping callback
          (default 10, as before).

  Returns:
      List of trained boosters, one per split. The mean RMSE is printed last.
  """
  # Scores are kept to report their mean; models are returned to the caller.
  scores = []
  models = []

  for train_index, test_index in rs.split(Xtrain):
    X_traintrain = Xtrain.iloc[train_index]
    X_traintest  = Xtrain.iloc[test_index]
    y_traintrain = ytrain.iloc[train_index]
    y_traintest  = ytrain.iloc[test_index]

    # Datasets for LightGBM
    traintrain_set = lgbm.Dataset(X_traintrain, y_traintrain)
    traintest_set = lgbm.Dataset(X_traintest, y_traintest)

    # Training
    model = lgbm.train(
        params = params,
        train_set = traintrain_set,
        valid_sets = [traintrain_set, traintest_set],
        num_boost_round=num_boost_round,
        callbacks=[lgbm.callback.early_stopping(early_stopping_rounds)]
        )

    # Evaluation on the held-out part of the split
    models.append(model)
    pred = model.predict(X_traintest)
    score = np.sqrt(mean_squared_error(y_traintest, pred))
    scores.append(score)
    print(score)
  print(np.mean(scores))
  return models

In [None]:
def skmodel(Xtrain, ytrain, model, rs):
  """Fit a fresh copy of a scikit-learn estimator on every split of `rs`.

  Example arguments:
      Xtrain: DataFrame
      ytrain: Series
      model = RandomForestRegressor(n_estimators=**, random_state=**, max_depth=**)
      rs = ShuffleSplit(n_splits=5, train_size=**, test_size=**, random_state=**)

  Args:
      Xtrain: feature frame, indexed positionally with .iloc.
      ytrain: target series, indexed positionally with .iloc.
      model: unfitted estimator used as a template; it is cloned, not fitted.
      rs: splitter exposing split(X) that yields (train_index, test_index) pairs.

  Returns:
      List with one independently fitted estimator per split. Each split's RMSE
      and the mean RMSE are printed.
  """
  # Local import so the function does not depend on a separate import cell.
  from sklearn.base import clone

  scores = []
  models = []

  for train_index, test_index in rs.split(Xtrain):
    X_traintrain = Xtrain.iloc[train_index]
    X_traintest  = Xtrain.iloc[test_index]
    y_traintrain = ytrain.iloc[train_index]
    y_traintest  = ytrain.iloc[test_index]

    # Clone per fold: refitting the one passed object made every entry of `models`
    # the same instance, holding only the last fold's fit.
    fold_model = clone(model)
    fold_model.fit(X_traintrain, y_traintrain)
    pred = fold_model.predict(X_traintest)
    score = np.sqrt(mean_squared_error(y_traintest, pred))
    scores.append(score)
    print(score)
    models.append(fold_model)
  print('平均スコア',np.mean(scores))
  return models

In [None]:
# objective(obj): str
def applytest(Xtest, models, obj):
  """Store the mean test prediction of `models` in the global frame test_st1[obj].

  Args:
      Xtest: features passed to each model's predict().
      models: iterable of fitted models; any number is accepted (the average was
          previously hard-coded to exactly five).
      obj: column name under which the averaged prediction is stored.
  """
  sub = pd.DataFrame([])

  ## One column of predictions per model
  for i, model in enumerate(models):
    pred = model.predict(Xtest)
    sub[i] = pred

  ## Row-wise mean over all model columns
  sub['ans'] = sub.mean(axis=1)

  test_st1[obj] = sub['ans']

In [None]:
def applytrain(Xtrain, models, obj):
  """Store the mean training-set prediction of `models` in the global frame train_st1[obj].

  Args:
      Xtrain: features passed to each model's predict().
      models: iterable of fitted models; any number is accepted (the average was
          previously hard-coded to exactly five).
      obj: column name under which the averaged prediction is stored.
  """
  sub = pd.DataFrame([])

  ## One column of predictions per model
  for i, model in enumerate(models):
    pred = pd.Series(model.predict(Xtrain))
    sub[i] = pred
    print('turn ' + str(i) + ' completed!')

  ## Row-wise mean over all model columns
  sub['ans'] = sub.mean(axis=1)

  train_st1[obj] = sub['ans']

## 1段目

In [None]:
# 一段目の収納先
train_st1 = pd.DataFrame([])
test_st1 = pd.DataFrame([])

### NGBoost

In [None]:
!pip install ngboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ngboost
  Downloading ngboost-0.4.0-py3-none-any.whl (32 kB)
Collecting flake8<6.0.0,>=5.0.4
  Downloading flake8-5.0.4-py2.py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lifelines>=0.25
  Downloading lifelines-0.27.4-py3-none-any.whl (349 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.7/349.7 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting pycodestyle<2.10.0,>=2.9.0
  Downloading pycodestyle-2.9.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.5/41.5 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mccabe<0.8.0,>=0.7.0
  Downloading mccabe-0.7.0-py2.py3-none-any.whl (7.3 kB)
Collecting pyflakes<2.6.0,>=2.5.0
  Downloading pyflakes-2.5.0-py2.py3-none-any.whl (66 kB)
[2K

In [None]:
from ngboost import NGBRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error

def ngboost_model(Xtrain, ytrain, params, rs):
    """Fit one NGBRegressor per split of `rs`, printing each hold-out RMSE and their mean.

    Each model is fitted on the split's training part, with the held-out part passed
    as validation data and early_stopping_rounds=10. Returns the list of fitted models.
    """
    fold_scores, fitted = [], []

    for fit_idx, val_idx in rs.split(Xtrain):
        X_fit, X_val = Xtrain.iloc[fit_idx], Xtrain.iloc[val_idx]
        y_fit, y_val = ytrain.iloc[fit_idx], ytrain.iloc[val_idx]

        # Training
        regressor = NGBRegressor(**params)
        regressor.fit(X_fit, y_fit, X_val, y_val, early_stopping_rounds=10)
        fitted.append(regressor)

        # Evaluation on the held-out rows
        rmse = np.sqrt(mean_squared_error(y_val, regressor.predict(X_val)))
        fold_scores.append(rmse)
        print(rmse)

    print(np.mean(fold_scores))
    return fitted

In [None]:
rs = ShuffleSplit(n_splits=5, train_size=0.05, test_size=0.02, random_state=7)
params = {
    'n_estimators': 500,
    #'Base':'DecisionTreeRegressor',
    #'Dist':'Normal',
    #'Score':'RMSE', #この辺怪しそう
    'random_state':0,
    'verbose':True
    }
models = ngboost_model(X_train, y_train, params, rs)

[iter 0] loss=0.3846 val_loss=0.3749 scale=1.0000 norm=0.6248
[iter 100] loss=0.0125 val_loss=0.0138 scale=1.0000 norm=0.4904
[iter 200] loss=-0.1913 val_loss=-0.1832 scale=1.0000 norm=0.4870
[iter 300] loss=-0.3416 val_loss=-0.3223 scale=1.0000 norm=0.4982
[iter 400] loss=-0.4389 val_loss=-0.4047 scale=1.0000 norm=0.5116
0.16808526072431232
[iter 0] loss=0.3850 val_loss=0.3931 scale=1.0000 norm=0.6248
[iter 100] loss=0.0093 val_loss=0.0271 scale=1.0000 norm=0.4897
[iter 200] loss=-0.1958 val_loss=-0.1671 scale=1.0000 norm=0.4850
[iter 300] loss=-0.3506 val_loss=-0.3003 scale=1.0000 norm=0.4940
[iter 400] loss=-0.4484 val_loss=-0.3745 scale=1.0000 norm=0.5067
0.17220343887775227
[iter 0] loss=0.3918 val_loss=0.3799 scale=1.0000 norm=0.6295
[iter 100] loss=0.0170 val_loss=0.0168 scale=1.0000 norm=0.4924
[iter 200] loss=-0.1979 val_loss=-0.1880 scale=1.0000 norm=0.4882
[iter 300] loss=-0.3447 val_loss=-0.3224 scale=1.0000 norm=0.4998
[iter 400] loss=-0.4385 val_loss=-0.4008 scale=1.0000 

In [None]:
#NGBoost の参考
#_ = NGBRegressor().fit(X_reg_train, Y_reg_train, X_val=X_reg_test, Y_val=Y_reg_test, early_stopping_rounds=2)
'''
DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best'), 'minibatch_frac': 1.0
                      '''

In [None]:
def range_pred(traindata, model, alpha=0.05):
  """Return the point prediction and a central (1 - alpha) interval for each row.

  Args:
      traindata: features passed to model.predict and model.pred_dist.
      model: fitted model exposing predict(X) and pred_dist(X); the returned
          distribution object must expose ppf(q).
      alpha: total tail probability; the default 0.05 gives a 95% interval.

  Returns:
      (y_pred, lower, upper), where lower/upper are the alpha/2 and 1 - alpha/2
      quantiles of the predicted distribution.
  """
  y_pred = model.predict(traindata)
  y_pred_dist = model.pred_dist(traindata)
  # ppf() is the quantile function of the predicted distribution, so it already
  # includes that distribution's location and scale. The former
  # `y_pred -/+ ppf(...) * scale` rescaled a value that was already a quantile.
  lower = y_pred_dist.ppf(alpha / 2)
  upper = y_pred_dist.ppf(1 - alpha / 2)
  return y_pred, lower, upper

In [None]:
def _ngb_ensemble(features, models):
  """Run every NGBoost model on ``features`` and average across models.

  Args:
    features: Feature matrix passed to ``range_pred`` for each model.
    models: Iterable of fitted models accepted by ``range_pred``.

  Returns:
    Tuple ``(sub, sub_lower, sub_upper)`` of DataFrames. Each holds one column
    per model (labelled 0, 1, ...) plus an 'ans' column with the row-wise mean
    of that frame's own model columns.
  """
  sub = pd.DataFrame([])
  sub_lower = pd.DataFrame([])
  sub_upper = pd.DataFrame([])

  for i, model in enumerate(models):
    pred, lower, upper = range_pred(features, model)
    sub[i] = pred
    sub_lower[i] = lower
    sub_upper[i] = upper

  # Average each frame over its OWN columns. The earlier version averaged `sub`
  # three times, which made the lower/upper columns copies of the point
  # prediction. Taking the mean before 'ans' exists also avoids hard-coding
  # the number of models.
  sub['ans'] = sub.mean(axis=1)
  sub_lower['ans'] = sub_lower.mean(axis=1)
  sub_upper['ans'] = sub_upper.mean(axis=1)
  return sub, sub_lower, sub_upper


# train
sub, sub_lower, sub_upper = _ngb_ensemble(X_train, models)
train_st1['ng'] = sub['ans']
train_st1['ng_lower'] = sub_lower['ans']
# NOTE(review): 'nu_upper' is probably a typo for 'ng_upper'; the name is kept
# so that any consumer of the existing column keeps working.
train_st1['nu_upper'] = sub_upper['ans']

# test
sub, sub_lower, sub_upper = _ngb_ensemble(X_test, models)
test_st1['ng'] = sub['ans']
test_st1['ng_lower'] = sub_lower['ans']
test_st1['nu_upper'] = sub_upper['ans']

### LGBM

#### lgbm(regression)
Training until validation scores don't improve for 10 rounds

Did not meet early stopping. Best iteration is:

[1500]	training's rmse: 0.105402

valid_1's rmse: 0.128766

0.12876637825003848

Training until validation scores don't improve for 10 rounds


Early stopping, best iteration is:

[1486]	training's rmse: 0.105601	valid_1's rmse: 0.128273

0.12827298624575176
...

In [None]:
# Baseline LightGBM configuration: plain regression objective scored by RMSE.
params = dict(
    objective="regression",
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# Five random splits, each using train_size=0.4 / test_size=0.2 of the rows.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=0)

In [None]:
# Fit one LightGBM model per split, then hand the models to applytest/applytrain
# with the label 'lgbmregression'. The three helpers are defined outside this
# section; presumably the apply* helpers store the model-averaged predictions
# in test_st1 / train_st1 under that label — confirm against their definitions.
models = lgbmmodel(X_train, y_train, params, rs)
applytest(X_test, models, 'lgbmregression')
applytrain(X_train, models, 'lgbmregression')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.105402	valid_1's rmse: 0.128766
0.12876637825003848
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1486]	training's rmse: 0.105601	valid_1's rmse: 0.128273
0.12827298624575176
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1488]	training's rmse: 0.105792	valid_1's rmse: 0.129493
0.12949346727240066
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1353]	training's rmse: 0.108237	valid_1's rmse: 0.129681
0.1296808489779545
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.104753	valid_1's rmse: 0.131815
0.13181547445492886
0.12960583104021484
turn 0 completed!
turn 1 completed!
turn 2 completed!
turn 3 completed!
turn 4 comple

#### 今後の特徴量(v_colsに保存)

In [None]:
# Feature importance of the first model in `models`, expressed as a share of
# the total so the values sum to 1.
cols = X_train.columns.tolist()
f_importance = np.asarray(models[0].feature_importance())
f_importance = f_importance / f_importance.sum()

# Pair every feature with its share and rank from most to least important.
df_importance = (
    pd.DataFrame({'feature': cols, 'importance': f_importance})
    .sort_values('importance', ascending=False)
)

# Keep only the features with non-zero importance; v_cols is the list of
# features used from here on.
nonzero_importance = df_importance.loc[df_importance['importance'] != 0]
display(nonzero_importance)  # visual check
v_cols = nonzero_importance['feature']

Unnamed: 0,feature,importance
1,面積（㎡）,0.016711
267,hl_diff_面積（㎡）,0.010356
1048,地区名_建築年_count,0.009822
6,取引時点_enc,0.009600
269,zscore_面積（㎡）,0.009267
...,...,...
107,建築年_最寄駅：距離（分）_var,0.000022
620,今後の利用目的_建ぺい率（％）_min,0.000022
404,都市計画_面積（㎡）_median,0.000022
637,都市計画_建ぺい率（％）_max,0.000022


In [None]:
del [cols]

#### 差分検出(なんとなくRandomForest)

train段階で差がわかっている状態なので、二段目でこの差を足せば終わりじゃん!って思わせちゃうことになる。一旦保留っす。

#### lgbm(huber, 0.06)
[1500]	training's rmse: 0.127917	valid_1's rmse: 0.130952

In [None]:
# LightGBM with the Huber objective, alpha=0.06, still scored by RMSE.
params = dict(
    objective="huber",
    alpha=0.06,
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# Five random splits, each using train_size=0.4 / test_size=0.2 of the rows.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=1)

In [None]:
# Fit the Huber(alpha=0.06) models and pass them to applytest/applytrain with
# the label 'lgbmhuber006'. The helpers are defined outside this section;
# presumably they add a column of that name to test_st1 / train_st1 — confirm.
models = lgbmmodel(X_train, y_train, params, rs)
applytest(X_test, models, 'lgbmhuber006')
applytrain(X_train, models, 'lgbmhuber006')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.127116	valid_1's rmse: 0.131306
0.1313055800627917
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.126339	valid_1's rmse: 0.13278
0.1327801389187901
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.126244	valid_1's rmse: 0.133377
0.13337679859694448
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.127107	valid_1's rmse: 0.132353
0.13235336814195106
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.127496	valid_1's rmse: 0.132923
0.1329233775709545
0.13254785265828636
turn 0 completed!
turn 1 completed!
turn 2 comple

#### lgbm(huber, 0.3)
[1500]	training's rmse: 0.114722	valid_1's rmse: 0.126909

In [None]:
# LightGBM with the Huber objective, alpha=0.3, scored by RMSE.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=2)
params = dict(
    objective="huber",
    alpha=0.3,
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# One model per split.
models = lgbmmodel(X_train, y_train, params, rs)

# Record the models' outputs under the label 'lgbmhuber03' (helpers defined
# outside this section).
applytest(X_test, models, 'lgbmhuber03')
applytrain(X_train, models, 'lgbmhuber03')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.112615	valid_1's rmse: 0.126475
0.12647455271201152
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1344]	training's rmse: 0.113279	valid_1's rmse: 0.128056
0.1280555529597223
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.112915	valid_1's rmse: 0.127874
0.12787409389641946
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.113565	valid_1's rmse: 0.127757
0.1277565800521629
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1192]	training's rmse: 0.116395	valid_1's rmse: 0.128572
0.12857219948337417
0.12774659582073808
turn 0 completed!
turn 1 completed!
turn 2 completed!
turn 3 completed!
t

#### lgbm(huber, 指定なし(=0.9))
[1500]	training's rmse: 0.1092	valid_1's rmse: 0.127953

In [None]:
# LightGBM with the Huber objective, alpha=0.9, scored by RMSE.
# This variant uses smaller splits than the other Huber runs:
# train_size=0.3 / test_size=0.15.
rs = ShuffleSplit(train_size=0.3, test_size=0.15, n_splits=5, random_state=3)
params = dict(
    objective="huber",
    alpha=0.9,
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# One model per split.
models = lgbmmodel(X_train, y_train, params, rs)

# Record the models' outputs under the label 'lgbmhuber09' (helpers defined
# outside this section).
applytest(X_test, models, 'lgbmhuber09')
applytrain(X_train, models, 'lgbmhuber09')

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1266]	training's rmse: 0.104754	valid_1's rmse: 0.128427
0.12842698140820466
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1208]	training's rmse: 0.106128	valid_1's rmse: 0.133422
0.13342204766653532
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1416]	training's rmse: 0.103892	valid_1's rmse: 0.130245
0.13024481874937385
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1286]	training's rmse: 0.104456	valid_1's rmse: 0.130507
0.1305071736427877
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1285]	training's rmse: 0.103677	valid_1's rmse: 0.134261
0.13426058043087122
0.13137232037955454
turn 0 completed!
turn 1 completed!
turn 2 completed!
turn 3 completed!
turn 4 completed!


#### Huber1.5

In [None]:
# LightGBM with the Huber objective, alpha=1.5, scored by RMSE.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=4)
params = dict(
    objective="huber",
    alpha=1.5,
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# One model per split.
models = lgbmmodel(X_train, y_train, params, rs)

# Record the models' outputs under the label 'lgbmhuber15' (helpers defined
# outside this section).
applytest(X_test, models, 'lgbmhuber15')
applytrain(X_train, models, 'lgbmhuber15')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.106013	valid_1's rmse: 0.128771
0.12877133660104026
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1451]	training's rmse: 0.106866	valid_1's rmse: 0.129143
0.1291433749919288
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.106124	valid_1's rmse: 0.128296
0.12829631679656198
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1395]	training's rmse: 0.106852	valid_1's rmse: 0.131284
0.13128366929932594
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.105659	valid_1's rmse: 0.129224
0.12922441939756235
0.12934382341728387
turn 0 completed!
turn 1 completed!
turn 2 completed!
turn 3 completed!


In [None]:
train_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbm.csv', index=False)

In [None]:
test_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbmtest.csv', index=False)

#### lgbm mae
効果薄そうなのでやめた

#### lgbm fair
[1500]	training's rmse: 0.110867	valid_1's rmse: 0.127449

In [None]:
# LightGBM with the "fair" objective, scored by RMSE.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=5)
params = dict(
    objective="fair",
    metric="rmse",        # root mean squared error
    verbosity=-1,         # do not print warnings
    random_state=0,
)

# One model per split.
models = lgbmmodel(X_train, y_train, params, rs)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.108642	valid_1's rmse: 0.129249
0.12924860426329995
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[841]	training's rmse: 0.117865	valid_1's rmse: 0.128831
0.1288312811897211
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.10943	valid_1's rmse: 0.128315
0.1283146882616967
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1365]	training's rmse: 0.11075	valid_1's rmse: 0.127919
0.12791896039756945
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 0.109011	valid_1's rmse: 0.12831
0.12830960243531547
0.12852462730952055


In [None]:
# Pass the "fair"-objective models to applytest/applytrain with the label
# 'lgbmfair'. The helpers are defined outside this section; presumably they add
# a column of that name to test_st1 / train_st1 — confirm.
applytest(X_test, models, 'lgbmfair')
applytrain(X_train, models, 'lgbmfair')

turn 0 completed!
turn 1 completed!
turn 2 completed!
turn 3 completed!
turn 4 completed!


In [None]:
train_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbm_tofair_train.csv', index=False)

In [None]:
test_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbm_tofair_test.csv', index=False)

### catboost

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
from catboost import CatBoost
from catboost import Pool
from catboost import CatBoostRegressor

In [None]:
def catmodel(Xtrain, ytrain, params, rs):
  """Fit one CatBoostRegressor per split of ``rs`` and print hold-out RMSE.

  Args:
    Xtrain: Feature frame; rows are selected positionally with ``.iloc``.
    ytrain: Target aligned with ``Xtrain``; also indexed with ``.iloc``.
    params: Keyword arguments forwarded to ``CatBoostRegressor``.
    rs: Splitter whose ``split(Xtrain)`` yields ``(train_index, test_index)``
      pairs.

  Returns:
    List of fitted models, one per split, in split order. The RMSE of every
    split and finally their mean are printed as a side effect.
  """
  fitted = []
  fold_rmse = []

  for fit_idx, holdout_idx in rs.split(Xtrain):
    X_fit, y_fit = Xtrain.iloc[fit_idx], ytrain.iloc[fit_idx]
    X_holdout, y_holdout = Xtrain.iloc[holdout_idx], ytrain.iloc[holdout_idx]

    # CatBoost datasets for fitting and for evaluation.
    fit_pool = Pool(X_fit, label=y_fit)
    holdout_pool = Pool(X_holdout, label=y_holdout)

    # Both pools are passed as eval sets, with the hold-out pool last.
    regressor = CatBoostRegressor(**params)
    regressor.fit(fit_pool, eval_set=[fit_pool, holdout_pool])
    fitted.append(regressor)

    # Hold-out RMSE for this split.
    rmse = np.sqrt(mean_squared_error(y_holdout, regressor.predict(X_holdout)))
    fold_rmse.append(rmse)
    print(rmse)

  print(np.mean(fold_rmse))
  return fitted

#### CatBoost RMSE
RAM食わなくて嬉しい。

test: 0.1221138	test1: 0.1291477

In [None]:
# Five random splits, each using train_size=0.5 / test_size=0.2 of the rows.
rs = ShuffleSplit(train_size=0.5, test_size=0.2, n_splits=5, random_state=7)

# CatBoost configuration: RMSE loss with early stopping after 10 rounds.
params = dict(
    loss_function='RMSE',
    num_boost_round=1500,       # author's note: change this!
    early_stopping_rounds=10,
    random_seed=0,
)

In [None]:
# Fit one CatBoost model per split, then pass the models to applytest/applytrain
# with the label 'catregression'. The apply* helpers are defined outside this
# section; presumably they add a column of that name to test_st1 / train_st1 —
# confirm.
models = catmodel(X_train, y_train, params, rs)
applytest(X_test, models, 'catregression')
applytrain(X_train, models, 'catregression')

Learning rate set to 0.100866
0:	learn: 0.3336928	test: 0.3336928	test1: 0.3345831	best: 0.3345831 (0)	total: 1.28s	remaining: 32m 1s
1:	learn: 0.3143544	test: 0.3143544	test1: 0.3152904	best: 0.3152904 (1)	total: 2.22s	remaining: 27m 43s
2:	learn: 0.2975978	test: 0.2975978	test1: 0.2985529	best: 0.2985529 (2)	total: 3.1s	remaining: 25m 49s
3:	learn: 0.2832835	test: 0.2832835	test1: 0.2842916	best: 0.2842916 (3)	total: 3.87s	remaining: 24m 8s
4:	learn: 0.2705836	test: 0.2705836	test1: 0.2715903	best: 0.2715903 (4)	total: 4.79s	remaining: 23m 51s
5:	learn: 0.2594008	test: 0.2594008	test1: 0.2604028	best: 0.2604028 (5)	total: 5.67s	remaining: 23m 33s
6:	learn: 0.2494235	test: 0.2494235	test1: 0.2504087	best: 0.2504087 (6)	total: 6.48s	remaining: 23m 2s
7:	learn: 0.2405466	test: 0.2405466	test1: 0.2415560	best: 0.2415560 (7)	total: 7.3s	remaining: 22m 40s
8:	learn: 0.2330738	test: 0.2330738	test1: 0.2341087	best: 0.2341087 (8)	total: 8.06s	remaining: 22m 14s
9:	learn: 0.2261444	test: 0.22

#### Catboost(Huber)

deltaの設定方法わかんなくて諦め

https://catboost.ai/en/docs/concepts/loss-functions

これだと思うんだけどお？

In [None]:
# Disabled experiment (kept as a string so it never runs): CatBoost with Huber loss.
# CatBoost takes loss parameters inside the loss_function string itself, so delta
# is written as 'Huber:delta=0.06'.
# Before enabling: traintrain_pool / traintest_pool only exist inside catmodel,
# so the pools must be built here first.
'''
params = {'loss_function': 'Huber:delta=0.06',
        'num_boost_round': 1500,
        'early_stopping_rounds': 10,
        'random_seed': 0,
        #'ignored_features': 'ID'
        }
# ID無視したい。

model = CatBoost(params)
model.fit(traintrain_pool, eval_set=[traintest_pool])
'''

### Randomforest

In [None]:
# Restrict both feature matrices to v_cols (the features with non-zero
# LightGBM importance selected earlier). X_train / X_test are overwritten, so
# every model below sees only these columns.
X_train = X_train[v_cols]
X_test = X_test[v_cols]

#### Depth = 6

In [None]:
# Random forest: 15 trees, depth limited to 6.
rs = ShuffleSplit(train_size=0.4, test_size=0.2, n_splits=5, random_state=51)
model = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
    random_state=1,
)

# Fit across the splits (skmodel is defined outside this section).
models = skmodel(X_train, y_train, model, rs)

# Record the models' outputs under the label 'randomforest6'.
applytest(X_test, models, 'randomforest6')
applytrain(X_train, models, 'randomforest6')

#### Depth = 13

In [None]:
# Random forest: 10 trees, depth limited to 13.
rs = ShuffleSplit(train_size=0.3, test_size=0.2, n_splits=5, random_state=52)
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=13,
    random_state=3,
)

# Fit across the splits (skmodel is defined outside this section).
models = skmodel(X_train, y_train, model, rs)

# Record the models' outputs under the label 'randomforest13'.
applytest(X_test, models, 'randomforest13')
applytrain(X_train, models, 'randomforest13')

In [None]:
train_st1.head()

#### depth=15

In [None]:
# Random forest: 10 trees, depth limited to 15.
rs = ShuffleSplit(train_size=0.3, test_size=0.2, n_splits=5, random_state=53)
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=15,
    random_state=3,
)

# Fit across the splits (skmodel is defined outside this section).
models = skmodel(X_train, y_train, model, rs)

# Record the models' outputs under the label 'randomforest15'.
applytest(X_test, models, 'randomforest15')
applytrain(X_train, models, 'randomforest15')

#### depth=20

In [None]:
# Random forest: 10 trees, depth limited to 20.
rs = ShuffleSplit(train_size=0.3, test_size=0.2, n_splits=5, random_state=54)
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=3,
)

# Fit across the splits (skmodel is defined outside this section).
models = skmodel(X_train, y_train, model, rs)

# Record the models' outputs under the label 'randomforest20'.
applytest(X_test, models, 'randomforest20')
applytrain(X_train, models, 'randomforest20')

In [None]:
train_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/cat_randomforest_train.csv', index=False)

In [None]:
test_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/cat_randomforest_test.csv', index=False)

### その他

#### 重回帰

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares with default settings, fitted on five random splits
# (train_size=0.8 / test_size=0.2).
rs = ShuffleSplit(train_size=0.8, test_size=0.2, n_splits=5, random_state=10)
model = LinearRegression()

# skmodel is defined outside this section.
models = skmodel(X_train, y_train, model, rs)

In [None]:
# Pass the linear-regression models to applytest/applytrain with the label
# 'linearregression' (helpers defined outside this section; presumably they add
# a column of that name to test_st1 / train_st1 — confirm).
applytest(X_test, models, 'linearregression')
applytrain(X_train, models, 'linearregression')

#### Kmeans

In [None]:
from sklearn.cluster import KMeans

# KMeans with default settings (fixed seed), run through the same skmodel
# pipeline as the regressors.
# NOTE(review): KMeans is a clustering estimator, so its predict() returns a
# cluster index rather than an estimate of y_train, and cluster numbering is
# not guaranteed to match between separately fitted models. skmodel and the
# apply* helpers are defined outside this section — confirm that scoring and
# averaging these outputs is meaningful before using the 'kmeans' feature.
model = KMeans(random_state=0)
rs = ShuffleSplit(n_splits=5, train_size=0.6, test_size=0.2, random_state=13)

models = skmodel(X_train, y_train, model, rs)

In [None]:
# Pass the KMeans models to applytest/applytrain with the label 'kmeans'
# (helpers defined outside this section; presumably they add a column of that
# name to test_st1 / train_st1 — confirm).
applytest(X_test, models, 'kmeans')
applytrain(X_train, models, 'kmeans')

#### K近傍
applyに時間かかりすぎな件、SKIP.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# 3-nearest-neighbour regressor fitted on five random splits
# (train_size=0.6 / test_size=0.2).
rs = ShuffleSplit(train_size=0.6, test_size=0.2, n_splits=5, random_state=11)
model = KNeighborsRegressor(n_neighbors=3)

# skmodel is defined outside this section.
models = skmodel(X_train, y_train, model, rs)

In [None]:
models

In [None]:
# Pass the k-nearest-neighbour models to applytest/applytrain with the label
# 'kneighbor'. The section heading marks this step as skipped because applying
# the models takes too long.
applytest(X_test, models, 'kneighbor')
applytrain(X_train, models, 'kneighbor')

#### Ridge

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings (fixed seed), fitted on five random
# splits (train_size=0.6 / test_size=0.3).
rs = ShuffleSplit(train_size=0.6, test_size=0.3, n_splits=5, random_state=12)
model = Ridge(random_state=0)

# skmodel is defined outside this section.
models = skmodel(X_train, y_train, model, rs)

In [None]:
# Pass the ridge models to applytest/applytrain with the label 'ridge'
# (helpers defined outside this section; presumably they add a column of that
# name to test_st1 / train_st1 — confirm).
applytest(X_test, models, 'ridge')
applytrain(X_train, models, 'ridge')

In [None]:
test_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/sonota_test.csv', index=False)

In [None]:
train_st1.to_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/sonota_train.csv', index=False)

## 二段目

In [None]:
# Reload the stage-1 outputs saved above. Each train/test pair is a copy of
# train_st1 / test_st1 written at a different point of the notebook:
#   *_1: after the LightGBM "fair" run
#   *_2: after the random-forest runs
#   *_3: after the remaining sklearn models
X_train_1 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbm_tofair_train.csv')
X_test_1 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/lgbm_tofair_test.csv')
X_train_2 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/cat_randomforest_train.csv')
X_test_2 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/cat_randomforest_test.csv')
X_train_3 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/sonota_train.csv')
X_test_3 = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/sonota_test.csv')

In [None]:
# Put all stage-1 training outputs side by side. The three files are written
# from the same train_st1 frame at different points, so a later file can repeat
# columns of an earlier one (not only 'lgbmregression'). Keep the first copy of
# every column name so the stage-2 model never sees duplicated features.
X_train = pd.concat([X_train_1, X_train_2, X_train_3], axis=1)
X_train = X_train.loc[:, ~X_train.columns.duplicated()]


In [None]:
# Put all stage-1 test outputs side by side. The three files are written from
# the same test_st1 frame at different points, so a later file can repeat
# columns of an earlier one (not only 'lgbmregression'). Keep the first copy of
# every column name so the columns match what the stage-2 model expects.
X_test = pd.concat([X_test_1, X_test_2, X_test_3], axis=1)
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

In [None]:
X_train

In [None]:
# Stage 2: LightGBM regression trained on the stage-1 outputs in X_train.
# The string below appears to be the author's usage template for lgbmmodel;
# it is kept verbatim and has no effect when the cell runs.
'''
LGBM用。例
Xtrain　DataFrame
ytrain　Series
params = {"objective": "**",
        "metric": "rmse",
        "verbosity": -1,   #warningを出力しない
        "random_state": 0
        }
rs = ShuffleSplit(n_splits=**, train_size=**, test_size=**, random_state=**)
'''
# Five random splits, each using train_size=0.7 / test_size=0.3 of the rows.
rs = ShuffleSplit(train_size=0.7, test_size=0.3, n_splits=5, random_state=7)
params = dict(
    objective="regression",
    metric="rmse",
    verbosity=-1,         # do not print warnings
    random_state=7,
)

# One model per split (lgbmmodel is defined outside this section).
models = lgbmmodel(X_train, y_train, params, rs)

In [None]:
# Predict the test set with every stage-2 model: one column per model.
sub = pd.DataFrame([])

for i, model in enumerate(models):
  sub[i] = model.predict(X_test)

# Row-wise mean over all model columns. It is taken before 'ans' is added, so
# it works for any number of models instead of assuming exactly five.
sub['ans'] = sub.mean(axis=1)

In [None]:
# Shape the submission: the test IDs next to the averaged stage-2 prediction.
test_df = pd.read_csv('/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/test.csv')
sub0319 = test_df[['ID']].copy()
sub0319[TARGET] = sub['ans']

# Write the submission file.
sub0319.to_csv(
    '/content/drive/MyDrive/2023_nishika/2023_01_Nishika_estate/sub0319.csv',
    index=False,
)

# 保管

In [None]:
# How to save a LightGBM model (reminder only; nothing in this cell runs)
# Save the model
# model.save_model('model.txt')