In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pandas.tseries.offsets import DateOffset
import unicodedata, re
import os

# 読み込み #

In [2]:
# === 入出力ディレクトリ設定 ===
# 入力は project_root/inputs
INPUT_DIR = os.path.join(os.getcwd(), "input")
# 出力は project_root/outputs
OUTPUT_DIR = os.path.join(os.getcwd(), "output")

# ディレクトリが無ければ作成
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("INPUT_DIR:", INPUT_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)



INPUT_DIR: /Users/okada1015/Desktop/マリサ/marisa/input
OUTPUT_DIR: /Users/okada1015/Desktop/マリサ/marisa/output


In [3]:
# 1. データ読み込み（必要に応じて dtype や parse_dates 指定）
df_reports = pd.read_csv(
    os.path.join(INPUT_DIR, "ASSESSMENT_REPORTS.csv"),
    parse_dates=['ASSESSED_ON']
)

df_deal_mansions = pd.read_csv(
    os.path.join(INPUT_DIR, "DEAL_MANSIONS.csv"),
    parse_dates=['ADDED_ON']
)

df_deal_lots = pd.read_csv(
    os.path.join(INPUT_DIR, "DEAL_LOTS.csv"),
    parse_dates=['ADDED_ON']
)

df_apartments = pd.read_csv(
    os.path.join(INPUT_DIR, "APARTMENTS.csv")
)

# 1. データ読み込み（必要に応じて dtype や parse_dates 指定）
df_reports_all = pd.read_csv(
    os.path.join(OUTPUT_DIR, "1.ASSESSMENT_REPORTS_MERGE.csv"),
    parse_dates=['ASSESSED_ON']
)


  df_reports = pd.read_csv(
  df_reports_all = pd.read_csv(


# 中身の確認 #

In [4]:
print(df_reports.shape)
print(df_deal_mansions.shape)
print(df_deal_lots.shape)
print(df_apartments.shape)

(338003, 307)
(2966708, 5)
(6653677, 5)
(145146, 47)


In [5]:
pd.set_option('display.max_columns', None)
print(df_deal_mansions.columns.to_list())

['C_ID', 'FLOOR_NUMBER', 'OWNER_SPACE', 'ADDED_ON', 'APARTMENT_NAME']


In [6]:
df_deal_lots.columns.to_list()

['PROPERTY_KIND', 'C_ID', 'LAND_SPACE', 'BUILDING_SPACE', 'ADDED_ON']

In [7]:
pd.set_option('display.max_columns', None)
print(df_apartments.columns.to_list())

['ID', 'PARENT_ID', 'PARENT', 'TYPE', 'NAME', 'NAME_EN', 'NAME_KANA', 'YAHOO_NAME', 'DISPLAY_NAME', 'SEARCH_NAME', 'APARTMENT_BRAND_ID', 'P_ID', 'M_ID', 'T_ID', 'C_ID', 'ADDRESS_NAME', 'OTHER_ADDRESS', 'BUILD_YEAR', 'BUILD_MONTH', 'GROUND_FLOOR', 'UNDERGROUND_FLOOR', 'UNIT_COUNT', 'STRUCTURE', 'IS_ISOLATION', 'IS_RESISTANCE', 'OWNERSHIP', 'MIN_SPACE', 'MAX_SPACE', 'AVG_SPACE', 'LAND_SPACE', 'BUILD_SPACE', 'LAT', 'LON', 'STREET_VIEW', 'USAGE_AREA1', 'USAGE_AREA2', 'NEW_CONSTRUCTION', 'STATUS', 'VERIFICATION', 'IS_APPROVING', 'NAME_UNMATCHED', 'GOOGLE_SIMILAR_NAME_KIND', 'GOOGLE_SIMILAR_NAME', 'GOOGLE_SIMILAR_NAME_GAINED', 'APARTMENT_KIND', 'CREATED', 'MODIFIED']


In [8]:
df_deal_mansions.head(5)

Unnamed: 0,C_ID,FLOOR_NUMBER,OWNER_SPACE,ADDED_ON,APARTMENT_NAME
0,14108015001,7.0,75.26,2025-01-25,パークシティ金沢八景　Ａ棟
1,11107011002,7.0,134.83,2025-01-26,ヴィアーレ浦和
2,13103024004,3.0,21.52,2025-01-26,パレ・ドール高輪
3,13105010004,7.0,30.31,2025-01-26,ベルジェンド文京千駄木ノーブル
4,13102035003,12.0,63.08,2025-01-04,プロスタイル日本橋馬喰町


In [9]:
df_deal_lots.head(5)

Unnamed: 0,PROPERTY_KIND,C_ID,LAND_SPACE,BUILDING_SPACE,ADDED_ON
0,1,36403005104,563.0,,2017-11-08
1,2,23229005113,139.35,105.8,2017-11-08
2,2,1578001000,304.54,102.27,2017-11-08
3,2,40217051003,127.92,96.05,2017-11-08
4,2,1204195022,141.8,141.2,2017-11-08


In [10]:
df_apartments.head(5)

Unnamed: 0,ID,PARENT_ID,PARENT,TYPE,NAME,NAME_EN,NAME_KANA,YAHOO_NAME,DISPLAY_NAME,SEARCH_NAME,APARTMENT_BRAND_ID,P_ID,M_ID,T_ID,C_ID,ADDRESS_NAME,OTHER_ADDRESS,BUILD_YEAR,BUILD_MONTH,GROUND_FLOOR,UNDERGROUND_FLOOR,UNIT_COUNT,STRUCTURE,IS_ISOLATION,IS_RESISTANCE,OWNERSHIP,MIN_SPACE,MAX_SPACE,AVG_SPACE,LAND_SPACE,BUILD_SPACE,LAT,LON,STREET_VIEW,USAGE_AREA1,USAGE_AREA2,NEW_CONSTRUCTION,STATUS,VERIFICATION,IS_APPROVING,NAME_UNMATCHED,GOOGLE_SIMILAR_NAME_KIND,GOOGLE_SIMILAR_NAME,GOOGLE_SIMILAR_NAME_GAINED,APARTMENT_KIND,CREATED,MODIFIED
0,4219670,,0,,アーバンパレス別府パークビュー,アーバンパレス別府パークビュー,あーばんぱれすべっぷぱーくびゅー,アーバンパレス別府パークビュー,アーバンパレス別府パークビュー,"アーバンパレス別府パークビュー,あーばんぱれすべっぷぱーくびゅー",,40,40136,40136025,40136025004,福岡県福岡市城南区別府４丁目,２ー３２,2019,1,9,,57,4,0,0,1,70.44,70.44,70.44,,,33.573203,130.363907,"{""Da"":130.36421416360884,""Ea"":33.5734766644747...",11.0,,0,1,0,0,0,,,,1,2025-01-28 15:24:47.000,2025-06-15 00:44:24.000
1,4219671,,0,,ベラジオ京都円町,ベラジオ京都円町,べらじおきょうとえんまち,ベラジオ京都円町,ベラジオ京都円町,"ベラジオ京都円町,べらじおきょうとえんまち",,26,26102,26102070,26102070000,京都府京都市上京区大宮町,４７８,2020,5,5,,36,4,0,0,1,,,,,,35.021304,135.735554,"{""Da"":135.7358080682373,""Ea"":35.02116509593455...",12.0,,0,1,0,0,0,,,,1,2025-01-28 15:49:37.000,2025-06-30 00:43:05.000
2,4219672,,0,,ベラジオ西陣聚楽,ベラジオ西陣聚楽,べらじおにしじんじゅらく,ベラジオ西陣聚楽,ベラジオ西陣聚楽,"ベラジオ西陣聚楽,べらじおにしじんじゅらく",,26,26102,26102464,26102464000,京都府京都市上京区百万遍町,９４,2012,3,7,,25,4,0,0,1,27.3,29.63,28.8533,,,35.023632,135.74267,"{""Da"":135.74251612560104,""Ea"":35.0235335583649...",5.0,,0,1,0,0,0,,,,1,2025-01-28 16:08:18.000,2025-01-28 16:08:20.000
3,4219673,,0,,鈴蘭泉台第2住宅4号棟,鈴蘭泉台第２住宅４号棟,すずらんいずみだいだいにじゅうたくよんごうとう,鈴蘭泉台第2住宅4号棟,鈴蘭泉台第2住宅4号棟鈴蘭泉台第2住宅4号棟,"鈴蘭泉台第2住宅4号棟,鈴蘭泉台第２住宅４号棟,すずらんいずみだいだいにじゅうたくよんごうと...",,28,28109,28109007,28109007006,兵庫県神戸市北区泉台６丁目,７－４,1989,1,4,,12,4,0,0,1,,,,,,34.735677,135.132179,"{""Da"":135.1317304853855,""Ea"":34.73634901553597...",11.0,,0,1,0,0,0,,,,1,2025-01-28 18:19:36.000,2025-01-28 18:19:36.000
4,4219674,,0,,プレサンス京都北野白梅町,PRESSANCE京都北野白梅町,ぷれさんすきょうときたのはくばいちょう,プレサンス京都北野白梅町,プレサンス京都北野白梅町,"プレサンス京都北野白梅町,PRESSANCE京都北野白梅町,ぷれさんすきょうときたのはくばい...",,26,26102,26102120,26102120000,京都府京都市上京区北伊勢殿構町,６７９,2022,8,7,,43,4,0,0,1,,,,,,35.026685,135.742592,"{""Da"":135.74243162117088,""Ea"":35.0265228390419...",5.0,,0,1,0,0,1,,,,1,2025-01-29 10:07:54.000,2025-07-29 00:38:38.000


### df_deal_mansions(マンションの取引事例)の重複確認 ###

In [11]:
value_counts = df_deal_mansions['C_ID'].value_counts()
print(value_counts)

C_ID
13108001001    3257
13103011004    2981
13108017001    2749
14104092000    2535
13103009004    2524
               ... 
20201002000       1
4102030003        1
28106059001       1
21201329000       1
26209032001       1
Name: count, Length: 43823, dtype: int64


In [12]:
df_deal_mansions[df_deal_mansions["C_ID"] == 13103011004].head(10)

Unnamed: 0,C_ID,FLOOR_NUMBER,OWNER_SPACE,ADDED_ON,APARTMENT_NAME
1236,13103011004,6.0,91.41,2025-02-01,キャピタルマークタワー
2033,13103011004,10.0,85.14,2025-06-04,クラッシィハウス芝浦
2043,13103011004,34.0,88.33,2025-06-04,芝浦アイランドケープタワー
3588,13103011004,8.0,76.22,2024-10-26,キャピタルマークタワー
4029,13103011004,43.0,51.08,2024-10-26,芝浦アイランドグローヴタワー
6517,13103011004,3.0,16.5,2021-06-25,東京ベイビュウ
6941,13103011004,3.0,21.84,2022-10-06,東京ベイサイド
8891,13103011004,4.0,78.58,2025-02-17,プラウドタワー芝浦
9771,13103011004,5.0,33.74,2025-02-27,東京ベイサイド
11460,13103011004,27.0,71.25,2024-10-31,キャピタルマークタワー


### df_apartments(マンション取引事例のC_ID紐付け用)の重複確認 ###

In [13]:
value_counts = df_apartments['C_ID'].value_counts()
print(value_counts)

C_ID
13204006003    98
14110026000    93
14109035000    87
13209038001    85
14110021000    71
               ..
23112144002     1
23108011001     1
47205029001     1
23108013003     1
27141021004     1
Name: count, Length: 35270, dtype: int64


In [14]:
df_apartments[df_apartments["C_ID"] == 13103011004].head(5)

Unnamed: 0,ID,PARENT_ID,PARENT,TYPE,NAME,NAME_EN,NAME_KANA,YAHOO_NAME,DISPLAY_NAME,SEARCH_NAME,APARTMENT_BRAND_ID,P_ID,M_ID,T_ID,C_ID,ADDRESS_NAME,OTHER_ADDRESS,BUILD_YEAR,BUILD_MONTH,GROUND_FLOOR,UNDERGROUND_FLOOR,UNIT_COUNT,STRUCTURE,IS_ISOLATION,IS_RESISTANCE,OWNERSHIP,MIN_SPACE,MAX_SPACE,AVG_SPACE,LAND_SPACE,BUILD_SPACE,LAT,LON,STREET_VIEW,USAGE_AREA1,USAGE_AREA2,NEW_CONSTRUCTION,STATUS,VERIFICATION,IS_APPROVING,NAME_UNMATCHED,GOOGLE_SIMILAR_NAME_KIND,GOOGLE_SIMILAR_NAME,GOOGLE_SIMILAR_NAME_GAINED,APARTMENT_KIND,CREATED,MODIFIED
2416,751,,0,,芝浦アイランドケープタワー,芝浦アイランドケープタワー,しばうらあいらんどけーぷたわー,芝浦アイランドケープタワー,芝浦アイランドケープタワー,"芝浦アイランドケープタワー,しばうらあいらんどけーぷたわー",,13,13103,13103011,13103011004,東京都港区芝浦４丁目,１９－１,2006,10,48,1.0,1095,4,0,0,1,44.1,167.25,,16908.83,,35.637375,139.750283,"{""Da"":139.75133000000005,""Ea"":35.63554,""headin...",3.0,,0,1,0,0,0,,,,1,2017-08-03 10:41:33.000,2025-07-29 23:17:38.000
3429,1918,,0,,芝浦アイランドグローヴタワー,芝浦アイランドグローヴタワー,しばうらあいらんどぐろーぶたわー,芝浦アイランドグローヴタワー,芝浦アイランドグローヴタワー,"芝浦アイランドグローヴタワー,しばうらあいらんどぐろーぶたわー",,13,13103,13103011,13103011004,東京都港区芝浦４丁目,２１－１,2006,11,49,1.0,833,4,0,0,1,47.06,184.31,,12595.6,,35.640634,139.751225,"{""Da"":139.75079200000005,""Ea"":35.641464,""headi...",3.0,2.0,0,1,0,0,0,,,,1,2017-08-03 10:41:33.000,2025-07-25 23:19:16.000
3441,1932,,0,,三田ナショナルコート,三田ナショナルコート,みたなしょなるこーと,三田ナショナルコート,三田ナショナルコート,"三田ナショナルコート,みたなしょなるこーと",,13,13103,13103011,13103011004,東京都港区芝浦４丁目,４－２７,1982,1,15,,414,5,0,0,1,51.3,99.6,67.4137,,,35.639181,139.744516,"{""Da"":139.7446281940119,""Ea"":35.64067115080474...",6.0,,0,1,0,0,1,,,,1,2017-08-03 10:41:33.000,2025-07-30 23:11:47.000
3445,1937,,0,,キャピタルマークタワー,キャピタルマークタワー,きゃぴたるまーくたわー,キャピタルマークタワー,キャピタルマークタワー,"キャピタルマークタワー,きゃぴたるまーくたわー",,13,13103,13103011,13103011004,東京都港区芝浦４丁目,１０－１,2007,11,47,1.0,869,4,0,0,1,42.79,171.54,,11276.21,,35.641206,139.746378,"{""Da"":139.746042,""Ea"":35.641898,""heading"":-201...",6.0,,0,1,0,0,0,,,,1,2017-08-03 10:41:33.000,2025-07-29 23:18:57.000
3467,1963,,0,,メゾン田町,メゾン田町,めぞんたまち,メゾン田町,メゾン田町,"メゾン田町,めぞんたまち",,13,13103,13103011,13103011004,東京都港区芝浦４丁目,８－１２,1983,12,11,,186,5,0,0,1,40.18,73.62,,,,35.639204,139.746572,"{""Da"":139.74723199999994,""Ea"":35.639409,""headi...",6.0,,0,1,0,0,0,,,,1,2017-08-03 10:41:33.000,2025-06-30 23:04:39.000


# C_IDの欠損率の確認 #

In [15]:
#欠損数・欠損率・欠損有無を確認する関数
def check_missing_info(df, columns=None):
    """
    Parameters:
    ----------
    df : 対象のデータフレーム
    columns : チェック対象のカラムリスト。None の場合は全カラム。
    ----------
    """
    # カラム指定がある場合のみ抽出
    target_df = df[columns] if columns is not None else df

    # 欠損情報の集計
    missing_info = target_df.isnull().sum().to_frame(name='欠損数')
    missing_info['全体件数'] = len(df)
    missing_info['欠損率(%)'] = (missing_info['欠損数'] / missing_info['全体件数'] * 100).round(2)
    missing_info['欠損有無'] = missing_info['欠損数'] > 0
    missing_info = missing_info[['欠損数', '全体件数', '欠損率(%)', '欠損有無']]

    return missing_info

In [16]:
with pd.option_context('display.max_rows', None):
    display(check_missing_info(df_deal_mansions))

Unnamed: 0,欠損数,全体件数,欠損率(%),欠損有無
C_ID,0,2966708,0.0,False
FLOOR_NUMBER,34902,2966708,1.18,True
OWNER_SPACE,0,2966708,0.0,False
ADDED_ON,0,2966708,0.0,False
APARTMENT_NAME,68055,2966708,2.29,True


In [17]:
with pd.option_context('display.max_rows', None):
    display(check_missing_info(df_deal_lots))

Unnamed: 0,欠損数,全体件数,欠損率(%),欠損有無
PROPERTY_KIND,0,6653677,0.0,False
C_ID,0,6653677,0.0,False
LAND_SPACE,0,6653677,0.0,False
BUILDING_SPACE,3249130,6653677,48.83,True
ADDED_ON,0,6653677,0.0,False


# 前処理 #

## assessment_reportsの絞り込み ##

In [18]:
# ASSESSED_ON を日付型に変換
df_reports['ASSESSED_ON'] = pd.to_datetime(df_reports['ASSESSED_ON'], errors='coerce')

# 期間を設定
start_date = pd.to_datetime("2023-02-01")
end_date = pd.to_datetime("2025-01-31")

# 上記の期間に該当するデータを抽出
df_reports_recent = df_reports[
    (df_reports['ASSESSED_ON'] >= start_date) &
    (df_reports['ASSESSED_ON'] <= end_date)
].copy()

#['IS_DELETED'] == 0のデータに絞る
df_reports_recent = df_reports_recent[df_reports_recent['IS_DELETED'] == 0]

#['OWNER_SPACE'] >= 40のデータに絞る
df_reports_mansion = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 1]
print(df_reports_mansion.shape)
df_reports_land = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 2]
print(df_reports_land.shape)
df_reports_house = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 3]
print(df_reports_house.shape)

df_reports_mansion['OWNER_SPACE_NUM'] = (
    df_reports_mansion['OWNER_SPACE']
    .astype(str)
    .str.extract(r'([0-9]+(?:\.[0-9]+)?)')
    .astype(float)
)
# OWNER_SPACE_NUM >= 40 でフィルタ
df_reports_mansion = df_reports_mansion[df_reports_mansion['OWNER_SPACE_NUM'] >= 40]

print(df_reports_mansion.shape)

df_reports_recent = pd.concat([df_reports_mansion, df_reports_land, df_reports_house], ignore_index=True)

# 件数確認
print("全件:", df_reports.shape)
print("2023/2/1～2025/1/31:", df_reports_recent.shape)

(125587, 307)
(24658, 307)
(69099, 307)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_mansion['OWNER_SPACE_NUM'] = (


(113031, 308)
全件: (338003, 307)
2023/2/1～2025/1/31: (206788, 308)


## dealの絞り込み ##

In [19]:
df_deal_mansions[' ADDED_ON'] = pd.to_datetime(df_deal_mansions['ADDED_ON'], errors='coerce')
df_deal_lots['ADDED_ON'] = pd.to_datetime(df_deal_lots['ADDED_ON'], errors='coerce')

# 期間を設定
start_date = pd.to_datetime("2023-02-01")
end_date = pd.to_datetime("2025-07-31")

# 上記の期間に該当するデータを抽出(マンション)
df_deal_mansions = df_deal_mansions[
    (df_deal_mansions['ADDED_ON'] >= start_date) &
    (df_deal_mansions['ADDED_ON'] <= end_date)
].copy()
print(df_deal_mansions.shape)

# 上記の期間に該当するデータを抽出（土地・戸建て）
df_deal_lots = df_deal_lots[
    (df_deal_lots['ADDED_ON'] >= start_date) &
    (df_deal_lots['ADDED_ON'] <= end_date)
].copy()
print(df_deal_lots.shape)


(627140, 6)
(1715480, 5)


# 媒介契約締結フラグの作成 #

In [20]:

# PROPERTY_KIND = 1（マンション） のフィルタ
df_reports_mansion = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 1]
print(f"✅ step2 PROPERTY_KIND == 1（マンション）の件数: {df_reports_mansion.shape[0]}")

# PROPERTY_KIND = 2（土地） のフィルタ
df_reports_land = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 2]
print(f"✅ step2 PROPERTY_KIND == 2（土地）の件数: {df_reports_land.shape[0]}")

# PROPERTY_KIND = 3（戸建） のフィルタ
df_reports_house = df_reports_recent[df_reports_recent['PROPERTY_KIND'] == 3]
print(f"✅ step2 PROPERTY_KIND == 3（戸建）の件数: {df_reports_house.shape[0]}")

print("合計が合うか確認",len(df_reports_mansion)+len(df_reports_land)+len(df_reports_house))

✅ step2 PROPERTY_KIND == 1（マンション）の件数: 113031
✅ step2 PROPERTY_KIND == 2（土地）の件数: 24658
✅ step2 PROPERTY_KIND == 3（戸建）の件数: 69099
合計が合うか確認 206788


## マンション ##

### C_IDを付与 ###

In [21]:
#数値や文字列の正規化関数

def to_num_series_exact(s: pd.Series) -> pd.Series:
    """文字列→数値抽出→小数第2位で丸め（完全一致用）"""
    s = s.astype(str).map(lambda x: unicodedata.normalize('NFKC', x).strip())
    s = s.str.replace(',', '', regex=False)
    return pd.to_numeric(s.str.extract(r'([0-9]+\.?[0-9]*)')[0], errors='coerce').round(2)

def normalize_cid_keep_zeros(s: pd.Series,
                             *, drop_spaces: bool=True, keep_hyphen: bool=True,
                             case: str="upper") -> pd.Series:
    """C_ID正規化：全角→半角/trim/大小統一、空白削除(既定)、ハイフン保持(既定)、
       末尾 .0 を除去、先頭ゼロは保持
    """
    def _norm(x):
        t = unicodedata.normalize("NFKC", str(x)).strip()
        if case == "upper":   t = t.upper()
        elif case == "lower": t = t.lower()
        if drop_spaces:       t = t.replace(" ", "")
        if not keep_hyphen:   t = t.replace("-", "")
        t = re.sub(r"\.0+$", "", t)   # '13107009004.0' -> '13107009004'
        return t
    return s.map(_norm)


def normalize_text(s: pd.Series) -> pd.Series:
    """物件名などの表記ゆれを軽減（全角→半角、前後空白削除）"""
    return s.astype(str).map(lambda x: unicodedata.normalize('NFKC', x).strip())

In [22]:
# C_ID削除（マンションは全てNaNのため）
df_reports_mansion.drop(columns=['C_ID'], errors='ignore', inplace=True)
df_apartments = df_apartments.rename(columns={'ID':'APARTMENT_KEY_ID'})

# apartmentsからC_IDを結合
df_reports_mansion = df_reports_mansion.merge(
    df_apartments[['APARTMENT_KEY_ID', 'C_ID']],
    left_on='APARTMENT_ID',
    right_on='APARTMENT_KEY_ID',
    how='left'
)

print(df_reports_mansion.shape)

df_reports_mansion.drop(columns=['APARTMENT_KEY_ID'], errors='ignore', inplace=True)

# C_IDの存在確認・変換
df_reports_mansion = df_reports_mansion[df_reports_mansion['C_ID'].notna()]
df_reports_mansion['C_ID'] = df_reports_mansion['C_ID'].astype('int64')

print(f"✅ フィルタ済み df_reports_mansion 作成完了（最終件数: {df_reports_mansion.shape[0]}）")
print(df_reports_mansion.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_mansion.drop(columns=['C_ID'], errors='ignore', inplace=True)


(113031, 309)
✅ フィルタ済み df_reports_mansion 作成完了（最終件数: 113031）
(113031, 308)


In [23]:
df_reports_mansion.columns.tolist()

['ID',
 'USER_ID',
 'IS_DEAL',
 'PROPERTY_KIND',
 'APARTMENT_ID',
 'FLOOR_NUMBER',
 'OWNER_SPACE',
 'LAND_SPACE',
 'BUILDING_SPACE',
 'LAND_AND_BUILDING_SPACE',
 'DIRECTION',
 'ASSESSED_ON',
 'CUSTOMER_NAME',
 'PROPERTY_NAME',
 'STAFF_NAME',
 'COMPANY_DESCRIPTION',
 'EMAIL',
 'TEL',
 'LINE_URL',
 'HOMEPAGE',
 'COVER_ASSESSED_ON',
 'COVER_TITLE',
 'TITLE_FONT_SIZE',
 'COVER_CUSTOMER_NAME',
 'COVER_STAFF_AND_COMPANY_NAME',
 'COVER_DESCRIPTION',
 'COVER_EMAIL',
 'COVER_TEL',
 'COVER_HOMEPAGE',
 'COVER_STAFF_ROUND_IMAGE',
 'COVER_STAFF_ROUND_IMAGE_COMMENT',
 'ABOUT_PROPERTY_NAME',
 'ABOUT_ADDRESS',
 'LATITUDE',
 'LONGITUDE',
 'ABOUT_TRAFFIC',
 'ABOUT_MANAGER',
 'ABOUT_LAND_PRIVILEGE',
 'ABOUT_REGISTRY_GROUND',
 'ABOUT_CITY_PLAN',
 'ABOUT_TERRAIN',
 'ABOUT_LAND_SHAPE',
 'ABOUT_SETBACK_SPACE',
 'ABOUT_CURRENT_STATE',
 'ABOUT_RECONSTRUCT_DISABLED',
 'ABOUT_USAGE_AREA',
 'ABOUT_STRUCTURE',
 'STRUCTURE_ID',
 'ABOUT_BUILT_IN',
 'ABOUT_FLOOR',
 'ABOUT_BUILDING_COVERAGE',
 'ABOUT_CONNECT_ROAD_STAT

### フラグ作成 ###

In [24]:
# 数値化・文字列整形
df_reports_mansion['OWNER_SPACE_NUM'] = to_num_series_exact(df_reports_mansion['OWNER_SPACE'])
df_deal_mansions['OWNER_SPACE_NUM'] = to_num_series_exact(df_deal_mansions['OWNER_SPACE'])
print(df_reports_mansion['OWNER_SPACE_NUM'].dtype)
print(df_deal_mansions['OWNER_SPACE_NUM'].dtype)

# 名前の正規化
df_reports_mansion['PROPERTY_NAME_N'] = normalize_text(df_reports_mansion['PROPERTY_NAME'])
df_deal_mansions['APARTMENT_NAME_N'] = normalize_text(df_deal_mansions['APARTMENT_NAME'])

# 面積40以上の絞り込み
df_reports_mansion = df_reports_mansion[df_reports_mansion['OWNER_SPACE_NUM'] >= 40].copy()
df_deal_mansions = df_deal_mansions[df_deal_mansions['OWNER_SPACE_NUM'] >= 40].copy()
print("面積40以上に絞り込み後df_reports_mansion",df_reports_mansion.shape)
print("面積40以上に絞り込み後df_deal_mansions",df_deal_mansions.shape)

# 小数第2位に丸める処理
df_reports_mansion['OWNER_SPACE_NUM_ROUND2'] = df_reports_mansion['OWNER_SPACE_NUM'].round(2)
df_deal_mansions['OWNER_SPACE_NUM_ROUND2'] = df_deal_mansions['OWNER_SPACE_NUM'].round(2)

# deal側に残すカラム定義（重複除去）
base_cols = ['C_ID', 'FLOOR_NUMBER', 'OWNER_SPACE_NUM_ROUND2', 'APARTMENT_NAME_N', 'ADDED_ON']
extra_cols = [col for col in df_deal_mansions.columns if col not in df_reports_mansion.columns and col not in base_cols]
deal_cols = base_cols + extra_cols  # 重複なしで構築

print(deal_cols)

# マージ処理
merged = df_reports_mansion.merge(
    df_deal_mansions[deal_cols],
    how='left',
    left_on=['C_ID', 'FLOOR_NUMBER', 'OWNER_SPACE_NUM_ROUND2', 'PROPERTY_NAME_N'],
    right_on=['C_ID', 'FLOOR_NUMBER', 'OWNER_SPACE_NUM_ROUND2', 'APARTMENT_NAME_N'],
    suffixes=('', '_DEAL'),
    indicator=True
)

print("マージ後件数",merged.shape)
# display(merged.head())

# 日付条件（6ヶ月以内）
merged['ASSESSED_ON'] = pd.to_datetime(merged['ASSESSED_ON'], errors='coerce')
merged['ADDED_ON'] = pd.to_datetime(merged['ADDED_ON'], errors='coerce')

date_ok = (
    merged['ASSESSED_ON'].notna() &
    merged['ADDED_ON'].notna() &
    (merged['ASSESSED_ON'] <= merged['ADDED_ON']) &
    (merged['ASSESSED_ON'] + pd.DateOffset(months=6) > merged['ADDED_ON'])
)

# フラグ付け
merged['IS_DEAL_MATCH'] = date_ok.astype(int)
print("フラグ付後の件数",merged.shape)
print("フラグ1件数:", merged['IS_DEAL_MATCH'].sum(), "/", merged.shape[0])

# 出力用にコピー（すべてのカラム含む）
df_mansion_flg = merged.copy()
print(df_mansion_flg.shape)
# ID単位で集約（1つでも一致があれば1）
df_mansion_flg_nodup = pd.concat([
    df_mansion_flg[df_mansion_flg['IS_DEAL_MATCH'] == 1]
        .drop_duplicates(subset='ID', keep='first'),
    df_mansion_flg[~df_mansion_flg['ID'].isin(df_mansion_flg['ID'][df_mansion_flg['IS_DEAL_MATCH'] == 1])]
        .drop_duplicates(subset='ID', keep='first')
])
print("IDで集約後の件数",df_mansion_flg_nodup.shape)
print("IDで集約後、フラグ1件数:", df_mansion_flg_nodup['IS_DEAL_MATCH'].sum(), "/", df_mansion_flg_nodup.shape[0])

float64
float64
面積40以上に絞り込み後df_reports_mansion (113031, 309)
面積40以上に絞り込み後df_deal_mansions (492789, 8)
['C_ID', 'FLOOR_NUMBER', 'OWNER_SPACE_NUM_ROUND2', 'APARTMENT_NAME_N', 'ADDED_ON', 'APARTMENT_NAME', ' ADDED_ON']
マージ後件数 (118912, 315)
フラグ付後の件数 (118912, 316)
フラグ1件数: 17201 / 118912
(118912, 316)
IDで集約後の件数 (113031, 316)
IDで集約後、フラグ1件数: 15962 / 113031


### 確認 ###

In [25]:
from IPython.display import display

# 変数 merged は既存のセルで定義済みのものを利用
merged_1 = merged.copy()
# 正しいフィルタ方法に修正
merged_1 = merged_1[merged_1["IS_DEAL_MATCH"] == 1]
# 日付条件の判定結果を確認
date_condition = (
    merged_1['ASSESSED_ON'].notna() &
    merged_1['ADDED_ON'].notna() &
    (merged_1['ASSESSED_ON'] <= merged_1['ADDED_ON']) &
    (merged_1['ASSESSED_ON'] + pd.DateOffset(months=6) > merged_1['ADDED_ON'])
)

print("日付条件を満たす件数:", date_condition.sum(), "/", len(merged_1))
print("日付条件を満たさないサンプル行:")
# ~ はブール値のSeriesに使う。merged[date_condition]はDataFrameなので、date_condition自体を反転する
display(merged_1[~date_condition].head())

日付条件を満たす件数: 17201 / 17201
日付条件を満たさないサンプル行:


Unnamed: 0,ID,USER_ID,IS_DEAL,PROPERTY_KIND,APARTMENT_ID,FLOOR_NUMBER,OWNER_SPACE,LAND_SPACE,BUILDING_SPACE,LAND_AND_BUILDING_SPACE,DIRECTION,ASSESSED_ON,CUSTOMER_NAME,PROPERTY_NAME,STAFF_NAME,COMPANY_DESCRIPTION,EMAIL,TEL,LINE_URL,HOMEPAGE,COVER_ASSESSED_ON,COVER_TITLE,TITLE_FONT_SIZE,COVER_CUSTOMER_NAME,COVER_STAFF_AND_COMPANY_NAME,COVER_DESCRIPTION,COVER_EMAIL,COVER_TEL,COVER_HOMEPAGE,COVER_STAFF_ROUND_IMAGE,COVER_STAFF_ROUND_IMAGE_COMMENT,ABOUT_PROPERTY_NAME,ABOUT_ADDRESS,LATITUDE,LONGITUDE,ABOUT_TRAFFIC,ABOUT_MANAGER,ABOUT_LAND_PRIVILEGE,ABOUT_REGISTRY_GROUND,ABOUT_CITY_PLAN,ABOUT_TERRAIN,ABOUT_LAND_SHAPE,ABOUT_SETBACK_SPACE,ABOUT_CURRENT_STATE,ABOUT_RECONSTRUCT_DISABLED,ABOUT_USAGE_AREA,ABOUT_STRUCTURE,STRUCTURE_ID,ABOUT_BUILT_IN,ABOUT_FLOOR,ABOUT_BUILDING_COVERAGE,ABOUT_CONNECT_ROAD_STATE,ABOUT_CONNECT_ROAD,ABOUT_CAPACITY_RATIO,ABOUT_BUILDING_COVERAGE_AND_CAPACITY_RATIO,ABOUT_UNIT_AMOUNT,ABOUT_SELLER,ABOUT_CONSTRUCTOR,ABOUT_REMARKS,ASSESS_SELL_TITLE,ASSESS_SELL_PRICE,ASSESS_SELL_SQUARE_PRICE,ASSESS_SELL_TSUBO_PRICE,ASSESS_SELL_TERM,ASSESS_SELL_PRICE_FROM,ASSESS_SELL_PRICE_TO,ASSESS_SELL_SELF_STORY_COUNT,ASSESS_SELL_SIMILAR_STORY_COUNT,ASSESS_SELL_BUILDING_PRICE,ASSESS_SELL_BUILDING_UNIT_PRICE,ASSESS_SELL_HOUSE_LAND_PRICE,ASSESS_SELL_COMMENT,ASSESS_HOUSE_COMMENT,ASSESS_SELL_PAGE_BREAK,INDICATE_VISIT_ASSESSMENT,VISIT_ASSESSMENT_PRICE,VISIT_ASSESSMENT_COMMENT,VISIT_ASSESSMENT_MEMO,VISIT_ASSESSMENT_TITLE,ASSESS_SUGGEST_PRICE,ASSESS_SUGGEST_COMMENT,ASSESS_SUGGEST_PAGE_BREAK,CHALLENGE1_TITLE,CHALLENGE1_PRICE,CHALLENGE1_TSUBO_PRICE,CHALLENGE1_TERM,CHALLENGE1_COMMENT,CHALLENGE1_PAGE_BREAK,CHALLENGE2_TITLE,CHALLENGE2_PRICE,CHALLENGE2_TSUBO_PRICE,CHALLENGE2_TERM,CHALLENGE2_COMMENT,CHALLENGE2_PAGE_BREAK,CHALLENGE3_TITLE,CHALLENGE3_PRICE,CHALLENGE3_TSUBO_PRICE,CHALLENGE3_TERM,CHALLENGE3_COMMENT,CHALLENGE3_PAGE_BREAK,ASSESS_PURCHASE_PRICE,ASSESS_PURCHASE_TERM,ASSESS_PURCHASE_COMMENT,ASSESS_PURCHASE_PAGE_BREAK,ASSESS_RENT_TITLE,ASSESS_RENT_PRICE,ASSESS_RENT_YIELD,ASSESS_RENT_COMMENT,ASSESS_RENT_PAGE_BREAK,INDICATE_DEMOLITION_COST,DEMOLITION_P_NAME,DEMOLITION_ABOUT_STRUCTURE,DEMOLITION_BUILDING_SPACE_TSUBO,DEMOLITION_ROAD_WIDTH,DEMOLITION_HOUSE_DISTANCE,DEMOLITION_COST,DEMOLITION_MIN_COST,DEMOLITION_MAX_COST,COVER_STATISTICS_TITLE,COVER_MARKET_TITLE,ASSESS_VISIT_PAGE_BREAK,ASSESS_STAFF_COMMENT,ASSESS_STAFF_PAGE_BREAK,AREA_HUMAN_COMMENT,SELL_FLOW1_TITLE,SELL_FLOW1_DETAIL,SELL_FLOW2_TITLE,SELL_FLOW2_DETAIL,SELL_FLOW3_TITLE,SELL_FLOW3_DETAIL,SELL_FLOW4_TITLE,SELL_FLOW4_DETAIL,SELL_FLOW5_TITLE,SELL_FLOW5_DETAIL,SELL_FLOW6_TITLE,SELL_FLOW6_DETAIL,SELL_FLOW7_TITLE,SELL_FLOW7_DETAIL,LAST_COMMENT,LAST_STAFF_IMAGE,LAST_STAFF_NAME,LAST_TEL,LAST_EMAIL,LAST_PROFILE,INDICATE_COVER_ASSESSED_ON,INDICATE_COVER_LOGO,INDICATE_COVER_CUSTOMER_NAME,INDICATE_COVER_ABOUT,INDICATE_COVER_STAFF_AND_COMPANY_NAME,INDICATE_COVER_DESCRIPTION,INDICATE_COVER_EMAIL,INDICATE_COVER_TEL,INDICATE_COVER_HOMEPAGE,INDICATE_COVER_STAFF_ROUND_IMAGE,INDICATE_STAFF_ROUND_IMAGE,INDICATE_COVER_STAFF_ROUND_IMAGE_COMMENT,INDICATE_ABOUT_INFO,INDICATE_ABOUT_EXCLUSIVE_INFO,INDICATE_ABOUT_MAP,INDICATE_ASSESS_VALUE,INDICATE_ASSESS_SELL,INDICATE_ASSESS_SELL_RANGE,INDICATE_ASSESS_SELL_COUNT,INDICATE_ASSESS_TK_BREAKDOWN,INDICATE_ASSESS_SUGGEST,INDICATE_ASSESS_PURCHASE,INDICATE_ASSESS_RENT,INDICATE_ASSESS_PROPERTY_INFO,INDICATE_ASSESS_VISIT,INDICATE_ASSESS_STAFF_COMMENT,INDICATE_SELL_STORY_SELF,INDICATE_SELL_STORY_SIMILAR,INDICATE_RENT_STORY_SELF,INDICATE_RENT_STORY_SIMILAR,INDICATE_COVER_STATISTICS,INDICATE_MARKET_REPORTS,INDICATE_PRICE_HISTORY,INDICATE_PRICE_HISTORY_GRAPH,INDICATE_PRICE_HISTORY_SELF_COMPARE,INDICATE_PRICE_HISTORY_AREA_COMPARE,INDICATE_AREA_HUMAN,INDICATE_AREA_HUMAN_AGE_GRAPH,INDICATE_AREA_HUMAN_TRANSITION_GRAPH,INDICATE_COVER_MARKET,INDICATE_MARKET_SIZE,INDICATE_MARKET_LAYOUT_PRICE,INDICATE_MARKET_TK_SPACE_PRICE,INDICATE_MARKET_OLD_PRICE,INDICATE_MARKET_SELL_TERM,INDICATE_MORTGAGE_INTEREST,INDICATE_FLOATING_INTEREST_COMMENT,INDICATE_TEN_YEARS_FIXED_INTEREST_COMMENT,INDICATE_ALL_YEARS_FIXED_INTEREST_COMMENT,FLOATING_INTEREST_COMMENT,TEN_YEARS_FIXED_INTEREST_COMMENT,ALL_YEARS_FIXED_INTEREST_COMMENT,INDICATE_MEDIATION_CONTRACT,INDICATE_EXPENSE,INDICATE_EXPENSE1,IS_EXPENSE1_REGULAR,EXPENSE1_SELL_PRICE,EXPENSE1_COMMISSION,EXPENSE1_COMMISSION_RATE,EXPENSE1_STAMP,EXPENSE1_REGISTRATION,EXPENSE1_RESIDUAL,EXPENSE1_DEMOLITION_COST_INDICATE,EXPENSE1_OTHER1_LABEL,EXPENSE1_OTHER1,EXPENSE1_OTHER2_LABEL,EXPENSE1_OTHER2,EXPENSE1_OTHER3_LABEL,EXPENSE1_OTHER3,EXPENSE1_COMMENT,EXPENSE1_TITLE,INDICATE_EXPENSE2,IS_EXPENSE2_REGULAR,EXPENSE2_SELL_PRICE,EXPENSE2_COMMISSION,EXPENSE2_COMMISSION_RATE,EXPENSE2_STAMP,EXPENSE2_REGISTRATION,EXPENSE2_RESIDUAL,EXPENSE2_DEMOLITION_COST_INDICATE,EXPENSE2_OTHER1_LABEL,EXPENSE2_OTHER1,EXPENSE2_OTHER2_LABEL,EXPENSE2_OTHER2,EXPENSE2_OTHER3_LABEL,EXPENSE2_OTHER3,EXPENSE2_COMMENT,EXPENSE2_TITLE,INDICATE_EXPENSE3,IS_EXPENSE3_REGULAR,EXPENSE3_SELL_PRICE,EXPENSE3_COMMISSION,EXPENSE3_COMMISSION_RATE,EXPENSE3_STAMP,EXPENSE3_REGISTRATION,EXPENSE3_RESIDUAL,EXPENSE3_DEMOLITION_COST_INDICATE,EXPENSE3_OTHER1_LABEL,EXPENSE3_OTHER1,EXPENSE3_OTHER2_LABEL,EXPENSE3_OTHER2,EXPENSE3_OTHER3_LABEL,EXPENSE3_OTHER3,EXPENSE3_COMMENT,EXPENSE3_TITLE,INDICATE_SELL_FLOW,INDICATE_SELL_FLOW1,INDICATE_SELL_FLOW2,INDICATE_SELL_FLOW3,INDICATE_SELL_FLOW4,INDICATE_SELL_FLOW5,INDICATE_SELL_FLOW6,INDICATE_SELL_FLOW7,INDICATE_CHALLENGE1,INDICATE_CHALLENGE2,INDICATE_CHALLENGE3,INDICATE_LAST_COMMENT,PAGE_BREAK_SELL_FLOW_AFTER4,PAGE_BREAK_SELL_FLOW_AFTER5,PAGE_BREAK_SELL_FLOW_AFTER6,PAGE_BREAK_BEFORE_LAST_COMMENT,CSS_DESIGN,CSS_COLOR,SP_DESIGN,INDICATE_COVER_INFO,INDICATE_ASSESS_INFO,INDICATE_STORIES_INFO,INDICATE_STATISTICS_INFO,INDICATE_MARKET_INFO,INDICATE_SELL_FLOW_INFO,INDICATE_INSERT_PAGE_TITLE_COVER,INDICATE_INSERT_PAGE_TITLE_ABOUT,INDICATE_INSERT_PAGE_TITLE_ASSESS,INDICATE_INSERT_PAGE_TITLE_STORIES,INDICATE_INSERT_PAGE_TITLE_STATISTICS,INDICATE_INSERT_PAGE_TITLE_MARKET,INDICATE_INSERT_PAGE_TITLE_SELL_FLOW,INDICATE_INSERT_PAGE_TITLE_EXPENSE,INDICATE_PAGE_NUM,COMPANY_SUMMARY,ABOUT_LAND_SPACE,ABOUT_BUILDING_SPACE,ABOUT_BUILDING_COVERAGE_UNIT,ABOUT_CAPACITY_RATIO_UNIT,COVER_COMPANY_TEL,INDICATE_COVER_COMPANY_TEL,PDF,COVER_PDF_TITLE,ABOUT_PDF_TITLE,ASSESS_PDF_TITLE,STORIES_PDF_TITLE,STATISTICS_PDF_TITLE,MARKET_PDF_TITLE,SELL_FLOW_PDF_TITLE,EXPENSE_PDF_TITLE,SECTION_ORDER,FONT,SP_READ_AT,SP_TOKEN,UPDATED_AT,IS_DELETED,TUTORIAL_SP,OWNER_SPACE_NUM,C_ID,PROPERTY_NAME_N,OWNER_SPACE_NUM_ROUND2,APARTMENT_NAME_N,ADDED_ON,APARTMENT_NAME,ADDED_ON.1,_merge,IS_DEAL_MATCH


### 保存 ###

In [29]:
# df_mansion_flg.to_csv(os.path.join(OUTPUT_DIR, "mansion_flg_from_python_ID重複あり.csv"), index=False)
df_mansion_flg_nodup.to_csv(os.path.join(OUTPUT_DIR, "2.mansion_flg_from_python.csv"), index=False)
    

In [30]:
print(df_mansion_flg.shape)
print(df_mansion_flg_nodup.shape)

(118912, 316)
(113031, 316)


## 土地 ##

In [31]:

#deal_landを土地で絞りこみ　
deal_land  = df_deal_lots[df_deal_lots['PROPERTY_KIND'] == 1].copy()

# 面積を小数第2位で丸め（完全一致前提）
df_reports_land['LAND_SPACE_NUM'] = to_num_series_exact(df_reports_land['LAND_SPACE'])
deal_land['LAND_SPACE_NUM']       = to_num_series_exact(deal_land['LAND_SPACE'])

#----------------------------C_IDについての処理---------------------------------------
# C_ID 正規化（先頭0保持・末尾.0除去）
df_reports_land['CID_N'] = normalize_cid_keep_zeros(df_reports_land['C_ID'])
deal_land['CID_N'] = normalize_cid_keep_zeros(deal_land['C_ID'])

print("deal_land件数",deal_land.shape)

# もしレポート側で先頭ゼロが落ちている可能性がある場合は、deal側の桁数に合わせて左ゼロ埋め
# 例：dealの数字のみ C_ID の最頻桁数をターゲットにする

deal_digits = deal_land['CID_N'].str.fullmatch(r'\d+')
if deal_digits.any():
    target_len = deal_land.loc[deal_digits, 'CID_N'].str.len().mode().iat[0]  # 最頻長
else:
    target_len = None

if target_len:
    df_reports_land['CID_N'] = np.where(
        df_reports_land['CID_N'].str.fullmatch(r'\d+'),
        df_reports_land['CID_N'].str.zfill(target_len),  # 数字のみなら左ゼロ埋め
        df_reports_land['CID_N']                         # 英字混在はそのまま
    )
else:
    df_reports_land['CID_N'] = df_reports_land['CID_N']
#----------------------------------------------------------------------------------

#一致件数チェック
cid_match = df_reports_land['CID_N'].isin(set(deal_land['CID_N']))
print("C_ID一致行数:", cid_match.sum(), "/", len(cid_match))

# マージ
m_land = df_reports_land.merge(
    deal_land[['CID_N', 'LAND_SPACE_NUM', 'ADDED_ON']],
    how='left',
    left_on=['CID_N', 'LAND_SPACE_NUM'],
    right_on=['CID_N', 'LAND_SPACE_NUM'],
    suffixes=('_report', '_deal'),
    indicator=True
)

print("マージ後件数",m_land.shape)

# 日付条件：ASSESSED_ON <= ADDED_ON < ASSESSED_ON + 6ヶ月　を見致していたらTrueを返す
m_land['ASSESSED_ON'] = pd.to_datetime(m_land['ASSESSED_ON'], errors='coerce')
m_land['ADDED_ON']    = pd.to_datetime(m_land['ADDED_ON'],    errors='coerce')
print(m_land[['ASSESSED_ON', 'ADDED_ON']].dtypes)

date_ok = (
    m_land['ASSESSED_ON'].notna() &
    m_land['ADDED_ON'].notna() &
    (m_land['ASSESSED_ON'] <= m_land['ADDED_ON']) &
    (m_land['ASSESSED_ON'] + pd.DateOffset(months=6) > m_land['ADDED_ON'])
)
print("日付条件OK行数:", date_ok.fillna(False).sum())

# フラグ：ADDED_ONが付与され、面積完全一致、日付条件クリア
m_land['HIT'] = m_land['ADDED_ON'].notna() & date_ok
print("HIT列のTrue件数:", m_land['HIT'].sum(), "/", m_land.shape[0])

# 各IDグループ内で、HIT列にTrueが1つでもあれば１とする
# フラグを作成
hit_flag = (
    m_land.groupby('ID')['HIT'].any()
    .rename('IS_DEAL_MATCH')
    .astype(int)
    .reset_index()
)
# フラグをマージ（ID ごとに）
df_land_flg = m_land.merge(hit_flag, on='ID', how='left')
print("フラグ付け後件数",df_land_flg.shape)
print("土地フラグ1件数:", df_land_flg['IS_DEAL_MATCH'].sum(), "/", df_land_flg.shape[0])

#重複削除
df_land_flg_nodup = (
    df_land_flg
    .sort_values(by='IS_DEAL_MATCH', ascending=False)  # フラグが1の方を上に
    .drop_duplicates(subset=['ID'])  # 最初の1件を残す（= フラグ1優先）
)
print("重複削除後件数:", df_land_flg_nodup.shape)
print("重複削除後、フラグ1件数:", df_land_flg_nodup['IS_DEAL_MATCH'].sum(), "/", df_land_flg_nodup.shape[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_land['LAND_SPACE_NUM'] = to_num_series_exact(df_reports_land['LAND_SPACE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_land['CID_N'] = normalize_cid_keep_zeros(df_reports_land['C_ID'])


deal_land件数 (795338, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_land['CID_N'] = np.where(


C_ID一致行数: 23388 / 24658
マージ後件数 (25256, 312)
ASSESSED_ON    datetime64[ns]
ADDED_ON       datetime64[ns]
dtype: object
日付条件OK行数: 3395
HIT列のTrue件数: 3395 / 25256
フラグ付け後件数 (25256, 314)
土地フラグ1件数: 3628 / 25256
重複削除後件数: (24658, 314)
重複削除後、フラグ1件数: 3196 / 24658


In [32]:
m_land[m_land['ID'] == 47286]

Unnamed: 0,ID,USER_ID,IS_DEAL,PROPERTY_KIND,APARTMENT_ID,FLOOR_NUMBER,OWNER_SPACE,LAND_SPACE,BUILDING_SPACE,LAND_AND_BUILDING_SPACE,DIRECTION,ASSESSED_ON,CUSTOMER_NAME,PROPERTY_NAME,STAFF_NAME,COMPANY_DESCRIPTION,EMAIL,TEL,LINE_URL,HOMEPAGE,COVER_ASSESSED_ON,COVER_TITLE,TITLE_FONT_SIZE,COVER_CUSTOMER_NAME,COVER_STAFF_AND_COMPANY_NAME,COVER_DESCRIPTION,COVER_EMAIL,COVER_TEL,COVER_HOMEPAGE,COVER_STAFF_ROUND_IMAGE,COVER_STAFF_ROUND_IMAGE_COMMENT,ABOUT_PROPERTY_NAME,ABOUT_ADDRESS,LATITUDE,LONGITUDE,C_ID,ABOUT_TRAFFIC,ABOUT_MANAGER,ABOUT_LAND_PRIVILEGE,ABOUT_REGISTRY_GROUND,ABOUT_CITY_PLAN,ABOUT_TERRAIN,ABOUT_LAND_SHAPE,ABOUT_SETBACK_SPACE,ABOUT_CURRENT_STATE,ABOUT_RECONSTRUCT_DISABLED,ABOUT_USAGE_AREA,ABOUT_STRUCTURE,STRUCTURE_ID,ABOUT_BUILT_IN,ABOUT_FLOOR,ABOUT_BUILDING_COVERAGE,ABOUT_CONNECT_ROAD_STATE,ABOUT_CONNECT_ROAD,ABOUT_CAPACITY_RATIO,ABOUT_BUILDING_COVERAGE_AND_CAPACITY_RATIO,ABOUT_UNIT_AMOUNT,ABOUT_SELLER,ABOUT_CONSTRUCTOR,ABOUT_REMARKS,ASSESS_SELL_TITLE,ASSESS_SELL_PRICE,ASSESS_SELL_SQUARE_PRICE,ASSESS_SELL_TSUBO_PRICE,ASSESS_SELL_TERM,ASSESS_SELL_PRICE_FROM,ASSESS_SELL_PRICE_TO,ASSESS_SELL_SELF_STORY_COUNT,ASSESS_SELL_SIMILAR_STORY_COUNT,ASSESS_SELL_BUILDING_PRICE,ASSESS_SELL_BUILDING_UNIT_PRICE,ASSESS_SELL_HOUSE_LAND_PRICE,ASSESS_SELL_COMMENT,ASSESS_HOUSE_COMMENT,ASSESS_SELL_PAGE_BREAK,INDICATE_VISIT_ASSESSMENT,VISIT_ASSESSMENT_PRICE,VISIT_ASSESSMENT_COMMENT,VISIT_ASSESSMENT_MEMO,VISIT_ASSESSMENT_TITLE,ASSESS_SUGGEST_PRICE,ASSESS_SUGGEST_COMMENT,ASSESS_SUGGEST_PAGE_BREAK,CHALLENGE1_TITLE,CHALLENGE1_PRICE,CHALLENGE1_TSUBO_PRICE,CHALLENGE1_TERM,CHALLENGE1_COMMENT,CHALLENGE1_PAGE_BREAK,CHALLENGE2_TITLE,CHALLENGE2_PRICE,CHALLENGE2_TSUBO_PRICE,CHALLENGE2_TERM,CHALLENGE2_COMMENT,CHALLENGE2_PAGE_BREAK,CHALLENGE3_TITLE,CHALLENGE3_PRICE,CHALLENGE3_TSUBO_PRICE,CHALLENGE3_TERM,CHALLENGE3_COMMENT,CHALLENGE3_PAGE_BREAK,ASSESS_PURCHASE_PRICE,ASSESS_PURCHASE_TERM,ASSESS_PURCHASE_COMMENT,ASSESS_PURCHASE_PAGE_BREAK,ASSESS_RENT_TITLE,ASSESS_RENT_PRICE,ASSESS_RENT_YIELD,ASSESS_RENT_COMMENT,ASSESS_RENT_PAGE_BREAK,INDICATE_DEMOLITION_COST,DEMOLITION_P_NAME,DEMOLITION_ABOUT_STRUCTURE,DEMOLITION_BUILDING_SPACE_TSUBO,DEMOLITION_ROAD_WIDTH,DEMOLITION_HOUSE_DISTANCE,DEMOLITION_COST,DEMOLITION_MIN_COST,DEMOLITION_MAX_COST,COVER_STATISTICS_TITLE,COVER_MARKET_TITLE,ASSESS_VISIT_PAGE_BREAK,ASSESS_STAFF_COMMENT,ASSESS_STAFF_PAGE_BREAK,AREA_HUMAN_COMMENT,SELL_FLOW1_TITLE,SELL_FLOW1_DETAIL,SELL_FLOW2_TITLE,SELL_FLOW2_DETAIL,SELL_FLOW3_TITLE,SELL_FLOW3_DETAIL,SELL_FLOW4_TITLE,SELL_FLOW4_DETAIL,SELL_FLOW5_TITLE,SELL_FLOW5_DETAIL,SELL_FLOW6_TITLE,SELL_FLOW6_DETAIL,SELL_FLOW7_TITLE,SELL_FLOW7_DETAIL,LAST_COMMENT,LAST_STAFF_IMAGE,LAST_STAFF_NAME,LAST_TEL,LAST_EMAIL,LAST_PROFILE,INDICATE_COVER_ASSESSED_ON,INDICATE_COVER_LOGO,INDICATE_COVER_CUSTOMER_NAME,INDICATE_COVER_ABOUT,INDICATE_COVER_STAFF_AND_COMPANY_NAME,INDICATE_COVER_DESCRIPTION,INDICATE_COVER_EMAIL,INDICATE_COVER_TEL,INDICATE_COVER_HOMEPAGE,INDICATE_COVER_STAFF_ROUND_IMAGE,INDICATE_STAFF_ROUND_IMAGE,INDICATE_COVER_STAFF_ROUND_IMAGE_COMMENT,INDICATE_ABOUT_INFO,INDICATE_ABOUT_EXCLUSIVE_INFO,INDICATE_ABOUT_MAP,INDICATE_ASSESS_VALUE,INDICATE_ASSESS_SELL,INDICATE_ASSESS_SELL_RANGE,INDICATE_ASSESS_SELL_COUNT,INDICATE_ASSESS_TK_BREAKDOWN,INDICATE_ASSESS_SUGGEST,INDICATE_ASSESS_PURCHASE,INDICATE_ASSESS_RENT,INDICATE_ASSESS_PROPERTY_INFO,INDICATE_ASSESS_VISIT,INDICATE_ASSESS_STAFF_COMMENT,INDICATE_SELL_STORY_SELF,INDICATE_SELL_STORY_SIMILAR,INDICATE_RENT_STORY_SELF,INDICATE_RENT_STORY_SIMILAR,INDICATE_COVER_STATISTICS,INDICATE_MARKET_REPORTS,INDICATE_PRICE_HISTORY,INDICATE_PRICE_HISTORY_GRAPH,INDICATE_PRICE_HISTORY_SELF_COMPARE,INDICATE_PRICE_HISTORY_AREA_COMPARE,INDICATE_AREA_HUMAN,INDICATE_AREA_HUMAN_AGE_GRAPH,INDICATE_AREA_HUMAN_TRANSITION_GRAPH,INDICATE_COVER_MARKET,INDICATE_MARKET_SIZE,INDICATE_MARKET_LAYOUT_PRICE,INDICATE_MARKET_TK_SPACE_PRICE,INDICATE_MARKET_OLD_PRICE,INDICATE_MARKET_SELL_TERM,INDICATE_MORTGAGE_INTEREST,INDICATE_FLOATING_INTEREST_COMMENT,INDICATE_TEN_YEARS_FIXED_INTEREST_COMMENT,INDICATE_ALL_YEARS_FIXED_INTEREST_COMMENT,FLOATING_INTEREST_COMMENT,TEN_YEARS_FIXED_INTEREST_COMMENT,ALL_YEARS_FIXED_INTEREST_COMMENT,INDICATE_MEDIATION_CONTRACT,INDICATE_EXPENSE,INDICATE_EXPENSE1,IS_EXPENSE1_REGULAR,EXPENSE1_SELL_PRICE,EXPENSE1_COMMISSION,EXPENSE1_COMMISSION_RATE,EXPENSE1_STAMP,EXPENSE1_REGISTRATION,EXPENSE1_RESIDUAL,EXPENSE1_DEMOLITION_COST_INDICATE,EXPENSE1_OTHER1_LABEL,EXPENSE1_OTHER1,EXPENSE1_OTHER2_LABEL,EXPENSE1_OTHER2,EXPENSE1_OTHER3_LABEL,EXPENSE1_OTHER3,EXPENSE1_COMMENT,EXPENSE1_TITLE,INDICATE_EXPENSE2,IS_EXPENSE2_REGULAR,EXPENSE2_SELL_PRICE,EXPENSE2_COMMISSION,EXPENSE2_COMMISSION_RATE,EXPENSE2_STAMP,EXPENSE2_REGISTRATION,EXPENSE2_RESIDUAL,EXPENSE2_DEMOLITION_COST_INDICATE,EXPENSE2_OTHER1_LABEL,EXPENSE2_OTHER1,EXPENSE2_OTHER2_LABEL,EXPENSE2_OTHER2,EXPENSE2_OTHER3_LABEL,EXPENSE2_OTHER3,EXPENSE2_COMMENT,EXPENSE2_TITLE,INDICATE_EXPENSE3,IS_EXPENSE3_REGULAR,EXPENSE3_SELL_PRICE,EXPENSE3_COMMISSION,EXPENSE3_COMMISSION_RATE,EXPENSE3_STAMP,EXPENSE3_REGISTRATION,EXPENSE3_RESIDUAL,EXPENSE3_DEMOLITION_COST_INDICATE,EXPENSE3_OTHER1_LABEL,EXPENSE3_OTHER1,EXPENSE3_OTHER2_LABEL,EXPENSE3_OTHER2,EXPENSE3_OTHER3_LABEL,EXPENSE3_OTHER3,EXPENSE3_COMMENT,EXPENSE3_TITLE,INDICATE_SELL_FLOW,INDICATE_SELL_FLOW1,INDICATE_SELL_FLOW2,INDICATE_SELL_FLOW3,INDICATE_SELL_FLOW4,INDICATE_SELL_FLOW5,INDICATE_SELL_FLOW6,INDICATE_SELL_FLOW7,INDICATE_CHALLENGE1,INDICATE_CHALLENGE2,INDICATE_CHALLENGE3,INDICATE_LAST_COMMENT,PAGE_BREAK_SELL_FLOW_AFTER4,PAGE_BREAK_SELL_FLOW_AFTER5,PAGE_BREAK_SELL_FLOW_AFTER6,PAGE_BREAK_BEFORE_LAST_COMMENT,CSS_DESIGN,CSS_COLOR,SP_DESIGN,INDICATE_COVER_INFO,INDICATE_ASSESS_INFO,INDICATE_STORIES_INFO,INDICATE_STATISTICS_INFO,INDICATE_MARKET_INFO,INDICATE_SELL_FLOW_INFO,INDICATE_INSERT_PAGE_TITLE_COVER,INDICATE_INSERT_PAGE_TITLE_ABOUT,INDICATE_INSERT_PAGE_TITLE_ASSESS,INDICATE_INSERT_PAGE_TITLE_STORIES,INDICATE_INSERT_PAGE_TITLE_STATISTICS,INDICATE_INSERT_PAGE_TITLE_MARKET,INDICATE_INSERT_PAGE_TITLE_SELL_FLOW,INDICATE_INSERT_PAGE_TITLE_EXPENSE,INDICATE_PAGE_NUM,COMPANY_SUMMARY,ABOUT_LAND_SPACE,ABOUT_BUILDING_SPACE,ABOUT_BUILDING_COVERAGE_UNIT,ABOUT_CAPACITY_RATIO_UNIT,COVER_COMPANY_TEL,INDICATE_COVER_COMPANY_TEL,PDF,COVER_PDF_TITLE,ABOUT_PDF_TITLE,ASSESS_PDF_TITLE,STORIES_PDF_TITLE,STATISTICS_PDF_TITLE,MARKET_PDF_TITLE,SELL_FLOW_PDF_TITLE,EXPENSE_PDF_TITLE,SECTION_ORDER,FONT,SP_READ_AT,SP_TOKEN,UPDATED_AT,IS_DELETED,TUTORIAL_SP,OWNER_SPACE_NUM,LAND_SPACE_NUM,CID_N,ADDED_ON,_merge,HIT
18895,47286,2386,0,2.0,,,,83.0,,83㎡,,2023-02-11,宮本,,谷口 和彦,ＭＥマイホーム計画京葉株式会社と申します。\r\nこの度は査定のお問い合わせ頂きありがとうご...,taniguchi@megroup-6.jp,047-401-3450,,https://www.megroup-6.jp/,発行日：2023年2月11日,千葉県船橋市\nご所有不動産査定書,2,宮本 様,谷口 和彦\n（MEマイホーム計画京葉株式会社）,ＭＥマイホーム計画京葉株式会社と申します。\nこの度は査定のお問い合わせ頂きありがとうござい...,taniguchi@megroup-6.jp,047-401-3450,https://www.megroup-6.jp/,,,,千葉県船橋市南本町,,,12204080000.0,,,,,,,,,,,第一種住居地域,,,,,,,,,,,,,,,2900,34,115,3,,,,,,,,一般的な売却プランです。ご所有不動産の販売活動から、売却完了まで当社が仲介をさせていただき、...,一戸建ての査定は、土地と建物でそれぞれ別の査定方法を用いて算出を行います。\n建物の査定方法...,0,0,,,,,,現在の金融機関の動向や、ご所有不動産の個別条件を考慮した上で、こちらの金額をご提案させていた...,0,,,,,,0,,,,,,0,,,,,,0,,1,当社がご所有不動産を買い取らせていただくプランです。短期間で確実に現金化をしたい場合におすす...,0,,,,当社が管理し、賃貸物件として借主をつけるプランです。借主がいる限り、利益を取得していくことが...,0,0,,,,,,,,,ご所有不動産の\n近隣エリアの統計情報,ご所有不動産の\n近隣エリアの不動産市場情報,0,ご所有不動産の査定をさせていただきました、谷口 和彦と申します。\n船橋市は、ここ5年程で売...,0,不動産の価格の動きはその地域に住んでいる人口（人数）と年齢構成に大きく影響されます。\n不動...,売却についてのお打合せ,・査定価格、売り出し価格のご提案　・諸費用のご説明　・売却活動のご説明,売却の準備・媒介契約の締結,・売り出し価格の決定 ・媒介契約の締結 ・販売図面の作成 ・HP掲載用写真撮影,買主様探し,・不動産流通機構（レインズ）に登録 ・各種、広告宣伝の実施,ご案内・購入申し込み,・買主様の物件見学、購入申し込み ・売主様の売却承諾,ご契約,・重要事項説明書の取り交わし ・売買契約の締結 ・手付金の受取,ご売却完了,・残代金の受領 ・所有権移転の登記手続き ・物件のお引渡し,,,不動産を売却するという経験は人生でそう何度もあるものではありません。\n当社で相談を受ける際...,,谷口 和彦,047-401-3450,taniguchi@megroup-6.jp,,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,,,,1,1,0,1,29000000,1023000,,10000,,,0,,,,,,,,,0,1,29000000,1023000,,10000,,,0,,,,,,,,,0,1,29000000,1023000,,10000,,,0,,,,,,,,,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0,8,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,MEマイホーム計画京葉株式会社\n千葉県船橋市新高根１丁目1-10 \n電話番号：047-4...,83㎡,,,,047-401-3450,1,千葉県船橋市ご所有不動産査定書_2023-02-11.pdf,,,,,,,,,,0,,5ZYyhVTkVujaEOo4tJm3pQ,2023-02-11 19:14:06.000,0,0,,83.0,12204078000,NaT,left_only,False


### 保存 ###

In [33]:
# df_land_flg.to_csv(os.path.join(OUTPUT_DIR,"land_flg_from_python_ID重複あり.csv"), index=False)
df_land_flg_nodup.to_csv(os.path.join(OUTPUT_DIR,"2.land_flg_from_python.csv"), index=False)

## 戸建て ##

In [34]:

# 前提：df_reports は IS_DELETED/期間 で既にフィルタ済み
deal_house = df_deal_lots[df_deal_lots['PROPERTY_KIND'] == 2].copy()

# 面積（2桁丸め・完全一致用）
df_reports_house['LAND_SPACE_NUM']     = to_num_series_exact(df_reports_house['LAND_SPACE'])
df_reports_house['BUILDING_SPACE_NUM'] = to_num_series_exact(df_reports_house['BUILDING_SPACE'])
deal_house['LAND_SPACE_NUM']           = to_num_series_exact(deal_house['LAND_SPACE'])
deal_house['BUILDING_SPACE_NUM']       = to_num_series_exact(deal_house['BUILDING_SPACE'])

#----------------------------C_IDについての処理---------------------------------------
# C_ID 正規化：先頭0保持＋末尾 .0 除去
df_reports_house['CID_N'] = normalize_cid_keep_zeros(df_reports_house['C_ID'])
deal_house['CID_N']       = normalize_cid_keep_zeros(deal_house['C_ID'])

print("deal_land件数",deal_house.shape)

# もしレポート側で先頭ゼロが落ちている可能性がある場合は、deal側の桁数に合わせて左ゼロ埋め
# 例：dealの数字のみ C_ID の最頻桁数をターゲットにする

deal_digits = deal_house['CID_N'].str.fullmatch(r'\d+')
if deal_digits.any():
    target_len = deal_house.loc[deal_digits, 'CID_N'].str.len().mode().iat[0]  # 最頻長
else:
    target_len = None

if target_len:
    df_reports_house['CID_N'] = np.where(
        df_reports_house['CID_N'].str.fullmatch(r'\d+'),
        df_reports_house['CID_N'].str.zfill(target_len),  # 数字のみなら左ゼロ埋め
        df_reports_house['CID_N']                         # 英字混在はそのまま
    )
else:
    df_reports_house['CID_N'] = df_reports_house['CID_N']
#----------------------------------------------------------------------------------

# CID_N で LEFT JOIN（1対多想定）
m_house = df_reports_house.merge(
    deal_house[['CID_N', 'LAND_SPACE_NUM', 'BUILDING_SPACE_NUM', 'ADDED_ON']],
    how='left',
    left_on=['CID_N', 'LAND_SPACE_NUM', 'BUILDING_SPACE_NUM'],
    right_on=['CID_N', 'LAND_SPACE_NUM', 'BUILDING_SPACE_NUM'],
    suffixes=('_report', '_deal'),
    indicator=True
)
print("マージ後件数", m_house.shape)

# 日付条件：ASSESSED_ON <= ADDED_ON < ASSESSED_ON + 6ヶ月
m_house['ASSESSED_ON'] = pd.to_datetime(m_house['ASSESSED_ON'], errors='coerce')
m_house['ADDED_ON'] = pd.to_datetime(m_house['ADDED_ON'], errors='coerce')
print(m_house[['ASSESSED_ON', 'ADDED_ON']].dtypes)

date_ok = (
    m_house['ASSESSED_ON'].notna() &
    m_house['ADDED_ON'].notna() &
    (m_house['ASSESSED_ON'] <= m_house['ADDED_ON']) &
    (m_house['ASSESSED_ON'] + pd.DateOffset(months=6) > m_house['ADDED_ON'])
)
print("日付条件OK行数:", date_ok.fillna(False).sum())

# フラグ：ADDED_ONが付き、面積2種が完全一致し、日付条件を満たす
m_house['HIT'] = m_house['ADDED_ON'].notna() & date_ok
print("HIT列のTrue件数:", m_house['HIT'].sum(), "/", m_house.shape[0])

#フラグ付け
hit_flag = (
    m_house.groupby('ID')['HIT'].any()
    .rename('IS_DEAL_MATCH')
    .astype(int)
    .reset_index()
)

# フラグをマージ（ID ごとに）
df_house_flg = m_house.merge(hit_flag, on='ID', how='left')
print("フラグ付け後件数（ID重複あり）",df_house_flg.shape)
print("土地フラグ1件数（ID重複あり）:", df_house_flg['IS_DEAL_MATCH'].sum(), "/", df_house_flg.shape[0])

#重複削除
df_house_flg_nodup = (
    df_house_flg
    .sort_values(by='IS_DEAL_MATCH', ascending=False)  # フラグが1の方を上に
    .drop_duplicates(subset=['ID'])  # 最初の1件を残す（= フラグ1優先）
)
print("重複削除後件数:", df_house_flg_nodup.shape)
print("重複削除後、フラグ1件数:", df_house_flg_nodup['IS_DEAL_MATCH'].sum(), "/", df_house_flg_nodup.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_house['LAND_SPACE_NUM']     = to_num_series_exact(df_reports_house['LAND_SPACE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_house['BUILDING_SPACE_NUM'] = to_num_series_exact(df_reports_house['BUILDING_SPACE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_hous

deal_land件数 (920142, 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports_house['CID_N'] = np.where(


マージ後件数 (70456, 313)
ASSESSED_ON    datetime64[ns]
ADDED_ON       datetime64[ns]
dtype: object
日付条件OK行数: 7718
HIT列のTrue件数: 7718 / 70456
フラグ付け後件数（ID重複あり） (70456, 315)
土地フラグ1件数（ID重複あり）: 8272 / 70456
重複削除後件数: (69099, 315)
重複削除後、フラグ1件数: 7279 / 69099


### 保存 ###

In [35]:
df_house_flg_nodup.to_csv(os.path.join(OUTPUT_DIR,"2.house_flg_from_python.csv"), index=False)

# ３つのフラグ付きデータを結合して保存 #

In [36]:
df_flg = pd.concat([df_mansion_flg_nodup, df_land_flg_nodup, df_house_flg_nodup], ignore_index=True)
df_flg.shape

(206788, 320)

In [38]:
print(df_reports_all.shape)
print(df_flg.shape)

(206788, 368)
(206788, 320)


In [39]:
df_reports_with_flg = pd.merge(
    df_reports_all,
    df_flg[['ID', 'IS_DEAL_MATCH']],
    on='ID',
    how='left'
)

In [40]:
print(df_reports_with_flg.shape)

(206788, 369)


### 保存 ###

In [41]:
df_reports_with_flg.to_csv(os.path.join(OUTPUT_DIR, "2.ALL_ASSESSMENT_REPORTS_WITH_FLG.csv"), index=False)