In [None]:
import pandas as pd
import numpy as np
import re
import math
import matplotlib.pyplot as plt
from typing import Union, Optional
import datetime
import japanize_matplotlib
from janome.tokenizer import Tokenizer
from datetime import datetime, timedelta
import urllib
import requests
import geocoder
import seaborn as sns
from geopy.geocoders import Nominatim

In [None]:
# Load the competition inputs from the working directory.
df_train, df_test, df_holidays, df_venue, df_match = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train.csv',
        'test.csv',
        'holidays_in_japan.csv',
        'venue_information.csv',
        'match_reports.csv',
    )
)
# submit.csv is read without a header row.
submit = pd.read_csv('submit.csv', header=None)

In [None]:
def merged(df, df_venue):
    """Left-join venue capacity and address onto ``df`` via the ``venue`` column.

    Returns a new DataFrame with every row of ``df``; rows whose venue does not
    appear in ``df_venue`` get NaN in ``capacity`` and ``address``.
    """
    venue_details = df_venue.loc[:, ['venue', 'capacity', 'address']]
    return df.merge(venue_details, how='left', on='venue')

# Attach venue capacity and address to both splits.
df_train, df_test = merged(df_train, df_venue), merged(df_test, df_venue)

In [None]:
# Despite the name, these are the columns the text clean-up below leaves untouched.
columns_to_modify = ['home_team_score', 'away_team_score']


def process_column(s):
    """Clean one text cell: drop its last three characters, then all digits and spaces.

    Values that are not ``str`` (numbers, NaN, ...) are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    trimmed = s[:-3]
    without_digits = re.sub(r'\d+', '', trimmed)
    return without_digits.replace(' ', '')

# Clean every column except the score columns, then discard the id column.
text_columns = [name for name in df_match.columns if name not in columns_to_modify]
for name in text_columns:
    df_match[name] = df_match[name].apply(process_column)

df_match = df_match.drop(columns='id')

In [None]:
# Return the Japanese weekday name for a date.
def get_day_of_week_jp(dt):
    """Map ``dt.weekday()`` (0 = Monday ... 6 = Sunday) to its Japanese weekday name."""
    weekday_names = ('月曜日', '火曜日', '水曜日', '木曜日', '金曜日', '土曜日', '日曜日')
    return weekday_names[dt.weekday()]

# Add date-derived columns to a DataFrame.
def add_date_info(df):
    """Parse ``match_date`` and add ``year`` and ``dayofweek`` columns in place.

    ``year`` is a string such as '2017年'; ``dayofweek`` is the Japanese weekday
    name. The same (mutated) DataFrame is returned.
    """
    match_dates = pd.to_datetime(df['match_date'])
    df['match_date'] = match_dates
    df['year'] = match_dates.dt.year.astype(str) + '年'
    df['dayofweek'] = match_dates.apply(get_day_of_week_jp)
    return df

# Add the year and weekday columns to both the training and test frames.
df_train, df_test = add_date_info(df_train), add_date_info(df_test)

In [None]:
def label_holidays(df, df_holidays):
    """Relabel weekday public holidays in ``df['dayofweek']`` as '祝日'.

    A row is relabelled when its ``match_date`` equals one of
    ``df_holidays['holiday_date']`` and its current label is neither '土曜日'
    nor '日曜日'; weekend labels are kept even when the day is also a holiday.

    Parameters:
    - df: DataFrame with ``match_date`` and ``dayofweek`` columns (modified in place).
    - df_holidays: DataFrame with a ``holiday_date`` column.

    Side effects:
    - ``df['match_date']`` and ``df_holidays['holiday_date']`` are both converted
      to datetime in place, so the caller's ``df_holidays`` holds Timestamps
      afterwards.

    Returns:
    - The same ``df`` object.
    """
    # Bring both date columns to datetime so they compare by value.
    df['match_date'] = pd.to_datetime(df['match_date'])
    df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])

    # Vectorised masks instead of a row-wise apply (also well-defined on empty frames).
    on_holiday = df['match_date'].isin(df_holidays['holiday_date'])
    on_weekend = df['dayofweek'].isin(['土曜日', '日曜日'])

    df['dayofweek'] = df['dayofweek'].mask(on_holiday & ~on_weekend, '祝日')

    return df

# Relabel weekday public holidays in both splits.
df_train, df_test = label_holidays(df_train, df_holidays), label_holidays(df_test, df_holidays)

## 2015年、2016年は1st、2ndに分かれているため、第34節に書き換える

In [None]:
# Months whose 2015/2016 matches get their section number shifted by 17
# (those seasons were split into a 1st and a 2nd stage).
months_to_change = ['7月', '8月', '9月', '10月', '11月']

def update_section(row):
    """Return the season-wide section label ('第N節') for one match row.

    For matches played in 2015 or 2016 during one of ``months_to_change``, the
    stage-local section number is shifted by 17 so that sections run up to 34.
    All other rows keep their ``section`` value.

    A section number that is already above 17 is returned unchanged, so
    applying the function a second time (e.g. re-running the cell) does not
    shift it again.

    Parameters:
    - row: mapping/Series with ``match_date`` (datetime-like) and ``section``
      (string of the form '第N節').
    """
    stage_length = 17
    match_date = row['match_date']

    in_two_stage_season = match_date.year in (2015, 2016)
    if not (in_two_stage_season and f"{match_date.month}月" in months_to_change):
        return row['section']

    num = int(row['section'][1:-1])
    if num > stage_length:
        # Already expressed as a season-wide section number.
        return row['section']
    return f'第{num + stage_length}節'

# Renumber the 2015/2016 sections row by row; the result overwrites df_train['section'].
df_train['section'] = df_train.apply(update_section, axis=1)

## 変数作成

In [None]:
def insert_columns(
        df: pd.DataFrame,
        data: Union[pd.Series, pd.DataFrame],
        *,
        before: Optional[str] = None,
        after: Optional[str] = None,
        allow_duplicates: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
    """
    Insert one new column or several new columns into a DataFrame.

    Parameters:
    - df: pd.DataFrame
        DataFrame to insert into.
    - data: Union[pd.Series, pd.DataFrame]
        Data to insert. A Series is inserted as a single column named
        ``data.name``; a DataFrame is inserted column by column, keeping
        its column order.
    - before: Optional[str], default=None
        Insert immediately before this column. Cannot be combined with `after`.
    - after: Optional[str], default=None
        Insert immediately after this column. Cannot be combined with `before`.
    - allow_duplicates: bool, default=False
        If True, duplicate column names are allowed.
    - inplace: bool, default=False
        If True, modify ``df`` directly; otherwise work on a copy.

    Returns:
    - pd.DataFrame
        The DataFrame with the columns inserted.

    Raises:
    - ValueError: if neither or both of `before`/`after` are given, or if a
      column name already exists while ``allow_duplicates`` is False.
    - TypeError: if ``data`` is neither a Series nor a DataFrame.
    """
    # Exactly one anchor must be supplied. Compare against None rather than
    # relying on truthiness so falsy labels (0, '') are accepted as anchors.
    if (before is None) == (after is None):
        raise ValueError('You must specify either "before" or "after", but not both.')

    if not inplace:
        df = df.copy()

    # Work out the insertion position.
    if before is not None:
        loc = df.columns.get_loc(before)
    else:
        loc = df.columns.get_loc(after) + 1

    # Type check and insertion.
    if isinstance(data, pd.Series):
        if data.name in df.columns and not allow_duplicates:
            raise ValueError(f'Column "{data.name}" already exists and allow_duplicates is set to False.')
        df.insert(loc, data.name, data, allow_duplicates=allow_duplicates)
    elif isinstance(data, pd.DataFrame):
        if not allow_duplicates:
            for column in data.columns:
                if column in df.columns:
                    raise ValueError(f'Column "{column}" already exists and allow_duplicates is set to False.')
        # Inserting in reverse at a fixed position preserves data's column order.
        for column in reversed(data.columns):
            df.insert(loc, column, data[column], allow_duplicates=allow_duplicates)
    else:
        raise TypeError('data must be a pd.Series or pd.DataFrame')

    return df


In [None]:
# Take every match-report column, skipping 'id' if it is still present.
columns_to_add = [name for name in df_match.columns if name != 'id']

# Rows labelled 0 .. len(df_train)-1 of df_match go to df_train; the following
# len(df_test) rows go to df_test (re-indexed from 0 so they align on insert).
n_train, n_test = len(df_train), len(df_test)
train_rows = slice(None, n_train - 1)
test_rows = slice(n_train, n_train + n_test - 1)

# New columns are placed starting at column position 5 of each frame.
for offset, name in enumerate(columns_to_add):
    df_train.insert(5 + offset, name, df_match.loc[train_rows, name])

for offset, name in enumerate(columns_to_add):
    df_test.insert(5 + offset, name, df_match.loc[test_rows, name].reset_index(drop=True))

## 名前の統一

In [None]:
def name_home(df):
    """Unify team-name spellings in ``df['home_team']``.

    '川崎F', 'C大阪' and 'G大阪' (ASCII letter) are replaced by the spelling with
    the full-width letter. The column is replaced in one vectorised step rather
    than by chained ``df[col][i] = ...`` assignment, which only worked with a
    default 0..n-1 index and does not write through under pandas Copy-on-Write.

    Returns the same (modified) DataFrame.
    """
    canonical_names = {'川崎F': '川崎Ｆ', 'C大阪': 'Ｃ大阪', 'G大阪': 'Ｇ大阪'}
    df['home_team'] = df['home_team'].replace(canonical_names)
    return df

def name_away(df):
    """Unify team-name spellings in ``df['away_team']``.

    '川崎F', 'C大阪' and 'G大阪' (ASCII letter) are replaced by the spelling with
    the full-width letter. The column is replaced in one vectorised step rather
    than by chained ``df[col][i] = ...`` assignment, which only worked with a
    default 0..n-1 index and does not write through under pandas Copy-on-Write.

    Returns the same (modified) DataFrame.
    """
    canonical_names = {'川崎F': '川崎Ｆ', 'C大阪': 'Ｃ大阪', 'G大阪': 'Ｇ大阪'}
    df['away_team'] = df['away_team'].replace(canonical_names)
    return df

# Apply the home and away spelling fixes to each split.
df_train = name_away(name_home(df_train))
df_test = name_away(name_home(df_test))

## 年、月、日、曜日をsinとcosに

In [None]:
def add_cyclic_features(df, date_column):
    """
    Add sine/cosine encodings of a date column so that cyclic position is preserved.

    Six columns are added to ``df`` in place: ``day_of_year_sin/cos`` (position
    within the year), ``day_of_month_sin/cos`` (position within the month) and
    ``day_of_week_sin/cos`` (Monday = 0, period 7).

    Year and month lengths come from pandas' calendar properties
    (``dt.is_leap_year``, ``dt.days_in_month``) instead of a ``year % 4`` rule,
    which is wrong for century years such as 2100. No scratch columns are
    written to ``df``, so existing columns named e.g. ``day_of_week`` are left
    untouched.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the date column (modified in place).
    - date_column (str): Name of the column holding the dates.

    Returns:
    - pd.DataFrame: the same DataFrame with the sine/cosine features added.
    """
    # Make sure the column is datetime; this conversion is written back to df.
    df[date_column] = pd.to_datetime(df[date_column])
    dates = df[date_column].dt

    def add_sin_cos(prefix, position, period):
        # Encode position/period as an angle on the unit circle.
        angle = 2 * np.pi * position / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    days_in_year = np.where(dates.is_leap_year, 366, 365)
    add_sin_cos('day_of_year', dates.dayofyear, days_in_year)
    add_sin_cos('day_of_month', dates.day, dates.days_in_month)
    add_sin_cos('day_of_week', dates.dayofweek, 7)

    return df

# Add the cyclic date encodings of match_date to both splits.
df_train, df_test = (
    add_cyclic_features(df_train, 'match_date'),
    add_cyclic_features(df_test, 'match_date'),
)

In [None]:
def kick_off_time(df, date_column):
    """Add sine/cosine encodings of the time of day stored in ``date_column``.

    The time is reduced to hours and minutes (seconds are ignored), expressed as
    seconds since midnight and mapped onto a 24-hour circle. Two columns,
    ``time_of_day_sin`` and ``time_of_day_cos``, are added to ``df`` in place.
    No scratch columns are written, so existing columns named ``hour_of_day``,
    ``minute_of_day`` or ``total_seconds`` are left untouched.

    Parameters:
    - df: DataFrame containing ``date_column`` (modified in place).
    - date_column: name of the column holding the kick-off time; converted to
      datetime in place if it is not already a datetime column.

    Returns:
    - The same DataFrame.
    """
    if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
        df[date_column] = pd.to_datetime(df[date_column])

    kick_off = df[date_column].dt
    seconds_since_midnight = kick_off.hour * 3600 + kick_off.minute * 60

    # 86400 seconds per day -> one full turn of the circle.
    angle = 2 * np.pi * seconds_since_midnight / 86400
    df['time_of_day_sin'] = np.sin(angle)
    df['time_of_day_cos'] = np.cos(angle)

    return df

# Add the time-of-day encodings of kick_off_time to both splits.
df_train, df_test = (
    kick_off_time(df_train, 'kick_off_time'),
    kick_off_time(df_test, 'kick_off_time'),
)

## 天気

In [None]:
t = Tokenizer()

# Tokens that are kept as they are in the normalised weather text.
_WEATHER_KEYWORDS = ('晴', '曇', '雨', 'のち', '時々')


def _normalize_weather(description):
    """Reduce a weather description to a space-separated list of known tokens.

    The text is tokenised with the module-level tokenizer ``t`` and each token
    surface is handled as follows:
    - '々' and '-' are dropped;
    - '時' becomes '時々' (presumably the tokenizer splits '時々' into '時' + '々'
      -- TODO confirm against the tokenizer output);
    - '晴', '曇', '雨', 'のち' and '時々' are kept;
    - anything else becomes 'その他'.
    """
    normalized = []
    for token in t.tokenize(description):
        surface = token.surface
        if surface in ('々', '-'):
            continue
        if surface == '時':
            normalized.append('時々')
        elif surface in _WEATHER_KEYWORDS:
            normalized.append(surface)
        else:
            normalized.append('その他')
    return " ".join(normalized)


# Assign whole columns instead of chained df['weather'][i] = ... writes, which
# depend on a default 0..n-1 index and do not write through under pandas
# Copy-on-Write.
df_train['weather'] = df_train['weather'].apply(_normalize_weather)
df_test['weather'] = df_test['weather'].apply(_normalize_weather)

## 同じ会場が別の名前で登録されているかどうか

In [None]:
# Count matches per venue and year to spot venues that appear under several names.
usage_counts = pd.crosstab(df_train['venue'], df_train['year'])

# Draw the heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(24, 18))
sns.heatmap(usage_counts, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)

# Title and axis labels.
ax.set_title('Venue Usage by Year')
ax.set_ylabel('Venue')
ax.set_xlabel('Year')

# Render the figure.
plt.show()

In [None]:
# Venue names as they appear in the data -> the single name used for that
# stadium from here on.
VENUE_RENAMES = {
    '広島ビッグアーチ': 'エディオンスタジアム広島',
    '大阪長居スタジアム': 'ヤンマースタジアム長居',
    '大分銀行ドーム': '昭和電工ドーム大分',
    '日本平スタジアム': 'IAIスタジアム日本平',
    'アウトソーシングスタジアム日本平': 'IAIスタジアム日本平',
    '新潟スタジアム': 'デンカビッグスワンスタジアム',
    '名古屋市瑞穂球技場': 'パロマ瑞穂スタジアム',
    '名古屋市瑞穂陸上競技場': 'パロマ瑞穂スタジアム',
    'ホームズスタジアム神戸': 'ノエビアスタジアム神戸',
    '山梨県小瀬スポーツ公園陸上競技場': '山梨中銀スタジアム',
    '静岡スタジアムエコパ': 'エコパスタジアム',
    '東北電力ビッグスワンスタジアム': 'デンカビッグスワンスタジアム',
    'Shonan BMW スタジアム平塚': 'ShonanBMWスタジアム平塚',
    '市立吹田サッカースタジアム': 'パナソニック スタジアム 吹田',
    '日立柏サッカー場': '三協フロンテア柏スタジアム',
}

# Original venue names whose address is also overwritten when they are renamed.
VENUE_ADDRESS_OVERRIDES = {
    '名古屋市瑞穂球技場': '愛知県名古屋市瑞穂区山下通5-1',
    '名古屋市瑞穂陸上競技場': '愛知県名古屋市瑞穂区山下通5-1',
    '山梨県小瀬スポーツ公園陸上競技場': '山梨県甲府市小瀬町840',
    '静岡スタジアムエコパ': '静岡県袋井市愛野2300−1',
    '東北電力ビッグスワンスタジアム': '新潟県新潟市中央区清五郎67-12',
}

# Whole-column assignment replaces the chained df_train[col][i] = ... writes,
# which depend on a default 0..n-1 index and do not write through under pandas
# Copy-on-Write. The address lookup uses the venue name *before* renaming.
venue_before = df_train['venue']
address_override = venue_before.map(VENUE_ADDRESS_OVERRIDES)
df_train['address'] = df_train['address'].where(address_override.isna(), address_override)
df_train['venue'] = venue_before.replace(VENUE_RENAMES)

In [None]:
# Venue names as they appear in the data -> the single name used for that
# stadium from here on.
VENUE_RENAMES = {
    '広島ビッグアーチ': 'エディオンスタジアム広島',
    '大阪長居スタジアム': 'ヤンマースタジアム長居',
    '大分銀行ドーム': '昭和電工ドーム大分',
    '日本平スタジアム': 'IAIスタジアム日本平',
    'アウトソーシングスタジアム日本平': 'IAIスタジアム日本平',
    '新潟スタジアム': 'デンカビッグスワンスタジアム',
    '名古屋市瑞穂球技場': 'パロマ瑞穂スタジアム',
    '名古屋市瑞穂陸上競技場': 'パロマ瑞穂スタジアム',
    'ホームズスタジアム神戸': 'ノエビアスタジアム神戸',
    '山梨県小瀬スポーツ公園陸上競技場': '山梨中銀スタジアム',
    '静岡スタジアムエコパ': 'エコパスタジアム',
    '東北電力ビッグスワンスタジアム': 'デンカビッグスワンスタジアム',
    'Shonan BMW スタジアム平塚': 'ShonanBMWスタジアム平塚',
    '市立吹田サッカースタジアム': 'パナソニック スタジアム 吹田',
    '日立柏サッカー場': '三協フロンテア柏スタジアム',
}

# Original venue names whose address is also overwritten when they are renamed.
VENUE_ADDRESS_OVERRIDES = {
    '名古屋市瑞穂球技場': '愛知県名古屋市瑞穂区山下通5-1',
    '名古屋市瑞穂陸上競技場': '愛知県名古屋市瑞穂区山下通5-1',
    '山梨県小瀬スポーツ公園陸上競技場': '山梨県甲府市小瀬町840',
    '静岡スタジアムエコパ': '静岡県袋井市愛野2300−1',
    '東北電力ビッグスワンスタジアム': '新潟県新潟市中央区清五郎67-12',
}

# Whole-column assignment replaces the chained df_test[col][i] = ... writes,
# which depend on a default 0..n-1 index and do not write through under pandas
# Copy-on-Write. The address lookup uses the venue name *before* renaming.
venue_before = df_test['venue']
address_override = venue_before.map(VENUE_ADDRESS_OVERRIDES)
df_test['address'] = df_test['address'].where(address_override.isna(), address_override)
df_test['venue'] = venue_before.replace(VENUE_RENAMES)

In [None]:
# Recount matches per venue and year to inspect the venue names after renaming.
usage_counts = pd.crosstab(df_train['venue'], df_train['year'])

# Draw the heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(24, 18))
sns.heatmap(usage_counts, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)

# Title and axis labels.
ax.set_title('Venue Usage by Year')
ax.set_ylabel('Venue')
ax.set_xlabel('Year')

# Render the figure.
plt.show()

## 休日かどうか

In [None]:
def is_holiday(row):
    """Return 1 if the row's ``dayofweek`` label is '土曜日', '日曜日' or '祝日', else 0."""
    return int(row['dayofweek'] in ('土曜日', '日曜日', '祝日'))

# Flag matches played on a weekend or public holiday, for both splits.
for frame in (df_train, df_test):
    frame['is_holiday'] = frame.apply(is_holiday, axis=1)

## 次の日休みであるかどうか

In [None]:
# Public holidays as 'YYYY-MM-DD' strings, the form the holiday checks further
# down compare against. label_holidays has already converted
# df_holidays['holiday_date'] to Timestamps, and a set of Timestamps never
# contains those strings, so the dates are formatted explicitly here.
holiday_set = set(pd.to_datetime(df_holidays['holiday_date']).dt.strftime('%Y-%m-%d'))

def delay_one_day(date):
    """Return the day after ``date`` as a 'YYYY-MM-DD' string.

    ``date`` may be a ``pd.Timestamp`` or a 'YYYY-MM-DD' string.
    """
    if isinstance(date, pd.Timestamp):
        delayed_date = date + timedelta(days=1)
    else:
        delayed_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=1)
    return delayed_date.strftime('%Y-%m-%d')

def check_next_day_holiday(row):
    """Return 1 if the day after the match is a day off, else 0.

    The next day counts as a day off when any of these holds:
    - the row's ``dayofweek`` label is '金曜日' or '土曜日';
    - the next calendar day is a Saturday or Sunday (covers Friday public
      holidays, whose label has been replaced by '祝日');
    - the next day is in the module-level ``holiday_set``, which may hold
      either 'YYYY-MM-DD' strings or Timestamps.

    Parameters:
    - row: mapping/Series with ``match_date`` (Timestamp or 'YYYY-MM-DD'
      string) and ``dayofweek``.
    """
    if row['dayofweek'] in ['金曜日', '土曜日']:
        return 1

    next_day = delay_one_day(row['match_date'])
    next_day_ts = pd.Timestamp(next_day)

    # weekday(): Monday = 0 ... Saturday = 5, Sunday = 6.
    if next_day_ts.weekday() >= 5:
        return 1
    return 1 if (next_day in holiday_set or next_day_ts in holiday_set) else 0

# Add the 'rest' flag (next day is a day off) to both splits.
for frame in (df_train, df_test):
    frame['rest'] = frame.apply(check_next_day_holiday, axis=1)

## 何連休か

In [None]:
# Compute the length of the run of days off that starts on each match day.
def calculate_consecutive_holidays(df_train, holiday_set):
    """Add ``holiday_streak``: consecutive days off counted forward from the match day.

    A day is a day off when it is a Saturday or Sunday or is contained in
    ``holiday_set``. The streak is 0 when the match day itself is not a day
    off; otherwise it counts the match day plus every directly following day
    off.

    Parameters:
    - df_train: DataFrame with a ``match_date`` column. It is converted to
      datetime and the ``holiday_streak`` column is written in place.
    - holiday_set: iterable of public holidays, given either as 'YYYY-MM-DD'
      strings or as Timestamps/datetimes; both forms are accepted.

    Returns:
    - The same DataFrame.
    """
    # Convert the dates to Timestamps.
    df_train['match_date'] = pd.to_datetime(df_train['match_date'])

    # Normalise the holidays to 'YYYY-MM-DD' strings so that membership tests
    # work regardless of the element type the caller passed.
    holiday_days = {
        pd.Timestamp(day).strftime('%Y-%m-%d')
        for day in holiday_set
        if not pd.isna(day)
    }

    def is_day_off(day):
        # weekday() is locale-independent (Saturday = 5, Sunday = 6),
        # unlike strftime('%A').
        return day.weekday() >= 5 or day.strftime('%Y-%m-%d') in holiday_days

    def streak_from(day):
        if pd.isna(day):
            return 0
        length = 0
        while is_day_off(day):
            length += 1
            day = day + timedelta(days=1)
        return length

    df_train['holiday_streak'] = [streak_from(day) for day in df_train['match_date']]

    return df_train

# Compute the holiday streak for both splits.
df_train, df_test = (
    calculate_consecutive_holidays(df_train, holiday_set),
    calculate_consecutive_holidays(df_test, holiday_set),
)

## 放送局数

In [None]:
# Replace the '/'-separated broadcaster listing with the number of entries in it.
def _count_broadcasters(listing):
    return len(listing.split('/'))

for frame in (df_train, df_test):
    frame['broadcasters'] = frame['broadcasters'].apply(_count_broadcasters)

## 試合結果

In [None]:
def add_match_result(df):
    """Return a copy of ``df`` with a ``match_result`` column after ``home_team_score``.

    The result is seen from the home side:
    - '引き分け' when both scores are equal,
    - 'ホーム勝ち' when the home score is higher,
    - 'ホーム負け' otherwise.

    The labels are computed in one vectorised step and inserted directly,
    instead of inserting a column of 1s and overwriting it row by row through
    chained ``df[col][i] = ...`` assignment (which required a default 0..n-1
    index and wrote strings into an integer column).

    Raises:
    - ValueError: if ``df`` already has a ``match_result`` column.
    """
    df = df.copy()

    home = df['home_team_score'].to_numpy()
    away = df['away_team_score'].to_numpy()
    result = np.select(
        [home == away, home > away],
        ['引き分け', 'ホーム勝ち'],
        default='ホーム負け',
    )

    position = df.columns.get_loc('home_team_score') + 1
    df.insert(position, 'match_result', result)
    return df

# Add the match_result column to both splits.
df_train, df_test = add_match_result(df_train), add_match_result(df_test)

## 試合前の順位算出

In [None]:
def ranking(df_train_year):
    """Build the league table after every section (round) of one season.

    Sections are processed in order of first appearance in
    ``df_train_year['section']``. After all matches of a section have been
    applied, the table is sorted by points, goal difference and goals scored
    (all descending), ranks 1..n are assigned, and a snapshot is stored.

    Parameters:
    - df_train_year: DataFrame of one season's matches with the columns
      ``section``, ``home_team``, ``away_team``, ``home_team_score`` and
      ``away_team_score``.

    Returns:
    - DataFrame with one row per (section, team) and the columns 'チーム',
      '勝ち点', '得失点差', '総得点', '失点', '勝', '分', '負', '連勝',
      '直近5試合勝数', '直近5試合負数', '順位' and '節'.

    Notes:
    - The team list is the union of home and away teams (home teams first, in
      order of appearance), so a club that only appears as an away side is
      still tracked instead of raising ``KeyError``.
    """
    teams = pd.unique(pd.concat(
        [df_train_year['home_team'], df_train_year['away_team']],
        ignore_index=True,
    ))

    # One row per team holding the running totals for the season.
    df_standings = pd.DataFrame({
        'チーム': teams,
        '勝ち点': 0,
        '得失点差': 0,
        '総得点': 0,
        '失点': 0,
        '勝': 0,
        '分': 0,
        '負': 0,
        '連勝': 0,
        '直近5試合勝数': 0,
        '直近5試合負数': 0,
    })

    # Current winning streak and the results ('W'/'D'/'L') of the last five matches.
    winning_streak = {team: 0 for team in df_standings['チーム']}
    last_5_matches = {team: [] for team in df_standings['チーム']}

    def add(team, column, amount):
        # Increase one running total of one team.
        df_standings.loc[df_standings['チーム'] == team, column] += amount

    def put(team, column, value):
        # Overwrite one value of one team.
        df_standings.loc[df_standings['チーム'] == team, column] = value

    ranking_per_round = []

    for round_num in df_train_year['section'].unique():
        # Matches belonging to this section.
        df_round = df_train_year[df_train_year['section'] == round_num]

        for _, match in df_round.iterrows():
            home_team = match['home_team']
            away_team = match['away_team']
            home_score = match['home_team_score']
            away_score = match['away_team_score']

            if home_score > away_score:
                home_result, home_points = 'W', 3
                away_result, away_points = 'L', 0
            elif home_score == away_score:
                home_result, home_points = 'D', 1
                away_result, away_points = 'D', 1
            else:
                home_result, home_points = 'L', 0
                away_result, away_points = 'W', 3

            sides = (
                (home_team, home_result, home_points, home_score, away_score),
                (away_team, away_result, away_points, away_score, home_score),
            )
            for team, result, points, scored, conceded in sides:
                # Win/draw/loss counters and the winning streak.
                if result == 'W':
                    winning_streak[team] += 1
                    add(team, '勝', 1)
                else:
                    winning_streak[team] = 0
                    add(team, '分' if result == 'D' else '負', 1)
                put(team, '連勝', winning_streak[team])

                # Points and goals.
                add(team, '勝ち点', points)
                add(team, '得失点差', scored - conceded)
                add(team, '総得点', scored)
                add(team, '失点', conceded)

                # Keep only the five most recent results.
                recent = last_5_matches[team]
                recent.append(result)
                if len(recent) > 5:
                    recent.pop(0)
                put(team, '直近5試合勝数', recent.count('W'))
                put(team, '直近5試合負数', recent.count('L'))

        # Rank the table; the sorted row order carries over to the next section.
        df_standings.sort_values(['勝ち点', '得失点差', '総得点'], ascending=False, inplace=True)
        df_standings['順位'] = range(1, len(df_standings) + 1)

        # Store a snapshot of the table labelled with the section.
        df_standings_round = df_standings.copy()
        df_standings_round['節'] = round_num
        ranking_per_round.append(df_standings_round)

    # Stack the per-section snapshots.
    df_ranking = pd.concat(ranking_per_round)
    df_ranking = df_ranking.reset_index(drop=True)

    return df_ranking

In [None]:
def _season_standings(matches, season):
    """Per-section standings for one season, tagged with a ``year`` column.

    ``season`` is the year as a string (e.g. '2006'); matches are selected by
    the label ``season + '年'`` in ``matches['year']``.
    """
    table = ranking(matches[matches['year'] == season + '年'])
    table['year'] = season
    return table


# Standings tables for every training season ...
df_train_2006 = _season_standings(df_train, '2006')
df_train_2007 = _season_standings(df_train, '2007')
df_train_2008 = _season_standings(df_train, '2008')
df_train_2009 = _season_standings(df_train, '2009')
df_train_2010 = _season_standings(df_train, '2010')
df_train_2011 = _season_standings(df_train, '2011')
df_train_2012 = _season_standings(df_train, '2012')
df_train_2013 = _season_standings(df_train, '2013')
df_train_2014 = _season_standings(df_train, '2014')
df_train_2015 = _season_standings(df_train, '2015')
df_train_2016 = _season_standings(df_train, '2016')
df_train_2017 = _season_standings(df_train, '2017')
# ... and for the two test seasons.
df_test_2018 = _season_standings(df_test, '2018')
df_test_2019 = _season_standings(df_test, '2019')

In [None]:
# Per-match standing features for the home and the away side, initialised to 0.
home_stat_columns = ['ホーム勝ち点', 'ホーム得失点差', 'ホーム総得点', 'ホーム失点', 'ホーム勝数', 'ホーム分数',
                     'ホーム負数', 'ホーム連勝', 'ホーム直近5試合勝数', 'ホーム直近5試合負数']
away_stat_columns = ['アウェー勝ち点', 'アウェー得失点差', 'アウェー総得点', 'アウェー失点', 'アウェー勝数', 'アウェー分数',
                     'アウェー負数', 'アウェー連勝', 'アウェー直近5試合勝数', 'アウェー直近5試合負数']

for frame in (df_train, df_test):
    frame[home_stat_columns] = 0
    frame[away_stat_columns] = 0

In [None]:
# Reduce year labels such as '2017年' to integers. The pattern is a raw string
# ('\d' is an invalid escape in a normal string literal) and expand=False
# returns a Series rather than a one-column DataFrame.
df_train['year'] = df_train['year'].str.extract(r'(\d+)', expand=False).astype(int)
df_test['year'] = df_test['year'].str.extract(r'(\d+)', expand=False).astype(int)

# Placeholder value 0 for the pre-match rank of each side; filled in below.
df_train['home_team_before_rank'] = 0
df_train['away_team_before_rank'] = 0
df_test['home_team_before_rank'] = 0
df_test['away_team_before_rank'] = 0

In [None]:
# Column of a standings table -> suffix of the per-match feature column it fills
# (prefixed with 'ホーム' or 'アウェー').
_STANDING_FEATURES = {
    '勝ち点': '勝ち点',
    '得失点差': '得失点差',
    '総得点': '総得点',
    '失点': '失点',
    '勝': '勝数',
    '分': '分数',
    '負': '負数',
    '連勝': '連勝',
    '直近5試合勝数': '直近5試合勝数',
    '直近5試合負数': '直近5試合負数',
}


def _fill_pre_match_standings(matches, year, season_table, prev_season_table=None):
    """Fill pre-match rank and standing features for all matches of one season.

    For every row of ``matches`` whose ``year`` equals ``year``:
    - Section '第1節': no in-season table exists yet. Without
      ``prev_season_table`` both ranks become 'データなし'. With it, a team found
      in the previous season's '第34節' table gets '前シーズン<rank>位', and a team
      absent from the previous season gets 'J1昇格チーム'. The standing features
      stay untouched.
    - Any other section N: the team's row in ``season_table`` for section N-1
      supplies '<rank>位' and the ten standing features. If no such row exists,
      the values are left as they are.

    Values are written with ``.at`` on ``matches`` itself instead of chained
    ``matches[col][i] = ...`` assignment, and the standings are looked up in
    dictionaries rather than scanned once per match.

    Parameters:
    - matches: DataFrame of matches, modified in place.
    - year: season as stored in ``matches['year']``.
    - season_table: per-section standings of that season.
    - prev_season_table: per-section standings of the previous season, or None.
    """
    sides = (
        ('home_team', 'home_team_before_rank', 'ホーム'),
        ('away_team', 'away_team_before_rank', 'アウェー'),
    )

    # The rank columns start out as integer 0 but receive strings.
    for _, rank_col, _ in sides:
        matches[rank_col] = matches[rank_col].astype(object)

    standings = {
        (rec['節'], rec['チーム']): rec
        for rec in season_table.to_dict('records')
    }
    if prev_season_table is not None:
        prev_teams = set(prev_season_table['チーム'])
        prev_final = prev_season_table[prev_season_table['節'] == '第34節']
        prev_final_rank = {
            rec['チーム']: rec['順位'] for rec in prev_final.to_dict('records')
        }

    for i in matches.index[(matches['year'] == year).to_numpy()]:
        section = matches.at[i, 'section']
        for team_col, rank_col, prefix in sides:
            team = matches.at[i, team_col]

            if section == '第1節':
                if prev_season_table is None:
                    matches.at[i, rank_col] = 'データなし'
                elif team not in prev_teams:
                    matches.at[i, rank_col] = 'J1昇格チーム'
                elif team in prev_final_rank:
                    matches.at[i, rank_col] = '前シーズン' + str(prev_final_rank[team]) + '位'
                continue

            previous_section = '第' + str(int(section[1:-1]) - 1) + '節'
            rec = standings.get((previous_section, team))
            if rec is None:
                continue
            matches.at[i, rank_col] = str(rec['順位']) + '位'
            for source, suffix in _STANDING_FEATURES.items():
                matches.at[i, prefix + suffix] = rec[source]


_fill_pre_match_standings(df_train, 2006, df_train_2006)
_fill_pre_match_standings(df_train, 2007, df_train_2007, df_train_2006)
_fill_pre_match_standings(df_train, 2008, df_train_2008, df_train_2007)

# Suffix of the feature column in df_train/df_test -> source column in the
# standings table. The feature column name is the side prefix ('ホーム' or
# 'アウェー') followed by the suffix.
STANDING_FIELDS = {
    '勝ち点': '勝ち点',
    '得失点差': '得失点差',
    '総得点': '総得点',
    '失点': '失点',
    '勝数': '勝',
    '分数': '分',
    '負数': '負',
    '連勝': '連勝',
    '直近5試合勝数': '直近5試合勝数',
    '直近5試合負数': '直近5試合負数',
}


def _last_match_position(table, team):
    """Return the position of the last row of `table` whose 'チーム' equals `team`, or None."""
    hits = np.flatnonzero((table['チーム'] == team).to_numpy())
    return int(hits[-1]) if hits.size else None


def _fill_opening_rank(df, i, side, prev_teams, prev_final):
    """Label row `i` of `df` for an opening-section match.

    `side` is 'home' or 'away'. A team present in the previous season gets
    '前シーズン<rank>位' taken from that season's 第34節 table; a team absent from
    the previous season gets 'J1昇格チーム'. A team that played last season but
    has no 第34節 row is left untouched.
    """
    team = df.loc[i, f'{side}_team']
    target = f'{side}_team_before_rank'
    if team in prev_teams:
        pos = _last_match_position(prev_final, team)
        if pos is not None:
            df.loc[i, target] = '前シーズン' + str(prev_final['順位'].iloc[pos]) + '位'
    else:
        df.loc[i, target] = 'J1昇格チーム'


def _fill_in_season_stats(df, i, side, prefix, table):
    """Copy the team's standings from `table` (previous section) into row `i` of `df`.

    `side` is 'home' or 'away'; `prefix` is the matching feature prefix
    ('ホーム' or 'アウェー'). Nothing is written if the team has no row in `table`.
    """
    pos = _last_match_position(table, df.loc[i, f'{side}_team'])
    if pos is None:
        return
    df.loc[i, f'{side}_team_before_rank'] = str(table['順位'].iloc[pos]) + '位'
    for suffix, source in STANDING_FIELDS.items():
        df.loc[i, prefix + suffix] = table[source].iloc[pos]


def fill_pre_match_standings(df, year, prev_season, cur_season):
    """Fill pre-match rank and standings features for every match of `year` in `df`.

    Parameters
    ----------
    df : DataFrame of matches, modified in place. Rows are addressed by the
        labels 0..len(df)-1, so a default RangeIndex is expected.
    year : value compared against df['year'] to select the season's rows.
    prev_season : standings table of the previous season (columns 'チーム',
        '節', '順位', ...); used only for opening-section matches.
    cur_season : standings table of the season itself; for any other section
        the table of the preceding section is used.

    Writes go through ``df.loc[row, col]`` instead of ``df[col][row]``: chained
    assignment is not guaranteed to reach the frame and does nothing under
    pandas Copy-on-Write.
    """
    # Loop-invariant pieces, computed once instead of once per match.
    prev_teams = prev_season['チーム'].unique()
    prev_final = prev_season[prev_season['節'] == '第34節']
    section_tables = {}

    for i in range(len(df)):
        if df.loc[i, 'year'] != year:
            continue
        section = df.loc[i, 'section']
        if section == '第1節':
            for side in ('home', 'away'):
                _fill_opening_rank(df, i, side, prev_teams, prev_final)
        else:
            # '第N節' -> standings after section N-1.
            prev_section = '第' + str(int(section[1:-1]) - 1) + '節'
            if prev_section not in section_tables:
                section_tables[prev_section] = cur_season[cur_season['節'] == prev_section]
            table = section_tables[prev_section]
            _fill_in_season_stats(df, i, 'home', 'ホーム', table)
            _fill_in_season_stats(df, i, 'away', 'アウェー', table)


# NOTE(review): the date cell near the top of the notebook builds `year` as a
# string such as '2009年'. The comparisons below use ints, as the original
# loops did -- confirm `year` is converted to int before this cell, otherwise
# no row is selected.
season_tables = {
    2008: df_train_2008,
    2009: df_train_2009,
    2010: df_train_2010,
    2011: df_train_2011,
    2012: df_train_2012,
    2013: df_train_2013,
    2014: df_train_2014,
    2015: df_train_2015,
    2016: df_train_2016,
    2017: df_train_2017,
    2018: df_test_2018,
    2019: df_test_2019,
}

for season in range(2009, 2018):
    fill_pre_match_standings(df_train, season, season_tables[season - 1], season_tables[season])

for season in (2018, 2019):
    fill_pre_match_standings(df_test, season, season_tables[season - 1], season_tables[season])

In [None]:
# Points gap going into the match: home team's points minus away team's points.
for frame in (df_train, df_test):
    frame['勝ち点差'] = frame['ホーム勝ち点'].sub(frame['アウェー勝ち点'])

In [None]:
# Magnitude of the points gap, regardless of which side is ahead.
for frame in (df_train, df_test):
    frame['勝ち点差の絶対値'] = frame['勝ち点差'].abs()

## 優勝争い

In [None]:
# Flag matches where both teams sit in the top three before kick-off.
# Built as a vectorised mask: the previous per-row `df[col][i] = 1` was a
# chained assignment, which pandas does not guarantee to write through.
TITLE_RACE_RANKS = ['1位', '2位', '3位']

for frame in (df_train, df_test):
    both_in_title_race = (
        frame['home_team_before_rank'].isin(TITLE_RACE_RANKS)
        & frame['away_team_before_rank'].isin(TITLE_RACE_RANKS)
    )
    frame['優勝争い'] = both_in_title_race.astype(int)

## 残留争い

In [None]:
# Flag matches where both teams sit in the bottom three before kick-off.
# The earlier version tested the top-three ranks for df_train (a copy of the
# title-race cell) and mixed bottom-three home with top-three away for
# df_test, so the feature never described a relegation battle and was
# inconsistent between the two frames.
RELEGATION_RANKS = ['16位', '17位', '18位']

for frame in (df_train, df_test):
    both_in_relegation_zone = (
        frame['home_team_before_rank'].isin(RELEGATION_RANKS)
        & frame['away_team_before_rank'].isin(RELEGATION_RANKS)
    )
    frame['残留争い'] = both_in_relegation_zone.astype(int)

## 試合直前の順位差

In [None]:
def replace_if_starts_with_before(value):
    """Map rank labels that start with '前' or 'J' to '開幕戦'.

    Any other value, including non-string values, is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return '開幕戦' if value.startswith(('前', 'J')) else value

# Normalise both rank columns in train and test with the same rule.
for _rank_col in ('home_team_before_rank', 'away_team_before_rank'):
    df_train[_rank_col] = df_train[_rank_col].apply(replace_if_starts_with_before)
    df_test[_rank_col] = df_test[_rank_col].apply(replace_if_starts_with_before)

In [None]:
def _rank_number(ranks):
    """Return the numeric part of '<n>位' labels; every other label becomes NaN."""
    digits = ranks.astype(str).str.extract(r'^(\d+)位$', expand=False)
    return pd.to_numeric(digits, errors='coerce')


def rank_difference(df):
    """Return home rank minus away rank as an int Series aligned with `df`.

    Rows where either side has no numeric rank ('開幕戦', 'データなし',
    missing values, ...) get 0, which is what the original loops produced for
    the cases they handled.
    """
    gap = _rank_number(df['home_team_before_rank']) - _rank_number(df['away_team_before_rank'])
    return gap.fillna(0).astype(int)


# The previous test-set loop checked df_train for 'データなし' while filling
# df_test, only the home side was guarded (an unranked away side crashed
# int()), and values were written through chained assignment.
df_train['diff_rank'] = rank_difference(df_train)
df_test['diff_rank'] = rank_difference(df_test)

In [None]:
# Magnitude of the rank gap, regardless of which side is ranked higher.
df_train['diff_rankの絶対値'] = df_train['diff_rank'].abs()
df_test['diff_rankの絶対値'] = df_test['diff_rank'].abs()

## 有名人枠（フォルラン、イニエスタ、ビジャ）

In [None]:
# The 11 player columns for each side: home_team_player1 .. home_team_player11, etc.
home_columns = ['home_team_player%d' % n for n in range(1, 12)]
away_columns = ['away_team_player%d' % n for n in range(1, 12)]


def contains_famous_player(row):
    """Return True if any of the 22 player cells in `row` is a listed star player.

    The comparison is an exact match against the names below; partial names
    do not count.
    """
    famous_players = ['フォルラン', 'アンドレスイニエスタ', 'ダビドビジャ']
    for name in row[home_columns + away_columns]:
        if name in famous_players:
            return True
    return False

# Row-wise check, stored as a 0/1 integer feature.
train_has_star = df_train.apply(contains_famous_player, axis=1)
test_has_star = df_test.apply(contains_famous_player, axis=1)
df_train['famous_player'] = train_has_star.astype(int)
df_test['famous_player'] = test_has_star.astype(int)

## ホームチームとアウェーチームが同じ都道府県かどうか

In [None]:
def sameprefecture(df):
    """Add a 0/1 `same_prefecture` column flagging same-prefecture match-ups.

    Parameters:
    df (pandas.DataFrame): must contain `home_team` and `away_team` columns.

    Returns:
    pandas.DataFrame: the same object, with `same_prefecture` added or
    overwritten. 1 means both clubs belong to the same group in the table
    below and are different clubs; 0 otherwise.

    The column is always created (also for an empty frame) and is computed
    positionally, so any index works. The earlier version re-zeroed the column
    on every loop iteration, kept an unused list, and wrote values through
    chained assignment.
    """
    # Club -> prefecture group, limited to the clubs the original if/elif
    # chain paired up.
    prefecture_of = {
        'G大阪': 'osaka', 'C大阪': 'osaka',
        'FC東京': 'tokyo', '東京V': 'tokyo',
        '磐田': 'shizuoka', '清水': 'shizuoka',
        '大宮': 'saitama', '浦和': 'saitama',
        '柏': 'chiba', '千葉': 'chiba',
        '川崎F': 'kanagawa', '横浜FM': 'kanagawa',
        '横浜FC': 'kanagawa', '湘南': 'kanagawa',
    }

    home_pref = df['home_team'].map(prefecture_of)
    away_pref = df['away_team'].map(prefecture_of)

    # notna() keeps clubs outside the table from matching each other via
    # NaN handling; the inequality keeps "club vs itself" at 0 as before.
    is_same = (
        home_pref.notna()
        & (home_pref == away_pref)
        & (df['home_team'] != df['away_team'])
    )
    df['same_prefecture'] = is_same.astype(int)

    return df

# Add the same_prefecture flag to both splits.
df_train = sameprefecture(df_train)
df_test = sameprefecture(df_test)

## ホームとアウェーの距離

In [None]:
def _most_common_venue_by_team(matches):
    """Return one row per home team with the venue it appears at most often."""
    return (
        matches.groupby('home_team')['venue']
        .agg(lambda venues: venues.value_counts().idxmax())
        .rename('most_common_venue')
        .reset_index()
    )


train_team_venue_map = _most_common_venue_by_team(df_train)
test_team_venue_map = _most_common_venue_by_team(df_test)

# Teams that appear as a home team in test but never in train.
seen_in_train = test_team_venue_map['home_team'].isin(train_team_venue_map['home_team'])
new_teams = test_team_venue_map[~seen_in_train]

combined_team_venue_map = pd.concat([train_team_venue_map, new_teams]).reset_index(drop=True)

# Attach the venue master data and relabel the keys for use on the away side.
combined_with_venue_info = (
    pd.merge(
        combined_team_venue_map, df_venue,
        how='left',
        left_on='most_common_venue',
        right_on='venue',
    )
    .drop(columns=['venue'])
    .rename(columns={'home_team': 'away_team', 'most_common_venue': 'away_venue'})
)

In [None]:
# Start the coordinate columns as float NaN ("not geocoded yet").
# Integer zeros made the columns int64, so the later float writes were
# incompatible-dtype assignments, and 0/0 is itself a valid coordinate.
combined_with_venue_info[['latitude', 'longitude']] = np.nan, np.nan

In [None]:
# Imported here because this is the only cell that needs it.
from geopy.extra.rate_limiter import RateLimiter

geo = Nominatim(user_agent="myapp")

# The public Nominatim service expects callers to stay at or below one request
# per second; RateLimiter enforces the delay and retries transient errors
# (returning None if they persist) instead of aborting the whole loop.
_geocode = RateLimiter(geo.geocode, min_delay_seconds=1)


def _first_location(queries):
    """Return the first geocoding hit among `queries`, or None if none resolves."""
    for query in queries:
        # Skip NaN / empty cells rather than sending them to the geocoder.
        if not isinstance(query, str) or not query.strip():
            continue
        hit = _geocode(query)
        if hit:
            return hit
    return None


# Make sure the target columns can hold float coordinates and NaN.
combined_with_venue_info[['latitude', 'longitude']] = (
    combined_with_venue_info[['latitude', 'longitude']].astype(float)
)

for index, row in combined_with_venue_info.iterrows():
    # Try the venue name first, then fall back to its street address.
    location = _first_location([row['away_venue'], row['address']])
    combined_with_venue_info.loc[index, 'latitude'] = location.latitude if location else np.nan
    combined_with_venue_info.loc[index, 'longitude'] = location.longitude if location else np.nan

In [None]:
# Manual coordinate overrides: row label in combined_with_venue_info -> query
# string sent to the geocoder.
# NOTE(review): the integer labels depend on the row order of
# combined_with_venue_info; confirm they still point at the intended teams
# whenever the input data changes.
manual_queries = {
    10: "徳島県鳴門市撫養町立岩",
    14: "長野県松本市神林",
    15: "日立柏サッカー場",
    16: "神奈川県横浜市神奈川区",
    20: "神奈川県平塚市大原",
    21: "山梨県甲府市小瀬町",
    24: "福岡県福岡市博多区東平尾公園2丁目",
}

for row_label, query in manual_queries.items():
    location = geo.geocode(query)
    combined_with_venue_info.at[row_label, 'latitude'] = location.latitude
    combined_with_venue_info.at[row_label, 'longitude'] = location.longitude

In [None]:
# Attach each home team's coordinates as home_latitude / home_longitude.
combined_with_venue_info = combined_with_venue_info.rename(columns={'away_team': 'home_team'})
home_coordinates = combined_with_venue_info[['home_team', 'latitude', 'longitude']]
home_coordinate_names = {'latitude': 'home_latitude', 'longitude': 'home_longitude'}

df_train = df_train.merge(home_coordinates, on='home_team', how='left').rename(columns=home_coordinate_names)
df_test = df_test.merge(home_coordinates, on='home_team', how='left').rename(columns=home_coordinate_names)

In [None]:
# Attach each away team's coordinates as away_latitude / away_longitude.
combined_with_venue_info = combined_with_venue_info.rename(columns={'home_team': 'away_team'})
away_coordinates = combined_with_venue_info[['away_team', 'latitude', 'longitude']]
away_coordinate_names = {'latitude': 'away_latitude', 'longitude': 'away_longitude'}

df_train = df_train.merge(away_coordinates, on='away_team', how='left').rename(columns=away_coordinate_names)
df_test = df_test.merge(away_coordinates, on='away_team', how='left').rename(columns=away_coordinate_names)

In [None]:
def hubeny_distance(lat1, lon1, lat2, lon2):
    """
    Compute the distance between two points with Hubeny's formula.

    Parameters:
    lat1 (float): latitude of the first point (degrees)
    lon1 (float): longitude of the first point (degrees)
    lat2 (float): latitude of the second point (degrees)
    lon2 (float): longitude of the second point (degrees)

    Returns:
    float: distance between the two points (metres)
    """

    # Ellipsoid parameters (GRS80 / WGS84)
    a = 6378137.0        # equatorial radius (metres)
    b = 6356752.314245   # polar radius (metres)
    f = (a - b) / a      # flattening
    e2 = f * (2 - f)     # first eccentricity squared

    # Convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Differences in latitude and longitude
    d_lat = lat2 - lat1
    d_lon = lon2 - lon1

    # Mean latitude
    avg_lat = (lat1 + lat2) / 2.0

    # Radii of curvature at the mean latitude
    sin_lat = math.sin(avg_lat)
    w = math.sqrt(1 - e2 * sin_lat ** 2)
    # Meridian radius is a(1 - e^2) / W^3. The earlier a(1 - f) numerator
    # equals b rather than b^2/a and overstated north-south distances by
    # roughly 0.34%.
    m = a * (1 - e2) / (w ** 3)     # meridian radius of curvature
    n = a / w                       # prime-vertical radius of curvature

    # Combine the north-south and east-west components
    d_north = m * d_lat
    d_east = n * math.cos(avg_lat) * d_lon
    distance = math.sqrt(d_north ** 2 + d_east ** 2)

    return distance

In [None]:
def calculate_distance(df):
    """Add a `distance` column with the home-to-away Hubeny distance per row.

    Reads `home_latitude`, `home_longitude`, `away_latitude` and
    `away_longitude`, and returns the same DataFrame with a float `distance`
    column (the value returned by hubeny_distance for each row).

    The column is built in one pass and assigned as float. The earlier version
    initialised it with integer 0 and then wrote floats cell by cell through
    label-based `[i]` lookups, which needs a 0..n-1 index and an implicit
    int-to-float upcast.
    """
    coordinates = zip(
        df['home_latitude'], df['home_longitude'],
        df['away_latitude'], df['away_longitude'],
    )
    distances = [
        hubeny_distance(home_lat, home_lon, away_lat, away_lon)
        for home_lat, home_lon, away_lat, away_lon in coordinates
    ]
    df['distance'] = pd.Series(distances, index=df.index, dtype=float)
    return df

df_train = calculate_distance(df_train)
df_test = calculate_distance(df_test)

## 満席率

In [None]:
# Share of the venue capacity that was filled; only train has attendance.
df_train['attendance_rate'] = df_train['attendance'].div(df_train['capacity'])

# データ作成

In [None]:
# Persist the engineered frames. The index is written too, so reading these
# files back yields an extra 'Unnamed: 0' column.
df_train.to_csv('df_train_new.csv')
df_test.to_csv('df_test_new.csv')

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, coo_matrix
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import RFE
import japanize_matplotlib
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Reload the feature tables from disk; this replaces the in-memory frames.
# NOTE(review): the cell that saves features writes 'df_train_new.csv' and
# 'df_test_new.csv', but this cell reads 'df_train.csv' / 'df_test.csv'.
# Confirm which files are intended; a fresh Restart & Run All does not
# create the files read here.
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')
submit = pd.read_csv('submit.csv', header=None)

# 特徴量選択

In [None]:
# Keep the ids aside; 'id' is dropped from the feature frame below and
# re-attached to the targets before the train/validation split.
df_tr_id = df_train['id']

In [None]:
# Display the available columns as a reference for the selections below.
df_train.columns

In [None]:
# Columns that are not model inputs: the saved index, the id, the 22 player
# name columns, the address columns and the weekday label.
player_columns = [
    f'{side}_team_player{n}'
    for side in ('home', 'away')
    for n in range(1, 12)
]
df_tr = df_train.drop(
    columns=['Unnamed: 0', 'id', *player_columns, 'address', 'away_address', 'dayofweek']
)

In [None]:
# Keep the test capacities (used to turn predicted rates back into
# attendance for the submission) and the test ids.
df_capacity = df_test['capacity']
df_test_id = df_test['id']

In [None]:
# Explicit list of the test-side feature columns, in model input order.
test_feature_columns = [
    'section', 'round', 'home_team_score', 'away_team_score',
    'home_team', 'away_team', 'venue',
    'weather', 'temperature', 'humidity', 'broadcasters', 'capacity',
    'day_of_year_sin', 'day_of_year_cos',
    'day_of_month_sin', 'day_of_month_cos',
    'day_of_week_sin', 'day_of_week_cos',
    'time_of_day_sin', 'time_of_day_cos',
    'is_holiday', 'rest', 'holiday_streak',
    'ホーム勝ち点', 'ホーム得失点差', 'ホーム総得点',
    'アウェー勝ち点', 'アウェー得失点差', 'アウェー総得点',
    'home_team_before_rank', 'away_team_before_rank',
    '勝ち点差', '勝ち点差の絶対値',
    'diff_rank', 'diff_rankの絶対値',
    'attendance_mean_section',
    'famous_player', 'same_prefecture', 'distance',
]
test = df_test[test_feature_columns]

In [None]:
# Stack train on top of test so both get the same encoding. Row labels are
# not reset, so later splitting is done by position.
df = pd.concat([df_tr, test], axis=0)

In [None]:
# Truncate distances to whole numbers.
# NOTE(review): astype(int) raises if any distance is NaN (for example a
# team without coordinates); confirm the column is always complete.
df['distance'] = df['distance'].astype(int)

In [None]:
# One-hot encode the non-numeric columns as 0/1 integers; drop_first removes
# the first level of each encoded column.
df = pd.get_dummies(df, dtype=int, drop_first=True)

In [None]:
def convert_feature_names(df):
    """Replace every non-word character in the column labels with '_'.

    Word characters follow Python's Unicode-aware `\\w`, so Japanese labels are
    kept as they are. Labels are converted with str() first so non-string
    labels do not break the substitution. The frame is modified in place and
    also returned.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    df.columns = [re.sub(r'\W', '_', str(col)) for col in df.columns]
    return df

# Sanitise the column labels produced by the one-hot encoding.
df = convert_feature_names(df)

In [None]:
# Split the stacked frame back into train and test by position. The boundary
# is the number of training rows that went into the concat; the hard-coded
# 3661 would go stale as soon as the input data changes.
n_train_rows = len(df_tr)
train_df = df.iloc[:n_train_rows, :]
test_df = df.iloc[n_train_rows:, :]

# 訓練

In [None]:
# Columns kept out of the feature matrix: the raw target, the capacity used to
# rescale predictions, and the rate the models are trained on.
target_columns = ['attendance', 'capacity', 'attendance_rate']

X = train_df.drop(columns=target_columns)
y = pd.concat([df_tr_id, train_df[target_columns]], axis=1)

# 60/40 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,
    random_state=42,
)

## LightGBM

In [None]:
max_estimators = 10000
early_stopping_limit = 100

# Hyper-parameters for the LightGBM regressor.
params = dict(
    n_estimators=max_estimators,
    max_depth=32,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='regression',
    metric='rmse',
    random_state=42,
    learning_rate=0.004,
    verbose=-1,
)

model_lgb = lgb.LGBMRegressor(**params)


def callback(env):
    """Print the first evaluation result every 10th boosting iteration."""
    if env.iteration % 10 != 0:
        return
    print("Iteration:", env.iteration, "\tRMSE:", env.evaluation_result_list[0][2])


# Train on the attendance rate; the hold-out split drives early stopping.
model_lgb.fit(
    X_train,
    y_train['attendance_rate'],
    eval_set=[(X_test, y_test['attendance_rate'])],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_limit),
        callback,
    ],
)

In [None]:
# Re-index the hold-out targets by position, then scale the predicted rate by
# each match's capacity to get predicted attendance.
y_test = y_test.reset_index(drop=True)
y_pred_lgb = model_lgb.predict(X_test) * y_test['capacity'].to_numpy()

print('Test RMSE: %.3f' % np.sqrt(mean_squared_error(y_test['attendance'], y_pred_lgb)))

In [None]:
# Measured vs predicted attendance; the red diagonal marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y_test['attendance'], y_pred_lgb)
ax.plot([0, 60000], [0, 60000], color='red')
ax.set_xlabel("measured_value")
ax.set_ylabel("predict_value")
ax.set_aspect('equal', adjustable='box')
plt.show()

## XGBoost



In [None]:
# early_stopping_rounds is an estimator parameter in current XGBoost; passing
# it to fit() was deprecated in 1.6 and removed in 2.0, where it raises a
# TypeError. Requires xgboost >= 1.6.
model_XGB = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    early_stopping_rounds=100,
)
model_XGB.fit(
    X_train, y_train['attendance_rate'],
    eval_set=[(X_test, y_test['attendance_rate'])],
    verbose=False,
)

In [None]:
# Scale the predicted rate by capacity to get predicted attendance.
# The multiplication is positional (to_numpy), so this cell no longer depends
# on y_test having been re-indexed by an earlier cell, and the product is not
# written back into the float32 array that XGBoost returns.
y_pred_XGB = model_XGB.predict(X_test) * y_test['capacity'].to_numpy()

print('Test RMSE: %.3f' % np.sqrt(mean_squared_error(y_test['attendance'], y_pred_XGB)))

In [None]:
# Measured vs predicted attendance; the red diagonal marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y_test['attendance'], y_pred_XGB)
ax.plot([0, 60000], [0, 60000], color='red')
ax.set_xlabel("measured_value")
ax.set_ylabel("predict_value")
ax.set_aspect('equal', adjustable='box')
plt.show()

## CatBoost

In [None]:
# Hyper-parameters for the CatBoost regressor.
catboost_settings = dict(
    iterations=100000,
    learning_rate=0.004,
    depth=5,
    eval_metric='RMSE',
    colsample_bylevel=0.8,
    random_seed=42,
    bagging_temperature=0.2,
    metric_period=None,
    early_stopping_rounds=200,
)
model_CatBoost = CatBoostRegressor(**catboost_settings)

# Train on the attendance rate; the hold-out split selects the best iteration.
model_CatBoost.fit(
    X_train,
    y_train['attendance_rate'],
    eval_set=(X_test, y_test['attendance_rate']),
    use_best_model=True,
    verbose=False,
)

In [None]:
# Scale the predicted rate by capacity to get predicted attendance.
# The multiplication is positional (to_numpy), so this cell no longer depends
# on y_test having been re-indexed by an earlier cell.
y_pred_CatBoost = model_CatBoost.predict(X_test) * y_test['capacity'].to_numpy()

print('Test RMSE: %.3f' % np.sqrt(mean_squared_error(y_test['attendance'], y_pred_CatBoost)))

In [None]:
# Measured vs predicted attendance; the red diagonal marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y_test['attendance'], y_pred_CatBoost)
ax.plot([0, 60000], [0, 60000], color='red')
ax.set_xlabel("measured_value")
ax.set_ylabel("predict_value")
ax.set_aspect('equal', adjustable='box')
plt.show()

# 提出

In [None]:
# Capacities for rescaling, and the test matrix without the three target
# columns that were excluded from X.
capa = df_capacity
test_df = test_df.drop(columns=['attendance', 'attendance_rate', 'capacity'])

In [None]:
pred = model_CatBoost.predict(test_df)

# One prediction and one capacity per submission row; fail loudly on a
# mismatch instead of writing a partially filled file.
assert len(pred) == len(submit) == len(capa), (
    f"row count mismatch: pred={len(pred)}, submit={len(submit)}, capa={len(capa)}"
)

# Predicted rate x capacity = predicted attendance. Assign the whole column at
# once: the previous chained form `submit[1][i] = ...` is unreliable and writes
# nothing under pandas Copy-on-Write.
submit[1] = pred * capa.to_numpy()
submit = submit.astype('int')  # truncates towards zero, as before
submit

In [None]:
# Write the predictions without header or index.
# NOTE(review): this overwrites 'submit.csv', the same file that is read as
# the submission template earlier; confirm that is intended.
submit.to_csv('submit.csv', header=False, index=False)