### Import

In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.linear_model import LogisticRegression

In [2]:
def seed_everything(seed):
    """Make the run repeatable by seeding every global source of randomness used here.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global random state.

    Parameters
    ----------
    seed : int
        Seed value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)

In [3]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Data Preprocessing 1 : Time Split

In [4]:
# Derive integer year / month / day columns from fixed character positions of
# the 'date' string (0-3, 5-6, 8-9), then drop the raw 'date' column.
# The vectorized .str accessor replaces a per-row Python lambda, and the frame
# is rebound to a new object instead of being mutated with inplace=True.
train = train.assign(
    year=lambda df: df['date'].str[0:4].astype('int64'),
    month=lambda df: df['date'].str[5:7].astype('int64'),
    day=lambda df: df['date'].str[8:10].astype('int64'),
).drop(columns=['date'])

In [5]:
# Derive integer year / month / day columns from fixed character positions of
# the 'date' string (0-3, 5-6, 8-9), then drop the raw 'date' column.
# The vectorized .str accessor replaces a per-row Python lambda, and the frame
# is rebound to a new object instead of being mutated with inplace=True.
test = test.assign(
    year=lambda df: df['date'].str[0:4].astype('int64'),
    month=lambda df: df['date'].str[5:7].astype('int64'),
    day=lambda df: df['date'].str[8:10].astype('int64'),
).drop(columns=['date'])

### Data Preprocessing 2 : Create Future Stats

In [6]:
# Names of the per-match statistic columns. Each statistic exists once per
# side, in the form '<stat>(<side>)', with the home side listed first.
stats_columns = [
    f'{stat}({side})'
    for stat in (
        'halfTimeGoals',
        'shots',
        'shotsOnTarget',
        'corners',
        'fouls',
        'yellowCards',
        'redCards',
    )
    for side in ('homeTeam', 'awayTeam')
]

In [7]:
# Key every training row by its ordered fixture, "<homeTeam>-<awayTeam>".
train['match'] = train['homeTeam'].str.cat(train['awayTeam'], sep='-')

# Average each statistic over all training matches that share a fixture key.
pair_stats = (
    train.groupby('match')[stats_columns]
    .agg('mean')
    .reset_index()
)

In [8]:
# Build the same "<homeTeam>-<awayTeam>" key on the test rows.
test['match'] = test['homeTeam'].str.cat(test['awayTeam'], sep='-')

# Attach the per-fixture averages from training. Fixtures with no match in
# pair_stats come back as NaN from the left join and are filled with the mean
# of the per-fixture averages (the column means of pair_stats).
test_with_stats = (
    test.merge(pair_stats, on='match', how='left')
    .fillna(pair_stats[stats_columns].mean())
)

### Data Preprocessing 3 : Select x, y

In [9]:
# The target is the 'result' column; the identifier, the final goal counts and
# the target itself are removed from the training features.
train_y = train['result']
train_x = train.drop(
    columns=['matchID', 'goals(homeTeam)', 'goals(awayTeam)', 'result']
)

# Test features: drop the identifier, then select the training feature columns
# in the training column order.
test_x = test_with_stats.drop(columns=['matchID'])[train_x.columns]

### Data Preprocessing 4 : Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

# Every object-dtype feature column is converted to integer codes.
encoding_target = [
    name for name, dtype in train_x.dtypes.items() if dtype == "object"
]

for col in encoding_target:
    # Fit the encoder on the training values only, then encode them.
    encoder = LabelEncoder().fit(train_x[col])
    train_x[col] = encoder.transform(train_x[col])

    # Categories that appear only in the test data are appended to
    # encoder.classes_ so that transform() can encode them as well.
    unseen = np.setdiff1d(test_x[col], encoder.classes_)
    encoder.classes_ = np.append(encoder.classes_, unseen)

    test_x[col] = encoder.transform(test_x[col])

### Model Setting

In [11]:
# L2-regularised logistic regression; C is the inverse regularisation strength
# and max_iter caps the number of solver iterations.
model = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=100,
)

### Model Train and Inference

In [12]:
# Fit on the training features, then compute class probabilities for the test rows.
prediction = model.fit(train_x, train_y).predict_proba(test_x)

# Show the fitted class labels followed by the probability matrix.
display(model.classes_, prediction)

array(['A', 'D', 'H'], dtype=object)

array([[0.29722315, 0.24675809, 0.45601876],
       [0.2648828 , 0.24670025, 0.48841694],
       [0.33289017, 0.2452336 , 0.42187623],
       [0.33152879, 0.24532218, 0.42314903],
       [0.25432   , 0.24634569, 0.49933431],
       [0.25784276, 0.24648413, 0.49567311],
       [0.2681867 , 0.2467681 , 0.4850452 ],
       [0.32723978, 0.24557817, 0.42718205],
       [0.29240865, 0.24684172, 0.46074964],
       [0.27461579, 0.24687169, 0.47851252],
       [0.33277222, 0.24524221, 0.42198557],
       [0.26902075, 0.24679624, 0.484183  ],
       [0.33310604, 0.24521912, 0.42167484],
       [0.2834218 , 0.24690947, 0.46966872],
       [0.32681354, 0.24561177, 0.4275747 ],
       [0.30360457, 0.24659976, 0.44979568],
       [0.26431146, 0.2466917 , 0.48899684],
       [0.2737302 , 0.24686588, 0.47940391],
       [0.30603782, 0.24654158, 0.44742059],
       [0.3117666 , 0.2463316 , 0.4419018 ],
       [0.333307  , 0.24521167, 0.42148133],
       [0.25805476, 0.24649425, 0.49545099],
       [0.

### Submission

In [13]:
# Load the submission template; its rows are overwritten with predictions below.
sample_submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')
sample_submission

Unnamed: 0,matchID,A,D,H
0,MATCH_9008,0,1,0
1,MATCH_9009,0,1,0
2,MATCH_9010,0,1,0
3,MATCH_9011,0,1,0
4,MATCH_9012,0,1,0
...,...,...,...,...
83,MATCH_9091,0,1,0
84,MATCH_9092,0,1,0
85,MATCH_9093,0,1,0
86,MATCH_9094,0,1,0


In [14]:
# Write the predicted probabilities into the submission frame by class label
# instead of by position. The displayed probability columns correspond to
# model.classes_, so label-based assignment keeps every probability under its
# matching column even if the template orders its columns differently.
# Replacing whole columns (rather than an in-place iloc write) also avoids
# storing floats into the template's integer placeholder columns.
class_columns = list(model.classes_)
missing_columns = [c for c in class_columns if c not in sample_submission.columns]
if missing_columns:
    # Fail loudly rather than silently appending new, unexpected columns.
    raise KeyError(f"sample_submission is missing class columns: {missing_columns}")
sample_submission[class_columns] = prediction
sample_submission

Unnamed: 0,matchID,A,D,H
0,MATCH_9008,0.297223,0.246758,0.456019
1,MATCH_9009,0.264883,0.246700,0.488417
2,MATCH_9010,0.332890,0.245234,0.421876
3,MATCH_9011,0.331529,0.245322,0.423149
4,MATCH_9012,0.254320,0.246346,0.499334
...,...,...,...,...
83,MATCH_9091,0.333225,0.245209,0.421567
84,MATCH_9092,0.333077,0.245223,0.421700
85,MATCH_9093,0.269740,0.246803,0.483457
86,MATCH_9094,0.266644,0.246744,0.486612


In [15]:
# Save the filled-in submission; index=False keeps the row index out of the file.
sample_submission.to_csv(path_or_buf='baseline_submission.csv', index=False)