In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [3]:
def pred_score(y_test, y_prob):
    """Print multiclass ROC AUC summaries for a set of predicted scores.

    Four values are obtained from ``roc_auc_score``: one-vs-one and
    one-vs-rest, each with macro and prevalence-weighted averaging. They are
    printed with six decimals; nothing is returned.

    Args:
        y_test: True labels, forwarded to ``roc_auc_score`` unchanged.
        y_prob: Predicted scores, forwarded to ``roc_auc_score`` unchanged.
    """
    schemes = ("ovo", "ovr")
    averages = ("macro", "weighted")

    # Every combination is evaluated before anything is printed, so a failing
    # call never leaves a partial report behind.
    auc = {
        (scheme, avg): roc_auc_score(y_test, y_prob, multi_class=scheme, average=avg)
        for scheme in schemes
        for avg in averages
    }

    ovo_report = "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} (weighted by prevalence)"
    ovr_report = "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} (weighted by prevalence)"
    print(ovo_report.format(auc["ovo", "macro"], auc["ovo", "weighted"]))
    print(ovr_report.format(auc["ovr", "macro"], auc["ovr", "weighted"]))

In [4]:
# Load the training shots; the matchId / playerId columns are discarded on load.
df = pd.read_csv('train.csv').drop(columns=['matchId', 'playerId'])
# The test set is loaded as-is.
df_test = pd.read_csv('test.csv')

# Preview the first training rows.
df.head()

Unnamed: 0,playType,bodyPart,x,y,interveningOpponents,interveningTeammates,interferenceOnShooter,minute,second,outcome
0,جریان بازی,پای راست,13.47,-11.22,1,0,متوسط,70,9,گُل
1,جریان بازی,پای چپ,9.48,14.22,3,0,متوسط,55,4,مهار توسط دروازه بان
2,ضربه آزاد مستقیم,پای چپ,29.43,-1.25,6,2,کم,86,31,مهار توسط دروازه بان
3,جریان بازی,پای راست,26.93,1.0,4,1,متوسط,77,2,موقعیت از دست رفته
4,جریان بازی,پای راست,10.72,5.24,2,0,متوسط,76,46,گُل


In [5]:
# Binary target: 1 when the shot ended in a goal (own goals included), 0 otherwise.
goal_labels = ['گُل', 'گُل به خودی']
non_goal_labels = ['برخورد به تیردروازه', 'مهار توسط دروازه بان',
                   'موقعیت از دست رفته', 'برخورد به دفاع']

outcome = df['outcome']
# Fail loudly on any label outside the two lists (missing values included)
# instead of leaving raw strings in the target for a later cell to trip over.
unexpected = outcome[~outcome.isin(goal_labels + non_goal_labels)].unique()
if len(unexpected) > 0:
    raise ValueError("unexpected outcome labels: {}".format(list(unexpected)))
y = outcome.isin(goal_labels).astype('int')

X = df.drop(columns='outcome')

# Work on a copy so that filling missing values does not modify df_test itself.
X_test = df_test.copy()

# Missing interference values are replaced by this category in both frames.
missing_interference = 'بیش از یک بازیکن'
X['interferenceOnShooter'] = X['interferenceOnShooter'].fillna(missing_interference)
X_test['interferenceOnShooter'] = X_test['interferenceOnShooter'].fillna(missing_interference)

# 'minute' and 'second' stay in the feature set.


In [6]:
# Dimensions of both feature frames as (num_rows, num_columns)
print(
    "train size is {}".format(X.shape),
    "test size is {}".format(X_test.shape),
    sep="\n",
)

# Missing values per training column; only columns with at least one gap are shown
missing_val_count_by_column = X.isna().sum()
print("missing val count by column {}".format(
    missing_val_count_by_column.loc[missing_val_count_by_column > 0]
))

train size is (8925, 9)
test size is (2000, 9)
missing val count by column Series([], dtype: int64)


In [7]:
# Columns stored with the object dtype are treated as the categorical variables
s = X.dtypes.eq('object')
object_cols = s[s].index.tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['playType', 'bodyPart', 'interferenceOnShooter']


In [8]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# One-hot encode the categorical columns. The encoder is fitted on the training
# frame only; handle_unknown='error' makes transform raise if the test frame
# contains a category that was not seen during fitting.
try:
    OH_encoder = OneHotEncoder(handle_unknown='error', sparse_output=False, dtype=np.int64)
except TypeError:
    # scikit-learn < 1.2 spells the dense-output switch `sparse`
    OH_encoder = OneHotEncoder(handle_unknown='error', sparse=False, dtype=np.int64)
train_X_encoded = OH_encoder.fit_transform(X[object_cols])

# get_feature_names was superseded by get_feature_names_out and later removed;
# use whichever this scikit-learn version provides.
if hasattr(OH_encoder, 'get_feature_names_out'):
    column_name = OH_encoder.get_feature_names_out(object_cols)
else:
    column_name = OH_encoder.get_feature_names(object_cols)

# The encoder returns plain arrays; rebuild frames on the original row index so
# the concatenation below lines rows up with the numerical columns.
OH_cols_train = pd.DataFrame(train_X_encoded, columns=column_name, index=X.index)
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]),
                            columns=column_name, index=X_test.index)

# Remove categorical columns (they are replaced by the one-hot encoding)
num_X = X.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X = pd.concat([num_X, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)
print('train shape',OH_X.shape)
print('test shape',OH_X_test.shape)

train shape (8925, 18)
test shape (2000, 18)


In [9]:
# Make sure the labels carry an integer dtype before they reach scikit-learn
y = y.astype(int)

# Hold out 35% of the encoded training rows; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    OH_X,
    y,
    test_size=0.35,
    random_state=0,
)


In [16]:
from sklearn import svm

# probability=True makes SVC fit an internal cross-validated calibration that
# shuffles the data; pinning random_state keeps the predicted probabilities
# reproducible from run to run.
SVM = svm.SVC(probability=True, random_state=0)

# The model is trained on the complete encoded training set
SVM.fit(OH_X, y)

# Class-probability estimates for the encoded test set, one column per class
y_prob = SVM.predict_proba(OH_X_test)

In [19]:
# Keep only the second predict_proba column (the probability of class 1).
# A dedicated name is used so the training frame `df` is not overwritten and
# the cells that read it can still be re-run.
submission = pd.DataFrame(y_prob[:, 1])

# Written with pandas' default index and header
submission.to_csv('output.csv')