In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re


## Competitions

In [19]:
# Load the competition metadata; the path is relative to the notebook's working directory.
competitions = pd.read_json('../metadata/competitions.json')

# Display the loaded frame.
competitions

Unnamed: 0,category,deadline,files,ref,reward,teamCount,userHasEntered
0,Getting Started,2030-01-01 00:00:00,"[{'name': 'data_description.txt', 'size': '13K...",house-prices-advanced-regression-techniques,Knowledge,4692,False
1,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '73MB', 'creati...",digit-recognizer,Knowledge,2572,False
2,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '60KB', 'creati...",titanic,Knowledge,10168,True
3,Research,2029-12-31 07:00:00,[{'name': 'imagenet_object_localization.tar.gz...,imagenet-object-localization-challenge,Knowledge,21,False
4,Playground,2019-01-01 23:59:00,"[{'name': 'item_categories.csv', 'size': '3KB'...",competitive-data-science-predict-future-sales,Kudos,1181,True
5,Featured,2018-10-19 23:59:00,"[{'name': 'depths.csv', 'size': '322KB', 'crea...",tgs-salt-identification-challenge,"$100,000",956,False
6,Featured,2018-10-04 23:59:00,"[{'name': 'sample_submission.csv', 'size': '2M...",airbus-ship-detection,"$60,000",212,False
7,Playground,2018-09-25 23:59:00,"[{'name': 'sample_submission.csv', 'size': '33...",new-york-city-taxi-fare-prediction,Knowledge,368,False
8,Playground,2018-09-24 23:59:00,"[{'name': 'check_correlation.csv.zip', 'size':...",flavours-of-physics-kernels-only,Knowledge,31,False
9,Playground,2018-09-24 23:59:00,"[{'name': 'test.csv.zip', 'size': '12MB', 'cre...",forest-cover-type-kernels-only,Knowledge,188,False


## Feature Engineering

In [3]:
from re import sub
from decimal import Decimal

def money_to_float(row):
    """Convert a competition's ``reward`` text into a dollar amount.

    Parameters
    ----------
    row : object exposing a ``reward`` attribute (e.g. a DataFrame row)
        ``reward`` is normally text such as ``'$100,000'`` or ``'Knowledge'``.

    Returns
    -------
    float
        The reward with every character except digits and ``.`` stripped,
        or ``0.0`` when the reward is not a string or contains no ``$``.
    """
    reward = row.reward
    # A missing reward (None / NaN) is not a string; treat it like a non-cash
    # reward instead of failing on the ``in`` test.
    if not isinstance(reward, str) or '$' not in reward:
        # Always return a float so both branches have the same type.
        return 0.0
    return float(sub(r'[^\d.]', '', reward))

# Derive a numeric reward column by running money_to_float over every row.
competitions['rewardInDollar'] = competitions.apply(money_to_float, axis='columns')

competitions

Unnamed: 0,category,deadline,files,ref,reward,teamCount,userHasEntered,rewardInDollar
0,Getting Started,2030-01-01 00:00:00,"[{'name': 'data_description.txt', 'size': '13K...",house-prices-advanced-regression-techniques,Knowledge,4692,False,0.0
1,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '73MB', 'creati...",digit-recognizer,Knowledge,2572,False,0.0
2,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '60KB', 'creati...",titanic,Knowledge,10168,True,0.0
3,Research,2029-12-31 07:00:00,[{'name': 'imagenet_object_localization.tar.gz...,imagenet-object-localization-challenge,Knowledge,21,False,0.0
4,Playground,2019-01-01 23:59:00,"[{'name': 'item_categories.csv', 'size': '3KB'...",competitive-data-science-predict-future-sales,Kudos,1181,True,0.0
5,Featured,2018-10-19 23:59:00,"[{'name': 'depths.csv', 'size': '322KB', 'crea...",tgs-salt-identification-challenge,"$100,000",956,False,100000.0
6,Featured,2018-10-04 23:59:00,"[{'name': 'sample_submission.csv', 'size': '2M...",airbus-ship-detection,"$60,000",212,False,60000.0
7,Playground,2018-09-25 23:59:00,"[{'name': 'sample_submission.csv', 'size': '33...",new-york-city-taxi-fare-prediction,Knowledge,368,False,0.0
8,Playground,2018-09-24 23:59:00,"[{'name': 'check_correlation.csv.zip', 'size':...",flavours-of-physics-kernels-only,Knowledge,31,False,0.0
9,Playground,2018-09-24 23:59:00,"[{'name': 'test.csv.zip', 'size': '12MB', 'cre...",forest-cover-type-kernels-only,Knowledge,188,False,0.0


### Create additional file-based features

Note: many files are compressed, so their actual (uncompressed) size will be larger than the size reported here.

In [4]:
# Numeric part of a size string such as '13KB' or '1.5GB'. Raw strings avoid the
# invalid-escape warning that '\d' triggers inside a normal string literal.
p_size = re.compile(r'\d+(?:\.\d+)?')
# Unit suffix of a size string, e.g. 'KB'. Matching letters only keeps a
# decimal point or a space from being mistaken for the unit.
p_type = re.compile(r'[A-Za-z]+')

# Multipliers that convert one unit of the given suffix into megabytes.
_UNIT_TO_MB = {
    'B': 1 / (1024 * 1024),
    'KB': 1 / 1024,
    'MB': 1,
    'GB': 1024,
    'TB': 1024 * 1024,
}


def file_size_kb(size):
    """Parse a human-readable size string and return its value in megabytes.

    Despite the historical ``_kb`` suffix, the result is expressed in MB
    (see ``file_type_factor``).

    Parameters
    ----------
    size : str
        Text made of a number followed by a unit, e.g. ``'13KB'``, ``'1.5GB'``.

    Returns
    -------
    int or float
        The size in megabytes.

    Raises
    ------
    ValueError
        If ``size`` contains no number or no unit.
    """
    number_match = p_size.search(size)
    unit_match = p_type.search(size)
    if number_match is None or unit_match is None:
        raise ValueError('cannot parse file size: {!r}'.format(size))

    number_text = number_match.group()
    # Keep integer arithmetic for whole numbers; only decimals become floats.
    number = float(number_text) if '.' in number_text else int(number_text)
    return number * file_type_factor(unit_match.group())


def file_type_factor(file_type):
    """Return the factor that converts a value in ``file_type`` units to megabytes.

    The lookup ignores case and surrounding whitespace. Unrecognised units
    fall back to ``1``, i.e. the value is treated as already being in MB.
    """
    return _UNIT_TO_MB.get(str(file_type).strip().upper(), 1)



In [5]:
def file_size(row):
    """Return the combined size of every file listed in ``row.files``.

    Each entry's ``'size'`` text is converted with ``file_size_kb`` and the
    converted values are added up.
    """
    return sum(file_size_kb(entry['size']) for entry in row.files)

def file_size_without_test(row):
    """Return the combined size of the files in ``row.files``, skipping test files.

    A file is skipped when its ``'name'`` contains the substring ``'test'``;
    the remaining sizes are converted with ``file_size_kb`` and added up.
    """
    total = 0
    for entry in row.files:
        if 'test' in entry['name']:
            continue
        total += file_size_kb(entry['size'])
    return total

def file_type(row):
    """Classify a competition by the kind of files listed in ``row.files``.

    The check is a case-insensitive substring match on each file's ``'name'``,
    so a compressed file such as ``'test.csv.zip'`` still counts as ``csv``.

    Returns
    -------
    str
        ``'csv'``, ``'txt'`` or ``'json'`` when *every* file name contains that
        token (checked in this order), ``'image'`` when at least one name
        contains ``'jpg'``, otherwise ``'other'``. An empty file list is
        ``'other'``.
    """
    names = [entry['name'].lower() for entry in row.files]

    # Without this guard an empty list satisfies "all files are csv"
    # vacuously and would be reported as 'csv'.
    if not names:
        return 'other'

    for kind in ('csv', 'txt', 'json'):
        if all(kind in name for name in names):
            return kind

    if any('jpg' in name for name in names):
        return 'image'
    return 'other'

# Number of files attached to each competition.
competitions['fileCount'] = competitions['files'].apply(len)
# Total size of all files, and of all files except those with 'test' in the name.
competitions['fileSizeMB'] = competitions.apply(file_size, axis='columns')
competitions['fileSizeWithoutTestSetMB'] = competitions.apply(file_size_without_test, axis='columns')

# Coarse label describing the kind of files a competition ships.
competitions['fileType'] = competitions.apply(file_type, axis='columns')

competitions.head()

Unnamed: 0,category,deadline,files,ref,reward,teamCount,userHasEntered,rewardInDollar,fileCount,fileSizeMB,fileSizeWithoutTestSetMB,fileType
0,Getting Started,2030-01-01 00:00:00,"[{'name': 'data_description.txt', 'size': '13K...",house-prices-advanced-regression-techniques,Knowledge,4692,False,0.0,7,1.094727,0.583984,other
1,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '73MB', 'creati...",digit-recognizer,Knowledge,2572,False,0.0,3,122.229492,73.229492,csv
2,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '60KB', 'creati...",titanic,Knowledge,10168,True,0.0,3,0.088867,0.061523,csv
3,Research,2029-12-31 07:00:00,[{'name': 'imagenet_object_localization.tar.gz...,imagenet-object-localization-challenge,Knowledge,21,False,0.0,5,158727.197266,158727.197266,other
4,Playground,2019-01-01 23:59:00,"[{'name': 'item_categories.csv', 'size': '3KB'...",competitive-data-science-predict-future-sales,Kudos,1181,True,0.0,6,15.463867,14.463867,csv


## Evaluation Algorithms

In [6]:
# Load the evaluation-metric metadata; the path is relative to the notebook's working directory.
metrics = pd.read_json('../metadata/metrics.json')

# isMax: True means a bigger score is better than a smaller one.
metrics

Unnamed: 0,competitionId,description,id,isMax,name,ref,requiresSameRowCountInSubmissionAndSolution,submissionFileFormatId
0,,Square root of the average of the squared natu...,8,False,Root Mean Squared Logarithmic Error,house-prices-advanced-regression-techniques,True,
1,,Percentage of correctly categorized items,14,True,Categorization Accuracy,digit-recognizer,True,
2,,Percentage of correctly categorized items,14,True,Categorization Accuracy,titanic,True,
3,,ImageNetObjectLocalization,310,False,ImageNetObjectLocalization,imagenet-object-localization-challenge,True,
4,,Square root of the average of the squared errors.,2,False,Root Mean Squared Error,competitive-data-science-predict-future-sales,True,
5,,Evaluates multi-object segmentation quality us...,316,True,IntersectionOverUnionObjectSegmentation,tgs-salt-identification-challenge,False,
6,,Evaluates multi-object segmentation quality av...,321,True,IntersectionOverUnionObjectSegmentationBeta,airbus-ship-detection,False,
7,,Square root of the average of the squared errors.,2,False,Root Mean Squared Error,new-york-city-taxi-fare-prediction,True,
8,,Weighted Area Under Receiver Operating Charact...,288,True,"Weighted AUC, with agreement check and correla...",flavours-of-physics-kernels-only,True,
9,,Percentage of correctly categorized items,14,True,Categorization Accuracy,forest-cover-type-kernels-only,True,


In [7]:
# How often each evaluation metric name occurs, most frequent first.
metrics['name'].value_counts()

Area Under Receiver Operating Characteristic Curve                    50
Categorization Accuracy                                               25
                                                                      22
Log Loss                                                              22
Root Mean Squared Error                                               22
Root Mean Squared Logarithmic Error                                   21
Multiclass Loss                                                       15
Mean Absolute Error                                                   12
Mean F-Score                                                           7
Mean Average Precision at K                                            5
Multiclass Loss (Deprecated)                                           5
Custom Evaluation Metric                                               4
Mean Columnwise Area Under Receiver Operating Characteristic Curve     4
Normalized Gini Index                              

## Leaderboard

In [8]:
# Load the leaderboard metadata and coerce 'best' to a number in one pass;
# values that cannot be parsed become NaN (errors='coerce').
leaderboard = (
    pd.read_json('../metadata/leaderboards.json')
    .assign(best=lambda frame: pd.to_numeric(frame['best'], errors='coerce'))
)

leaderboard.head()

Unnamed: 0,best,ref,top
0,0.0,house-prices-advanced-regression-techniques,"[{'teamId': 1780632, 'teamName': 'GroundTruth'..."
1,1.0,digit-recognizer,"[{'teamId': 1738338, 'teamName': 'Xianbin Guo'..."
2,1.0,titanic,"[{'teamId': 1711461, 'teamName': 'povahagn', '..."
3,0.03008,imagenet-object-localization-challenge,"[{'teamId': 1826466, 'teamName': 'Yohnkey', 's..."
4,0.85686,competitive-data-science-predict-future-sales,"[{'teamId': 1597395, 'teamName': 'Pieter Volos..."


## Merge

In [9]:
# Join the three tables on the competition reference ('ref'):
# competitions + leaderboard first, then the metric metadata.
comp_leader = competitions.merge(leaderboard, on='ref')
df = comp_leader.merge(metrics, on='ref')

df.head()

Unnamed: 0,category,deadline,files,ref,reward,teamCount,userHasEntered,rewardInDollar,fileCount,fileSizeMB,...,fileType,best,top,competitionId,description,id,isMax,name,requiresSameRowCountInSubmissionAndSolution,submissionFileFormatId
0,Getting Started,2030-01-01 00:00:00,"[{'name': 'data_description.txt', 'size': '13K...",house-prices-advanced-regression-techniques,Knowledge,4692,False,0.0,7,1.094727,...,other,0.0,"[{'teamId': 1780632, 'teamName': 'GroundTruth'...",,Square root of the average of the squared natu...,8,False,Root Mean Squared Logarithmic Error,True,
1,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '73MB', 'creati...",digit-recognizer,Knowledge,2572,False,0.0,3,122.229492,...,csv,1.0,"[{'teamId': 1738338, 'teamName': 'Xianbin Guo'...",,Percentage of correctly categorized items,14,True,Categorization Accuracy,True,
2,Getting Started,2030-01-01 00:00:00,"[{'name': 'train.csv', 'size': '60KB', 'creati...",titanic,Knowledge,10168,True,0.0,3,0.088867,...,csv,1.0,"[{'teamId': 1711461, 'teamName': 'povahagn', '...",,Percentage of correctly categorized items,14,True,Categorization Accuracy,True,
3,Research,2029-12-31 07:00:00,[{'name': 'imagenet_object_localization.tar.gz...,imagenet-object-localization-challenge,Knowledge,21,False,0.0,5,158727.197266,...,other,0.03008,"[{'teamId': 1826466, 'teamName': 'Yohnkey', 's...",,ImageNetObjectLocalization,310,False,ImageNetObjectLocalization,True,
4,Playground,2019-01-01 23:59:00,"[{'name': 'item_categories.csv', 'size': '3KB'...",competitive-data-science-predict-future-sales,Kudos,1181,True,0.0,6,15.463867,...,csv,0.85686,"[{'teamId': 1597395, 'teamName': 'Pieter Volos...",,Square root of the average of the squared errors.,2,False,Root Mean Squared Error,True,


In [10]:
# (rows, columns) of the merged frame.
df.shape

(294, 21)

### One-Hot Encoding

In [11]:

# Collect the object-typed columns as candidates for one-hot encoding.
# The builtin `object` replaces the `np.object` alias, which NumPy deprecated
# in 1.20 and removed in 1.24 (where it raises AttributeError).
categorical_features = df.select_dtypes(include=[object])
categorical_features.columns


Index(['category', 'deadline', 'files', 'ref', 'reward', 'fileType', 'top',
       'description', 'name'],
      dtype='object')

In [12]:
# One-hot encode the categorical variables; only the listed columns
# ('category' and 'fileType') are passed to get_dummies for encoding.
encoded_df = pd.get_dummies(df, columns=['category', 'fileType'])

# (rows, columns) after encoding.
encoded_df.shape

(294, 29)

## Analyse classification problems

In [13]:
# Keep competitions evaluated with metric id 5 or 14 whose best leaderboard score is not 0.
# Metric id 14 is "Categorization Accuracy" in the metrics table; id 5 is presumably AUC — TODO confirm.
classification_df = encoded_df.query('(id == 5 or id == 14) and best != 0')

# Summary statistics of the best scores in this subset.
classification_df.best.describe()

count    74.000000
mean      0.903151
std       0.111309
min       0.545710
25%       0.839338
50%       0.956440
75%       0.990195
max       1.000000
Name: best, dtype: float64

In [14]:
# Correlation of every numeric/boolean column with the best leaderboard score.
# The object-typed columns (e.g. 'ref', 'files', 'deadline') are dropped
# explicitly: pandas >= 2.0 no longer discards them silently inside
# DataFrame.corr() and raises instead.
(
    classification_df
    .select_dtypes(include=['number', 'bool'])
    .corr()['best']
    .sort_values()
)

rewardInDollar                                -0.284296
id                                            -0.141931
category_Research                             -0.084582
category_Featured                             -0.077228
category_Recruitment                          -0.056947
teamCount                                     -0.018706
fileType_csv                                  -0.011640
fileType_other                                 0.011640
category_Playground                            0.029826
fileSizeMB                                     0.034030
fileSizeWithoutTestSetMB                       0.035834
fileCount                                      0.051499
userHasEntered                                 0.058658
category_Getting Started                       0.252756
best                                           1.000000
competitionId                                       NaN
isMax                                               NaN
requiresSameRowCountInSubmissionAndSolution     

In [15]:
# Reward in Dollar
# Scatter of the best leaderboard score against the cash reward for the
# classification subset.
filtered_df = classification_df

plt.figure()
plt.title('Reward in Dollar')
plt.scatter(filtered_df['rewardInDollar'], filtered_df['best'], marker= 'o', s=20)
# Label both axes so the figure can be read on its own.
plt.xlabel('Reward (USD)')
plt.ylabel('Best leaderboard score')
plt.show()

<IPython.core.display.Javascript object>

In [20]:
# Team Count
# Scatter of the best leaderboard score against the number of participating
# teams for the classification subset.
filtered_df = classification_df

plt.figure()
plt.title('Team Count')
plt.scatter(filtered_df['teamCount'], filtered_df['best'], marker= 'o', s=20)
# Label both axes so the figure can be read on its own.
plt.xlabel('Number of teams')
plt.ylabel('Best leaderboard score')
plt.show()

<IPython.core.display.Javascript object>

141    0.94313
238    0.83196
247    0.84248
Name: best, dtype: float64