In [None]:
from nbdev import *
# default_exp utility

# Utility functions
> Basic functions that didn't fit anywhere else

## Multiprocessing

In [None]:
# export
import multiprocessing as mp

Python's `multiprocessing` module offers an easy way to apply a function to every element of a list in parallel. Unfortunately, the Jupyter environment (on Windows?) does not support this feature (yet?). To actually use multiprocessing, we therefore have to write a traditional `.py` script that calls the function below.

In [None]:
# exports
def run_in_parallel(func, iterable, n_workers=6):
    """Apply `func` to every item of `iterable` using a pool of worker processes.

    Parameters
    ----------
    func : callable
        Function of one argument. It is sent to the worker processes, so it
        has to be picklable (e.g. a function defined at module level).
    iterable : iterable
        Items handed to `func`, one item per call.
    n_workers : int, default 6
        Number of worker processes in the pool.

    Returns
    -------
    list
        The results of `func`, in the same order as the items of `iterable`.
    """
    # `pool.map` blocks until every result is available, so it is safe to let
    # the context manager tear the pool down right afterwards. Without this the
    # worker processes were never released.
    with mp.Pool(processes=n_workers) as pool:
        return pool.map(func, iterable)

## Pandas


In [None]:
# export
import pandas as pd

In [None]:
# Left-hand example frame used by the merge demos in this section.
original = pd.DataFrame([[1,2,3], [4,5,6]], columns=['A','B','C'])
original

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


In [None]:
# Right-hand example frame: shares the column names A and B with `original`
# and brings two extra columns, E and F.
additional = pd.DataFrame([[1,2,9,10], [4,5,13,14], [4,6,13,14]], columns=['A','B','E','F'])
additional.head()

Unnamed: 0,A,B,E,F
0,1,2,9,10
1,4,5,13,14
2,4,6,13,14


In [None]:
# export
def overlapping_cols(df1, df2):
    """Return the column labels of `df1` that also occur in `df2`.

    The labels are returned as a list, in the order they appear in `df1`.
    """
    shared = []
    for label in df1.columns:
        if label in df2.columns:
            shared.append(label)
    return shared


def merge_with_prefix(original, additional, prefix, left_on=None, right_on=None, drop_additional=False):
    """Merge `additional` into `original`, prefixing the columns it contributes.

    Every column of `additional` that is not one of its merge keys is renamed
    to `prefix + <column name>` before the two frames are combined with
    `pd.merge` (using pandas' default join type). Neither input frame is
    modified.

    Parameters
    ----------
    original : pd.DataFrame
        Left frame; its column names are kept as they are.
    additional : pd.DataFrame
        Right frame; its non-key columns receive `prefix`.
    prefix : str
        Text put in front of the non-key column names of `additional`.
    left_on, right_on : label or list of labels, optional
        Key columns in `original` and `additional` respectively. Leave both
        as None to merge on every column name the two frames share, or give
        both.
    drop_additional : bool, default False
        If True, drop the key columns of `additional` whose names are not
        also keys of `original` from the result.

    Returns
    -------
    pd.DataFrame
        The merged frame.

    Raises
    ------
    AssertionError
        If exactly one of `left_on` / `right_on` is given.
    """
    assert (left_on is None) == (right_on is None), "Leave both left_on and right_on as None to merge over all columns with the same name or specify both."

    if right_on is None:
        left_on = overlapping_cols(original, additional)
        right_on = left_on

    # `pd.merge` accepts a single label as a key, but the membership tests
    # below need a collection of labels: on a bare string, `col in right_on`
    # is a substring test and iterating it yields characters. Wrap scalar keys
    # so they are handled as one-element key lists.
    if not pd.api.types.is_list_like(left_on):
        left_on = [left_on]
    if not pd.api.types.is_list_like(right_on):
        right_on = [right_on]

    col_renamer = {col: prefix + col for col in additional.columns if col not in right_on}
    to_merge = additional.rename(col_renamer, axis='columns')

    merged = pd.merge(original, to_merge, left_on=left_on, right_on=right_on)

    if drop_additional:
        # Right-hand keys with a different name than the left-hand keys show
        # up as extra columns holding the same values; remove those.
        to_drop = [col for col in right_on if col not in left_on]
        merged = merged.drop(columns=to_drop)

    return merged

In [None]:
# No keys given: the merge runs over every shared column name (A and B here),
# so only E and F receive the prefix.
merge_with_prefix(original, additional, 'home_')

Unnamed: 0,A,B,C,home_E,home_F
0,1,2,3,9,10
1,4,5,6,13,14


In [None]:
# Merging on A only: B of `additional` is no longer a key, so it is kept and
# shows up prefixed as one_B.
merge_with_prefix(original, additional, 'one_', left_on=['A'], right_on=['A'])

Unnamed: 0,A,B,C,one_B,one_E,one_F
0,1,2,3,2,9,10
1,4,5,6,5,13,14
2,4,5,6,6,13,14


In [None]:
# Naming both shared columns explicitly yields the same rows as the call
# without keys, only with a different prefix.
merge_with_prefix(original, additional, 'two_', left_on=['A', 'B'], right_on=['A', 'B'])

Unnamed: 0,A,B,C,two_E,two_F
0,1,2,3,9,10
1,4,5,6,13,14


In [None]:
# Display `original` again to check that the merges above did not modify it.
original

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


In [None]:
# Example frame that shares no column name with `original`.
different_names = pd.DataFrame([[4,7,8]], columns=['D', 'E', 'F'])
different_names

Unnamed: 0,D,E,F
0,4,7,8


In [None]:
# Keys with different names: the right-hand key D stays in the result,
# unprefixed, next to the left-hand key A.
merge_with_prefix(original, different_names, 'other_', left_on=['A'], right_on=['D'])

Unnamed: 0,A,B,C,D,other_E,other_F
0,4,5,6,4,7,8


In [None]:
# drop_additional=True removes the right-hand key column D from the result.
merge_with_prefix(original, different_names, 'other_', left_on=['A'], right_on=['D'], drop_additional=True)

Unnamed: 0,A,B,C,other_E,other_F
0,4,5,6,7,8


## Translation with SequenceMatcher

In [None]:
# export
import numpy as np
from difflib import SequenceMatcher

In [None]:
# Two spellings of the same kind of names: `actuals` are the names to translate,
# `targets` the candidate names they are matched against.
actuals = ['Bayern Munich', 'Wolfsburg', 'Schalke', 'Nurnberg', 'Mainz', 'Hansa Rostock', 'B. Monchengladbach', 'Bochum', 'Hannover', 'Kaiserslautern', 'VfB Stuttgart', 'Hamburger SV', 'Freiburg', 'Bayer Leverkusen', 'Dortmund', 'Arminia Bielefeld', 'Hertha Berlin', 'Werder Bremen', 'Duisburg', 'Eintracht Frankfurt', 'Alemannia Aachen', 'Energie Cottbus', 'Karlsruher', 'Hoffenheim', 'Augsburg', 'Greuther Furth', 'Dusseldorf', 'Braunschweig', 'Paderborn', 'Köln']
targets = ['SV Werder', 'Leverkusen', 'FC Bayern', 'Hamburger SV', 'Schalke 04', '1.FC Köln', 'Dortmund', 'Hannover 96', 'Hertha BSC', 'Wolfsburg', 'Duisburg', 'VfB Stuttgart', "K'lautern", 'Mainz 05', 'Bielefeld', 'Frankfurt', 'Nürnberg', "M'gladbach", 'Bochum', 'Cottbus', 'Aachen', 'Karlsruhe', 'Rostock', 'Hoffenheim', 'SC Freiburg', 'St. Pauli', 'Augsburg', 'Düsseldorf', 'Gr. Fürth', 'Braunschweig', 'Paderborn', 'Ingolstadt', 'Darmstadt', 'RB Leipzig', 'Union Berlin']

In [None]:
# export
def similar_score(a, b):
    """Return the `difflib.SequenceMatcher` similarity ratio of `a` and `b`.

    The ratio is a float between 0 (nothing in common) and 1 (identical).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def similar_n(actual, targets, n=5):
    """Return the `n` entries of `targets` most similar to `actual`.

    Parameters
    ----------
    actual : str
        The name to find matches for.
    targets : sequence of str
        Candidate names; must support integer indexing.
    n : int, default 5
        Maximum number of candidates to return.

    Returns
    -------
    list
        Up to `n` entries of `targets`, best match first. Empty if `n` is
        not positive or `targets` is empty.
    """
    # Guard needed because a slice of `[-0:]` selects *everything*, so n=0
    # would otherwise return all targets instead of none.
    if n <= 0:
        return []
    scores = np.array([similar_score(actual, target) for target in targets])
    # argsort is ascending: take the last n positions and flip them so the
    # highest score comes first.
    best_first = scores.argsort()[-n:][::-1]
    return [targets[idx] for idx in best_first]

In [None]:
# The name used as the query in the next cell.
actuals[0]

'Bayern Munich'

In [None]:
# The five closest targets (n defaults to 5), best match first.
similar_n(actuals[0], targets)

['FC Bayern', 'Braunschweig', 'Paderborn', "K'lautern", 'Hannover 96']

In [None]:
# export
def create_translation(actuals, targets):
    """Map every name in `actuals` to its single best match among `targets`.

    Returns a dict keyed by the entries of `actuals`; each value is the first
    result of `similar_n(..., n=1)` for that entry.
    """
    mapping = {}
    for name in actuals:
        best_match = similar_n(name, targets, n=1)[0]
        mapping[name] = best_match
    return mapping

In [None]:
# Build the full mapping and display it so every pairing can be checked by eye.
translation = create_translation(actuals, targets)
translation

{'Bayern Munich': 'FC Bayern',
 'Wolfsburg': 'Wolfsburg',
 'Schalke': 'Schalke 04',
 'Nurnberg': 'Nürnberg',
 'Mainz': 'Mainz 05',
 'Hansa Rostock': 'Rostock',
 'B. Monchengladbach': "M'gladbach",
 'Bochum': 'Bochum',
 'Hannover': 'Hannover 96',
 'Kaiserslautern': "K'lautern",
 'VfB Stuttgart': 'VfB Stuttgart',
 'Hamburger SV': 'Hamburger SV',
 'Freiburg': 'SC Freiburg',
 'Bayer Leverkusen': 'Leverkusen',
 'Dortmund': 'Dortmund',
 'Arminia Bielefeld': 'Bielefeld',
 'Hertha Berlin': 'Hertha BSC',
 'Werder Bremen': 'SV Werder',
 'Duisburg': 'Duisburg',
 'Eintracht Frankfurt': 'Frankfurt',
 'Alemannia Aachen': 'Aachen',
 'Energie Cottbus': 'Cottbus',
 'Karlsruher': 'Karlsruhe',
 'Hoffenheim': 'Hoffenheim',
 'Augsburg': 'Augsburg',
 'Greuther Furth': 'Gr. Fürth',
 'Dusseldorf': 'Düsseldorf',
 'Braunschweig': 'Braunschweig',
 'Paderborn': 'Paderborn',
 'Köln': '1.FC Köln'}

In [None]:
# export
def reverse_dict(d):
    """Return a new dict with the keys and values of `d` swapped.

    If several keys of `d` share a value, the key that comes last in `d`
    is the one kept for that value.
    """
    flipped = {}
    for original_key, original_value in d.items():
        flipped[original_value] = original_key
    return flipped

In [None]:
# Invert the mapping so the target names look up the original names.
reverse_dict(translation)

{'FC Bayern': 'Bayern Munich',
 'Wolfsburg': 'Wolfsburg',
 'Schalke 04': 'Schalke',
 'Nürnberg': 'Nurnberg',
 'Mainz 05': 'Mainz',
 'Rostock': 'Hansa Rostock',
 "M'gladbach": 'B. Monchengladbach',
 'Bochum': 'Bochum',
 'Hannover 96': 'Hannover',
 "K'lautern": 'Kaiserslautern',
 'VfB Stuttgart': 'VfB Stuttgart',
 'Hamburger SV': 'Hamburger SV',
 'SC Freiburg': 'Freiburg',
 'Leverkusen': 'Bayer Leverkusen',
 'Dortmund': 'Dortmund',
 'Bielefeld': 'Arminia Bielefeld',
 'Hertha BSC': 'Hertha Berlin',
 'SV Werder': 'Werder Bremen',
 'Duisburg': 'Duisburg',
 'Frankfurt': 'Eintracht Frankfurt',
 'Aachen': 'Alemannia Aachen',
 'Cottbus': 'Energie Cottbus',
 'Karlsruhe': 'Karlsruher',
 'Hoffenheim': 'Hoffenheim',
 'Augsburg': 'Augsburg',
 'Gr. Fürth': 'Greuther Furth',
 'Düsseldorf': 'Dusseldorf',
 'Braunschweig': 'Braunschweig',
 'Paderborn': 'Paderborn',
 '1.FC Köln': 'Köln'}

## Export

In [None]:
#hide
# Import only the function this cell calls instead of pulling every public
# name of nbdev.export into the notebook namespace.
from nbdev.export import notebook2script
notebook2script()

Converted 00_scraping.ipynb.
Converted 01_utility.ipynb.
Converted 02_data.ipynb.
Converted 03_sportde.ipynb.
Converted 04_feature_engineering.ipynb.
Converted index.ipynb.
