In [1]:
import pandas as pd
import numpy as np
from clean_pandas import CleanPandas
from faker import Faker
from collections import defaultdict
import unicodecsv as csv
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder

In [2]:
#### load data
df = pd.read_csv('data/ufc-master.csv')

#### Task Hyperparameters
name_cols = ['R_fighter', 'B_fighter'] # columns that will be pseudo
encr_cols = ['date', 'location'] #columns to be encrypted
drop_cols = ['B_Featherweight_rank', 'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank', 'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank', 'B_Welterweight_rank', 'R_td_attempted_bout','B_td_attempted_bout', 'R_td_pct_bout', 'B_td_pct_bout', 'R_sub_attempts_bout', 'B_sub_attempts_bout', 'R_pass_bout', 'B_pass_bout', 'R_rev_bout', 'B_rev_bout'] #columns to be dropped entirely
gen_cols = ['R_odds', 'B_odds', 'R_ev', 'B_ev']#columns to be generalized, numbers
enc_cols = ['Winner', 'country', 'weight_class', 'gender'] #columns to be numerically encoded* has to be declared in line below, too
encode_cols = [(['gender'], LabelEncoder()), (['Winner'], LabelEncoder()), (['country'], LabelEncoder()), (['weight_class'], LabelEncoder())]#columns to be numerically encoded* has to be declared in line above, too
df #print df so that we can select columns

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
0,Robert Whittaker,Darren Till,-130,107,76.923077,107.000000,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,,,,,,
1,Mauricio Rua,Rogerio Nogueira,-190,150,52.631579,150.000000,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,,,,,,
2,Fabricio Werdum,Alexander Gustafsson,260,-335,260.000000,29.850746,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,,,,,,
3,Carla Esparza,Marina Rodriguez,145,-182,145.000000,54.945055,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,,,,,,
4,Paul Craig,Gadzhimurad Antigulov,-137,110,72.992701,110.000000,7/25/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Red,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,Duane Ludwig,Darren Elkins,-155,135,64.516129,135.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,...,0.0,2.0,0.00,0.5,0.0,0.0,0.0,0.0,0.0,0.0
4303,John Howard,Daniel Roberts,-210,175,47.619048,175.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,1.0,1.0,1.00,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4304,Brendan Schaub,Chase Gormley,-260,220,38.461538,220.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,0.0,2.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4305,Mike Pierce,Julio Paulino,-420,335,23.809524,335.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,10.0,0.0,0.60,0.0,0.0,0.0,6.0,0.0,0.0,0.0


In [3]:
def anon_df(df, cols, func):
    
    """
    df is dataframe to anonymize, cols is ['col1', 'col2'...etc] and func is one of the functions using faker defined below
    the functions will be applied to the columns.
    """
    
    for name in cols:
        keys = {cat: i for i, cat in enumerate (df[name].unique())}
        df[name] = df[name].map(func)
    return df

In [4]:
def generalize(df, col):
    
    """
    df is dataframe, col is columns. for each column the bins(spans) are calculated by taking min and max
    and then generating 10 evenly distributed numbers in between. Every value of column is then sorted into its corresponding span
    """
    for i in range(len(col)):
        bins = {}
        min_span = df[col[i]].min()
        max_span = df[col[i]].max()
        bins = np.linspace(min_span, max_span, num=10, endpoint=True)
        df[col[i]] = pd.cut(x=df[col[i]], bins=bins)
    return df

In [5]:
def full_name_pseud(names): #if names occur in full name format
    if names not in key:
        pseud = fake.name()
        while (pseud in key.values()) or (pseud in key):
            pseud = fake.name()
        key[names] = pseud
        return pseud
    else:
        return key[names]

def first_name_pseud(names): #if names occur as first name
    if names not in key:
        pseud = fake.first_name()
        while (pseud in key.values()) or (pseud in key):
            pseud = fake.first_name()
        key[names] = pseud
        return pseud
    else:
        return key[names]

def last_name_pseud(names): #if names occur as last name
    if names not in key:
        pseud = fake.last_name()
        while (pseud in key.values()) or (pseud in key):
            pseud = fake.last_name()
        key[names] = pseud
        return pseud
    else:
        return key[names]


In [6]:
##### Encoding of Attributes
encode_map = DataFrameMapper(encode_cols, df_out = True)
encoded_cols = encode_map.fit_transform(df.copy())
df = pd.concat([df.drop(columns=enc_cols), encoded_cols], axis='columns')


##### Pseudonymization
key = {}
fake = Faker()#this also takes locale from faker, so 'en_US', 'ja_JP' or 'de_DE' is possible


anon_df(df, name_cols, full_name_pseud) 
"""apply faker pseudonymization: https://faker.readthedocs.io/en/master/providers/baseprovider.html
in theory this can be done with any faker provider, but in our case we do not want to generate fake numerical data, only maybe names
"""


#### Generalizing numbers into ranges
generalize(df, gen_cols)


#### Encryption of Attributes
result_df, encryption_key, dtype_dict = df.clean_pandas.encrypt(encr_cols) #encrypt

  y = column_or_1d(y, warn=True)


In [7]:
### Drop columns not needed
result_df.drop(columns=drop_cols, inplace=True)

In [8]:
#### Decryption. Passing encr_cols decrypts everything, but indexing works
#result_df.clean_pandas.decrypt(encr_cols[1], encryption_key, dtype_dict)[encr_cols[1]]

In [9]:
#### Final data

result_df


Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,title_bout,no_of_rounds,...,R_tot_str_landed_bout,B_tot_str_landed_bout,R_tot_str_attempted_bout,B_tot_str_attempted_bout,R_td_landed_bout,B_td_landed_bout,gender,Winner,country,weight_class
0,Marilyn Murphy,Joseph Howell,"(-325.0, -50.0]","(-88.889, 188.889]","(5.882, 91.34]","(8.333, 151.852]",b'gAAAAABfJHj8234GpUdN4Hgme-lKBi-1p-UR57eFUlP3...,b'gAAAAABfJHj8EXjoEjGHND7V4P-U0CT1DPHrFfv6MPBN...,False,5,...,,,,,,,1,1,27,7
1,John West,Javier Young,"(-325.0, -50.0]","(-88.889, 188.889]","(5.882, 91.34]","(8.333, 151.852]",b'gAAAAABfJHj8234GpUdN4Hgme-lKBi-1p-UR57eFUlP3...,b'gAAAAABfJHj8EXjoEjGHND7V4P-U0CT1DPHrFfv6MPBN...,False,3,...,,,,,,,1,1,27,5
2,Antonio Clark,Robert Cooper,"(225.0, 500.0]","(-366.667, -88.889]","(176.797, 262.255]","(8.333, 151.852]",b'gAAAAABfJHj8234GpUdN4Hgme-lKBi-1p-UR57eFUlP3...,b'gAAAAABfJHj8EXjoEjGHND7V4P-U0CT1DPHrFfv6MPBN...,False,3,...,,,,,,,1,1,27,4
3,Jeffrey Downs,Margaret Edwards DDS,"(-50.0, 225.0]","(-366.667, -88.889]","(91.34, 176.797]","(8.333, 151.852]",b'gAAAAABfJHj8234GpUdN4Hgme-lKBi-1p-UR57eFUlP3...,b'gAAAAABfJHj8EXjoEjGHND7V4P-U0CT1DPHrFfv6MPBN...,False,3,...,,,,,,,0,1,27,12
4,Erica Daniel,James Davis,"(-325.0, -50.0]","(-88.889, 188.889]","(5.882, 91.34]","(8.333, 151.852]",b'gAAAAABfJHj8234GpUdN4Hgme-lKBi-1p-UR57eFUlP3...,b'gAAAAABfJHj8EXjoEjGHND7V4P-U0CT1DPHrFfv6MPBN...,False,3,...,,,,,,,1,1,27,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,Matthew Robinson,Jason Oliver,"(-325.0, -50.0]","(-88.889, 188.889]","(5.882, 91.34]","(8.333, 151.852]",b'gAAAAABfJHj8pEBfdyvy1XlJxDeYJs1ihezik9N_g05p...,b'gAAAAABfJHj8ISMCbrgUApASTQ49VGYrPMDR4nZWmeyw...,False,3,...,1.0,0.0,5.0,2.0,0.0,1.0,1,0,21,6
4303,Ethan Miller,Cameron Brown,"(-325.0, -50.0]","(-88.889, 188.889]","(5.882, 91.34]","(151.852, 295.37]",b'gAAAAABfJHj8pEBfdyvy1XlJxDeYJs1ihezik9N_g05p...,b'gAAAAABfJHj8ISMCbrgUApASTQ49VGYrPMDR4nZWmeyw...,False,3,...,8.0,21.0,11.0,33.0,1.0,1.0,1,1,21,8
4304,Jesse Bennett,Regina Medina,"(-325.0, -50.0]","(188.889, 466.667]","(5.882, 91.34]","(151.852, 295.37]",b'gAAAAABfJHj8pEBfdyvy1XlJxDeYJs1ihezik9N_g05p...,b'gAAAAABfJHj8ISMCbrgUApASTQ49VGYrPMDR4nZWmeyw...,False,3,...,30.0,2.0,41.0,6.0,0.0,0.0,1,1,21,4
4305,Michelle Peters,Brandy Martin,"(-600.0, -325.0]","(188.889, 466.667]","(5.882, 91.34]","(295.37, 438.889]",b'gAAAAABfJHj8pEBfdyvy1XlJxDeYJs1ihezik9N_g05p...,b'gAAAAABfJHj8ISMCbrgUApASTQ49VGYrPMDR4nZWmeyw...,False,3,...,90.0,31.0,131.0,76.0,6.0,0.0,1,1,21,8
