# Bonus C：Pandas的效能、除錯與測試

## C.1 轉換資料

In [1]:
import pandas as pd
import numpy as np
import zipfile
pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)
url = 'data/kaggle-survey-2018.zip'

with zipfile.ZipFile(url) as z:
    print(z.namelist())
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

['multipleChoiceResponses.csv', 'freeFormResponses.csv', 'SurveySchema.csv']


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
df.T

Unnamed: 0,1,2,...,23858,23859
Time from Start to Finish (seconds),710,434,...,36,502
Q1,Female,Male,...,Male,Male
Q1_OTHER_TEXT,-1,-1,...,-1,-1
Q2,45-49,30-34,...,25-29,25-29
Q3,United S...,Indonesia,...,United K...,Spain
...,...,...,...,...,...
Q50_Part_5,,,...,,
Q50_Part_6,,,...,,
Q50_Part_7,,,...,,
Q50_Part_8,,,...,,


In [3]:
df.dtypes

Time from Start to Finish (seconds)    object
Q1                                     object
Q1_OTHER_TEXT                          object
Q2                                     object
Q3                                     object
                                        ...  
Q50_Part_5                             object
Q50_Part_6                             object
Q50_Part_7                             object
Q50_Part_8                             object
Q50_OTHER_TEXT                         object
Length: 395, dtype: object

In [4]:
df.dtypes.value_counts(dropna=False)

object    395
dtype: int64

In [5]:
df.Q1.value_counts(dropna=False)

Male                       19430
Female                      4010
Prefer not to say            340
Prefer to self-describe       79
Name: Q1, dtype: int64

In [6]:
def tweak_kag(df):
    na_mask = df.Q9.isna()
    hide_mask = df.Q9.str.startswith('I do not').fillna(False)
    df = df[~na_mask & ~hide_mask]


    q1 = (df.Q1.replace({'Prefer not to say': 'Another',
                         'Prefer to self-describe': 'Another'})
               .rename('Gender'))
    
    q2 = df.Q2.str.slice(0,2).astype(int).rename('Age')
    
    def limit_countries(val):
        if val in  {'United States of America', 'India', 'China'}:
            return val
        return 'Another'
    q3 = df.Q3.apply(limit_countries).rename('Country')

    q4 = (df.Q4.replace({'Master’s degree': 18,
                         'Bachelor’s degree': 16,
                         'Doctoral degree': 20,
                         'Some college/university study without \
                          earning a bachelor’s degree': 13,
                         'Professional degree': 19,
                         'I prefer not to answer': None,
                         'No formal education past high school': 12})
               .fillna(11)
               .rename('Edu')
    )



    def only_cs_stat_val(val):
        if val not in {'cs', 'eng', 'stat'}:
            return 'another'
        return val
    q5 = (df.Q5
            .replace({
                'Computer science (software engineering, etc.)': 'cs',
                'Engineering (non-computer focused)': 'eng',
                'Mathematics or statistics': 'stat'})
             .apply(only_cs_stat_val)
             .rename('Studies'))
    
    def limit_occupation(val):
        if val in {'Student', 'Data Scientist', 'Software Engineer', 'Not employed',
                  'Data Engineer'}:
            return val
        return 'Another'
    q6 = df.Q6.apply(limit_occupation).rename('Occupation')

    q8 = (df.Q8
      .str.replace('+', '')
      .str.split('-', expand=True)
      .iloc[:,0]
      .fillna(-1)
      .astype(int)
      .rename('Experience')
    )


    q9 = (df.Q9
     .str.replace('+','')
     .str.replace(',','')
     .str.replace('500000', '500')
     .str.replace('I do not wish to disclose my approximate yearly compensation','')
     .str.split('-', expand=True)
     .iloc[:,0]
     .astype(int)
     .mul(1000)
     .rename('Salary'))
    return pd.concat([q1, q2, q3, q4, q5, q6, q8, q9], axis=1)

In [7]:
tweak_kag(df)

Unnamed: 0,Gender,Age,...,Experience,Salary
2,Male,30,...,5,10000
3,Female,30,...,0,0
5,Male,22,...,0,0
7,Male,35,...,10,10000
8,Male,18,...,0,0
...,...,...,...,...,...
23844,Male,30,...,10,90000
23845,Male,22,...,0,0
23854,Male,30,...,5,10000
23855,Male,45,...,5,250000


In [8]:
tweak_kag(df).dtypes

Gender        object
Age            int32
Country       object
Edu           object
Studies       object
Occupation    object
Experience     int32
Salary         int32
dtype: object

In [9]:
kag = tweak_kag(df)
(kag.groupby('Country')
    .apply(lambda g: g.Salary.corr(g.Experience)))

Country
Another                     0.289827
China                       0.252974
India                       0.167335
United States of America    0.354125
dtype: float64

## C.2 apply()方法的效能

In [10]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
        return val
     return 'Another'

In [11]:
q3 = df.Q3.apply(limit_countries)

In [12]:
%%timeit
q3 = df.Q3.apply(limit_countries)

6.07 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
other_values = df.Q3.value_counts().iloc[3:].index
q3_2 = df.Q3.replace(other_values, 'Another')

In [14]:
%%timeit
other_values = df.Q3.value_counts().iloc[3:].index
q3_2 = df.Q3.replace(other_values, 'Another')

35.7 ms ± 2.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
values = {'United States of America', 'India', 'China'}
q3_3 = df.Q3.where(df.Q3.isin(values), 'Another')

In [16]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_3 = df.Q3.where(df.Q3.isin(values), 'Another')

3.23 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
values = {'United States of America', 'India', 'China'}
q3_4 = pd.Series(np.where(df.Q3.isin(values), df.Q3, 'Another'), index=df.index)

In [18]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_4 = pd.Series(np.where(df.Q3.isin(values), df.Q3, 'Another'), index=df.index)

2.98 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
q3.equals(q3_2)

True

In [20]:
q3.equals(q3_3)

True

In [21]:
q3.equals(q3_4)

True

In [22]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
        return val
     return 'Another'

In [23]:
q3 = df.Q3.apply(limit_countries).rename('Country')

In [24]:
def debug(something):
    print(type(something), something)
    1/0

In [25]:
#q3.apply(debug)

In [26]:
the_item = None
def debug(something):
    global the_item
    the_item = something
    return something

_ = q3.apply(debug)
the_item

'Another'

## C.3 提高apply()的效能

In [27]:
!pip install pandarallel



In [28]:
# from pandarallel import pandarallel
# pandarallel.initialize()

In [29]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
        return val
     return 'Another'

In [30]:
# %%timeit
# res_p = df.Q3.parallel_apply(limit_countries).rename('Country')

In [31]:
!pip install swifter



In [32]:
import swifter

In [33]:
%%timeit
res_s = df.Q3.swifter.apply(limit_countries).rename('Country')

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/23859 [00:00<?, ?it/s]

89 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
!pip install dask



In [35]:
import dask

In [36]:
%%timeit
res_d = (dask.dataframe.from_pandas(df, npartitions=4)
                       .map_partitions(lambda df: df.Q3.apply(limit_countries))
                       .rename('Countries'))

701 ms ± 32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
np_fn = np.vectorize(limit_countries)

In [38]:
%%timeit
res_v = df.Q3.apply(np_fn).rename('Country')

460 ms ± 47.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
from numba import jit

In [40]:
@jit
def limit_countries2(val):
     if val in  ['United States of America', 'India', 'China']:
        return val
     return 'Another'

In [41]:
%%timeit
res_n = df.Q3.apply(limit_countries2).rename('Country')

87.4 ms ± 6.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## C.4 快速檢視程式碼的技巧

In [42]:
import zipfile
url = 'data/kaggle-survey-2018.zip'

with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [43]:
df.Q3.apply?

In [44]:
df.Q3.apply??

In [45]:
import pandas.core.series
pandas.core.series.lib

<module 'pandas._libs.lib' from 'C:\\Users\\Admin\\anaconda3\\lib\\site-packages\\pandas\\_libs\\lib.cp38-win_amd64.pyd'>

In [46]:
pandas.core.series.lib.map_infer??

## C.5 在Jupyter中除錯

In [47]:
import zipfile
url = 'data/kaggle-survey-2018.zip'

with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

In [48]:
def add1(x):
    return x + 1

#df.Q3.apply(add1)

In [49]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


In [51]:
from IPython.core.debugger import set_trace

def add1(x):
    set_trace()
    return x + 1

df.Q3.apply(add1)

> [1;32m<ipython-input-51-cb997d0cb281>[0m(5)[0;36madd1[1;34m()[0m
[1;32m      3 [1;33m[1;32mdef[0m [0madd1[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m----> 5 [1;33m    [1;32mreturn[0m [0mx[0m [1;33m+[0m [1;36m1[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      6 [1;33m[1;33m[0m[0m
[0m[1;32m      7 [1;33m[0mdf[0m[1;33m.[0m[0mQ3[0m[1;33m.[0m[0mapply[0m[1;33m([0m[0madd1[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> exit


BdbQuit: 

## C.4 以Great Expectations來管理資料完整性

In [52]:
kag = tweak_kag(df)

In [53]:
!pip install great_expectations



In [54]:
import great_expectations as ge
kag_ge = ge.from_pandas(kag)

In [55]:
sorted([x for x in set(dir(kag_ge)) - set(dir(kag))
    if not x.startswith('_')])

['add_citation',
 'append_expectation',
 'attempt_allowing_relative_error',
 'autoinspect',
 'batch_id',
 'batch_kwargs',
 'batch_markers',
 'batch_parameters',
 'column_aggregate_expectation',
 'column_map_expectation',
 'column_pair_map_expectation',
 'discard_failing_expectations',
 'edit_expectation_suite',
 'expect_column_bootstrapped_ks_test_p_value_to_be_greater_than',
 'expect_column_chisquare_test_p_value_to_be_greater_than',
 'expect_column_distinct_values_to_be_in_set',
 'expect_column_distinct_values_to_contain_set',
 'expect_column_distinct_values_to_equal_set',
 'expect_column_kl_divergence_to_be_less_than',
 'expect_column_max_to_be_between',
 'expect_column_mean_to_be_between',
 'expect_column_median_to_be_between',
 'expect_column_min_to_be_between',
 'expect_column_most_common_value_to_be_in_set',
 'expect_column_pair_cramers_phi_value_to_be_less_than',
 'expect_column_pair_values_A_to_be_greater_than_B',
 'expect_column_pair_values_to_be_equal',
 'expect_column_pair_

In [56]:
kag_ge.expect_column_to_exist('Salary')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {},
  "meta": {}
}

In [57]:
kag_ge.expect_column_mean_to_be_between(
   'Salary', min_value=10_000, max_value=100_000)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": 43869.66102793441,
    "element_count": 15429,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [58]:
kag_ge.expect_column_values_to_be_between(
   'Salary', min_value=0, max_value=500_000)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [59]:
kag_ge.expect_column_values_to_not_be_null('Salary')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 15429,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [60]:
kag_ge.expect_column_values_to_match_regex(
    'Country', r'America|India|Another|China')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [61]:
kag_ge.expect_column_values_to_be_of_type(
   'Salary', type_='int')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": "int32"
  },
  "meta": {}
}

In [62]:
kag_ge.save_expectation_suite('kaggle_expectations.json')

In [63]:
kag_ge.to_csv('kag.csv')
import json
ge.validate(ge.read_csv('kag.csv'), 
    expectation_suite=json.load(
        open('kaggle_expectations.json')))

{
  "results": [
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": true,
      "result": {},
      "meta": {},
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "Salary"
        },
        "expectation_type": "expect_column_to_exist"
      }
    },
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": true,
      "result": {
        "observed_value": 43869.66102793441,
        "element_count": 15429,
        "missing_count": null,
        "missing_percent": null
      },
      "meta": {},
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "Salary",
          "max_value": 100000,
          "min_value": 10000
        },
        "expectation_type": "expect_column_mean_to_be_between"
      }
