#Dictionaries in Python

📙 A Python dictionary is an ordered collection of `key:value` pairs. They're very similar to an ordinary dictionary in which the key is 'Apple' and the value is it's definition 'the round fruit of a tree of the rose family, which typically has thin green or red skin and crisp flesh'. Python dictionaries are useful as they allow us to store and easily access important data.

## Data Science Example

Below I give an example of how I use dictionaries everyday as a Data Scientist; that is, using dictionaries to store a model's hyperparameters.

In [76]:
# import the data
import pandas as pd

df = pd.read_csv('Breast_cancer_data.csv')
df.shape

(569, 6)

In [77]:
# get some quick, high-level information about the data
print('Name , type , unique levels , missing count ')
print('')
for col in df.columns:
  print(col, ',', df[col].dtype, ',', df[col].nunique(dropna=False), ',', df[col].isna().sum())

Name , type , unique levels , missing count 

mean_radius , float64 , 456 , 0
mean_texture , float64 , 479 , 0
mean_perimeter , float64 , 522 , 0
mean_area , float64 , 539 , 0
mean_smoothness , float64 , 474 , 0
diagnosis , int64 , 2 , 0


In [78]:
# Check distribution of dependent variable
df['diagnosis'].value_counts(dropna=False)

diagnosis
1    357
0    212
Name: count, dtype: int64

In [79]:
# Split data into train and test
from sklearn.model_selection import train_test_split

df_x = df.drop('diagnosis', axis=1)
df_y = df['diagnosis']

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.3, random_state = 37)

In [80]:
# train model
import lightgbm as lgb

m = lgb.LGBMClassifier(random_state=37, verbose=-1)
m.fit(x_train, y_train)

# predict the results
y_pred = m.predict(x_test)

In [81]:
# check some performance metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

cm = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', cm)

LightGBM Model accuracy score: 0.9181

Confusion matrix:
 [[ 49  10]
 [  4 108]]


In [82]:
# access the model's hyperparameters - they're stored in a Python dictionary
hyperparameters = m.get_params(deep=True)
hyperparameters

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 37,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'verbose': -1}

In [83]:
# accessing certain keys in the hyperparameter dictionary
key = 'num_leaves'
print(f'Key: {key}')
print(f'Value: {hyperparameters[key]}')

Key: num_leaves
Value: 31


In [84]:
# writing the hyperparameter dictionary to a yaml file
import yaml

with open('hyperparameters.yaml', 'w') as outfile:
    yaml.dump(hyperparameters, outfile, default_flow_style=False)

In [85]:
# reading yaml file
with open("hyperparameters.yaml", 'r') as hp_yaml:
    hyp_params = yaml.safe_load(hp_yaml)

In [86]:
# make an update to the hyperparameters
hyp_params2 = { **hyp_params, 'boosting_type': 'goss', 'random_state': '38'}
hyp_params2

{'boosting_type': 'goss',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': '38',
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'verbose': -1}

In [87]:
# training a model using these hyperparameters
m2 = lgb.LGBMClassifier(**hyp_params2)
m2.fit(x_train, y_train)

# predict the results
y_pred2 = m2.predict(x_test)

accuracy = accuracy_score(y_pred2, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred2)))

cm = confusion_matrix(y_test, y_pred2)
print('\nConfusion matrix:\n', cm)

hyperparameters2 = m2.get_params(deep=True)
print('\nhyperparameters:')
hyperparameters2

LightGBM Model accuracy score: 0.9298

Confusion matrix:
 [[ 51   8]
 [  4 108]]

hyperparameters:


{'boosting_type': 'goss',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': '38',
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'verbose': -1}

## Other Dictionary Hacks

In [88]:
# creating a dictionary from scratch

my_dict = dict(model1 = dict(boosting_type = 'goss',
                  lambda_11 = 0.1,
                  lambda_12 = 0,
                  learning_rate = 0.1,
                  max_depth = 4,
                  metric = 'poisson',
                  min_data_in_leaf = 5000,
                  min_split_gain = 0,
                  n_estimators = 500,
                  num_leaves = 100,
                  objective = 'poisson',
                  verbose = -1),
            model2 = dict(boosting_type = 'goss',
                  lambda_11 = 0.2,
                  lambda_12 = 0,
                  learning_rate = 0.12,
                  max_depth = 3,
                  metric = 'gamma',
                  min_data_in_leaf = 400,
                  min_split_gain = 0,
                  n_estimators = 300,
                  num_leaves = 100,
                  objective = 'gamma',
                  verbose = -1))

In [89]:
# access a value
print('model 1 boosting_type:', my_dict['model1']['boosting_type'])

model 1 boosting_type: goss


In [90]:
# modify value
my_dict['model2']['n_estimators'] = 350
print('model 2 n_estimators:', my_dict['model2']['n_estimators'])

model 2 n_estimators: 350


In [91]:
# Delete key-value pair
del my_dict['model1']['verbose']
my_dict['model1']

{'boosting_type': 'goss',
 'lambda_11': 0.1,
 'lambda_12': 0,
 'learning_rate': 0.1,
 'max_depth': 4,
 'metric': 'poisson',
 'min_data_in_leaf': 5000,
 'min_split_gain': 0,
 'n_estimators': 500,
 'num_leaves': 100,
 'objective': 'poisson'}

In [92]:
# Iterate through keys and values
for key, value in my_dict['model1'].items():
    print(key, ":", value)

boosting_type : goss
lambda_11 : 0.1
lambda_12 : 0
learning_rate : 0.1
max_depth : 4
metric : poisson
min_data_in_leaf : 5000
min_split_gain : 0
n_estimators : 500
num_leaves : 100
objective : poisson


In [93]:
# Check if a key exists
if 'verbose' in my_dict['model1']:
    print("verbose is present")
else:
    print("verbose is not present")

verbose is not present


In [94]:
# Merge dictionaries
add_dict = { 'model3': {'boosting type': 'gbdt', 'n_estimators': 100},
             'model4': {'boosting type': 'gbdt', 'n_estimators': 500}}
my_dict.update(add_dict)

In [98]:
# Length of dictionary
print("Number of models:", len(my_dict))
for i in range(1,len(my_dict)+1):
  mod = f'model{i}'
  print(f'Number of hyperparameters in {mod}: {len(my_dict[mod])}')

Number of models: 4
Number of hyperparameters in model1: 11
Number of hyperparameters in model2: 12
Number of hyperparameters in model3: 2
Number of hyperparameters in model4: 2


In [100]:
# Clear the dictionary
my_dict.clear()
my_dict

{}