# Pandas Cookbook 3nd Edition by Packt

# Chapter 1: Pandas Foundations

In [4]:
import pandas as pd
import numpy as np

pd.set_option('max_columns', 4, 'max_rows', 10)

In [5]:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

## Intrduction

## Dissecting the anatomy of a DataFrame

In [3]:
movies = pd.read_csv('data/movie.csv')
movies.head()

Unnamed: 0,color,director_name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


## DataFrame Attributes

In [6]:
columns = movies.columns
index = movies.index
data = movies.values

In [13]:
columns, index, data, type(index), type(columns), issubclass(pd.RangeIndex, pd.Index)

(Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
        'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
        'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
        'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
        'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
        'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
        'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
        'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
       dtype='object'),
 RangeIndex(start=0, stop=4916, step=1),
 array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
        ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
        ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
        ...,
        ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
        ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
        ['Color', 'Jon Gunn', 43.0, ..

In [14]:
index.values, columns.values

(array([   0,    1,    2, ..., 4913, 4914, 4915], dtype=int64),
 array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
        'director_facebook_likes', 'actor_3_facebook_likes',
        'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
        'actor_1_name', 'movie_title', 'num_voted_users',
        'cast_total_facebook_likes', 'actor_3_name',
        'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
        'num_user_for_reviews', 'language', 'country', 'content_rating',
        'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
        'aspect_ratio', 'movie_facebook_likes'], dtype=object))

## Understanding data types

In [15]:
movies.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [17]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

In [20]:
pd.Series(['Paul', np.nan, 'George']).dtype, \
pd.Series([1, 2, 3]).dtype


(dtype('O'), dtype('int64'))

## Selecting a Column

In [22]:
movies['director_name'], \
movies.director_name

(0           James Cameron
 1          Gore Verbinski
 2              Sam Mendes
 3       Christopher Nolan
 4             Doug Walker
               ...        
 4911          Scott Smith
 4912                  NaN
 4913     Benjamin Roberds
 4914          Daniel Hsia
 4915             Jon Gunn
 Name: director_name, Length: 4916, dtype: object,
 0           James Cameron
 1          Gore Verbinski
 2              Sam Mendes
 3       Christopher Nolan
 4             Doug Walker
               ...        
 4911          Scott Smith
 4912                  NaN
 4913     Benjamin Roberds
 4914          Daniel Hsia
 4915             Jon Gunn
 Name: director_name, Length: 4916, dtype: object)

In [26]:
movies.loc[:, 'director_name'], movies.iloc[:, 1]

(0           James Cameron
 1          Gore Verbinski
 2              Sam Mendes
 3       Christopher Nolan
 4             Doug Walker
               ...        
 4911          Scott Smith
 4912                  NaN
 4913     Benjamin Roberds
 4914          Daniel Hsia
 4915             Jon Gunn
 Name: director_name, Length: 4916, dtype: object,
 0           James Cameron
 1          Gore Verbinski
 2              Sam Mendes
 3       Christopher Nolan
 4             Doug Walker
               ...        
 4911          Scott Smith
 4912                  NaN
 4913     Benjamin Roberds
 4914          Daniel Hsia
 4915             Jon Gunn
 Name: director_name, Length: 4916, dtype: object)

In [34]:
movies['director_name'].index, movies['director_name'].dtype, \
movies['director_name'].size, movies['director_name'].name,   \
type(movies['director_name']), movies['director_name'].apply(type).unique()

(RangeIndex(start=0, stop=4916, step=1),
 dtype('O'),
 4916,
 'director_name',
 pandas.core.series.Series,
 array([<class 'str'>, <class 'float'>], dtype=object))

## Calling Series Methods

In [47]:
s_attr_methods = set(dir(pd.Series)) 
len(s_attr_methods)

434

In [48]:
df_attr_methods = set(dir(pd.DataFrame)) 
len(df_attr_methods)

441

In [49]:
len(set(dir(pd.Series)) & set(dir(pd.DataFrame)))

384

In [50]:
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [52]:
director.dtype, fb_likes.dtype

(dtype('O'), dtype('float64'))

In [53]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [63]:
director.sample(n=5, random_state=42)

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [67]:
director.value_counts(), fb_likes.value_counts()

(Steven Spielberg    26
 Woody Allen         22
 Clint Eastwood      20
 Martin Scorsese     20
 Ridley Scott        16
                     ..
 Cody Cameron         1
 Sam Firstenberg      1
 Tay Garnett          1
 Andrea Arnold        1
 Yarrow Cheney        1
 Name: director_name, Length: 2397, dtype: int64,
 1000.0     436
 11000.0    206
 2000.0     189
 3000.0     150
 12000.0    131
           ... 
 564.0        1
 46000.0      1
 49.0         1
 447.0        1
 161.0        1
 Name: actor_1_facebook_likes, Length: 877, dtype: int64)

In [82]:
# count() - Null 값 제외 Count
director.size,  \
director.shape, \
len(director), \
director.unique(), \
director.count(), \
director.isna()

(4916,
 (4916,),
 4916,
 array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
        'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'], dtype=object),
 4814,
 0       False
 1       False
 2       False
 3       False
 4       False
         ...  
 4911    False
 4912     True
 4913    False
 4914    False
 4915    False
 Name: director_name, Length: 4916, dtype: bool)

In [81]:
# quantile() - default 0.5(median)
fb_likes.count(),    \
fb_likes.quantile(), \
fb_likes.min(),      \
fb_likes.max(),      \
fb_likes.mean(),     \
fb_likes.median(),   \
fb_likes.std(),      \
fb_likes.var(),      \
fb_likes.quantile(0.2), \
fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])


(4909,
 982.0,
 0.0,
 640000.0,
 6494.488490527602,
 982.0,
 15106.986883848309,
 228221052.70876482,
 510.0,
 0.1      240.0
 0.2      510.0
 0.3      694.0
 0.4      854.0
 0.5      982.0
 0.6     1000.0
 0.7     8000.0
 0.8    13000.0
 0.9    18000.0
 Name: actor_1_facebook_likes, dtype: float64)

In [89]:
fb_likes_filled = fb_likes.fillna(0)

len(fb_likes), \
fb_likes.count(), \
fb_likes.size, \
fb_likes_filled.count()

(4916, 4909, 4916, 4916)

In [90]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

4909

In [92]:
# normalize=True - 전체갯수를 대상 갯수로 나눈다.
director.value_counts(), \
director.value_counts(normalize=True)

(Steven Spielberg    26
 Woody Allen         22
 Clint Eastwood      20
 Martin Scorsese     20
 Ridley Scott        16
                     ..
 Cody Cameron         1
 Sam Firstenberg      1
 Tay Garnett          1
 Andrea Arnold        1
 Yarrow Cheney        1
 Name: director_name, Length: 2397, dtype: int64,
 Steven Spielberg    0.005401
 Woody Allen         0.004570
 Clint Eastwood      0.004155
 Martin Scorsese     0.004155
 Ridley Scott        0.003324
                       ...   
 Cody Cameron        0.000208
 Sam Firstenberg     0.000208
 Tay Garnett         0.000208
 Andrea Arnold       0.000208
 Yarrow Cheney       0.000208
 Name: director_name, Length: 2397, dtype: float64)

In [95]:
director.hasnans, \
director.notna(), \
director.isnull()

(True,
 0        True
 1        True
 2        True
 3        True
 4        True
         ...  
 4911     True
 4912    False
 4913     True
 4914     True
 4915     True
 Name: director_name, Length: 4916, dtype: bool,
 0       False
 1       False
 2       False
 3       False
 4       False
         ...  
 4911    False
 4912     True
 4913    False
 4914    False
 4915    False
 Name: director_name, Length: 4916, dtype: bool)

## Series Opeations

In [98]:
movies = pd.read_csv('data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [102]:
imdb_score + 1, \
imdb_score * 2.5, \
imdb_score // 7, \
imdb_score > 7

(0       8.9
 1       8.1
 2       7.8
 3       9.5
 4       8.1
        ... 
 4911    8.7
 4912    8.5
 4913    7.3
 4914    7.3
 4915    7.6
 Name: imdb_score, Length: 4916, dtype: float64,
 0       19.75
 1       17.75
 2       17.00
 3       21.25
 4       17.75
         ...  
 4911    19.25
 4912    18.75
 4913    15.75
 4914    15.75
 4915    16.50
 Name: imdb_score, Length: 4916, dtype: float64,
 0       1.0
 1       1.0
 2       0.0
 3       1.0
 4       1.0
        ... 
 4911    1.0
 4912    1.0
 4913    0.0
 4914    0.0
 4915    0.0
 Name: imdb_score, Length: 4916, dtype: float64,
 0        True
 1        True
 2       False
 3        True
 4        True
         ...  
 4911     True
 4912     True
 4913    False
 4914    False
 4915    False
 Name: imdb_score, Length: 4916, dtype: bool)

In [104]:
imdb_score.add(1), \
imdb_score.gt(7)

(0       8.9
 1       8.1
 2       7.8
 3       9.5
 4       8.1
        ... 
 4911    8.7
 4912    8.5
 4913    7.3
 4914    7.3
 4915    7.6
 Name: imdb_score, Length: 4916, dtype: float64,
 0        True
 1        True
 2       False
 3        True
 4        True
         ...  
 4911     True
 4912     True
 4913    False
 4914    False
 4915    False
 Name: imdb_score, Length: 4916, dtype: bool)

In [103]:
director = movies['director_name']
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## Chaining Series Methods

In [105]:
movies = pd.read_csv('data/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [107]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: director_name, dtype: int64

In [110]:
fb_likes.isna().sum(), \
fb_likes.dtype

(7, dtype('float64'))

In [111]:
fb_likes.fillna(0).astype(int).head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [112]:
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [113]:
fb_likes.isna().mean()

0.0014239218877135883

In [118]:
def debug_df(df):
    print('BEFORE')
    print(df)
    print('AFTER')
    df = df + 1
    return df

fb_likes.fillna(0).pipe(debug_df).astype(int).head()

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0     1001
1    40001
2    11001
3    27001
4      132
Name: actor_1_facebook_likes, dtype: int32

In [124]:
intermediate = None
def get_intermediate(df):
    global intermediate
    intermediate = df
    return df

res = fb_likes.fillna(0).pipe(get_intermediate).astype(int).head()
res, \
intermediate

(0     1000
 1    40000
 2    11000
 3    27000
 4      131
 Name: actor_1_facebook_likes, dtype: int32,
 0        1000.0
 1       40000.0
 2       11000.0
 3       27000.0
 4         131.0
          ...   
 4911      637.0
 4912      841.0
 4913        0.0
 4914      946.0
 4915       86.0
 Name: actor_1_facebook_likes, Length: 4916, dtype: float64)

## Renaming Column Names

In [143]:
movies = pd.read_csv('data/movie.csv')

In [127]:
col_map = {'director_name':'Director Name', 'num_critic_for_reviews':'Critical Reviews'}
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


In [129]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}

movies.set_index('movie_title').rename(index=idx_map, columns=col_map).head(3)

Unnamed: 0_level_0,color,director_name,...,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [134]:
movies = pd.read_csv('data/movie.csv', index_col='movie_title')
ids = movies.index.tolist()
columns = movies.columns.tolist()
ids[:3], columns[:3]


(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre'],
 ['color', 'director_name', 'num_critic_for_reviews'])

## rename the row and column labels with list assignments

In [136]:
ids[0] = 'Ravata'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[1] = 'director'
columns[-2] = 'aspect'
columns[1] = 'fblikes'

movies.index = ids
movies.columns = columns

In [137]:
movies.head(3)

Unnamed: 0,color,fblikes,...,aspect,movie_facebook_likes
Ravata,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [139]:
def to_clean(val):
    return val.strip().lower().replace(' ', '_')

movies.rename(columns=to_clean).head(3)

Unnamed: 0,color,fblikes,...,aspect,movie_facebook_likes
Ravata,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [147]:
cols = [col.strip().upper().replace(' ', '_') for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0,COLOR,DIRECTOR_NAME,...,ASPECT_RATIO,MOVIE_FACEBOOK_LIKES
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000


## Creating and Deleting columns

In [152]:
movies = pd.read_csv('data/movie.csv')
# movies['has_seen'] = 0
movies.head(3)

Unnamed: 0,color,director_name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000


In [153]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}

movies.rename(index=idx_map, columns=col_map).assign(has_seen=0)

Unnamed: 0,color,director_name,...,fblikes,has_seen
0,Color,James Cameron,...,33000,0
1,Color,Gore Verbinski,...,0,0
2,Color,Sam Mendes,...,85000,0
3,Color,Christopher Nolan,...,164000,0
4,,Doug Walker,...,0,0
...,...,...,...,...,...
4911,Color,Scott Smith,...,84,0
4912,Color,,...,32000,0
4913,Color,Benjamin Roberds,...,16,0
4914,Color,Daniel Hsia,...,660,0


In [156]:
total = (movies['actor_1_facebook_likes'] +
         movies['actor_2_facebook_likes'] + 
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])
total.head(5)         

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [168]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
    'actor_3_facebook_likes','director_facebook_likes']
sum_col  = movies[cols].sum(axis='columns')
sum_col.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4      274.0
dtype: float64

In [170]:
movies.assign(total_likes=sum_col).head(3)

Unnamed: 0,color,director_name,...,movie_facebook_likes,total_likes
0,Color,James Cameron,...,33000,2791.0
1,Color,Gore Verbinski,...,0,46563.0
2,Color,Sam Mendes,...,85000,11554.0


In [167]:
def sum_likes(df):
    return df[[c for c in df.columns if 'like' in c]].sum(axis=1)

movies.assign(total_likes=sum_likes).head(3)

Unnamed: 0,color,director_name,...,movie_facebook_likes,total_likes
0,Color,James Cameron,...,33000,40625.0
1,Color,Gore Verbinski,...,0,94913.0
2,Color,Sam Mendes,...,85000,108254.0


In [174]:
movies.assign(total_likes=sum_col)['total_likes'].isna().sum()

0

In [179]:
total
movies.assign(total_likes=total.fillna(0))['total_likes'].isna().sum()

0

In [186]:
def cast_like_gt_actor_director(df):
    return df['cast_total_facebook_likes'] >= df['total_likes']

df2 = movies.assign(total_likes=total, is_cast_likes_more=cast_like_gt_actor_director)
df2.head(3)

Unnamed: 0,color,director_name,...,total_likes,is_cast_likes_more
0,Color,James Cameron,...,2791.0,True
1,Color,Gore Verbinski,...,46563.0,True
2,Color,Sam Mendes,...,11554.0,True


In [198]:
df2
df2['is_cast_likes_more'].all()


False

In [206]:
df2[df2['is_cast_likes_more']==False][['cast_total_facebook_likes', 'total_likes', 'is_cast_likes_more']].count()

cast_total_facebook_likes    1065
total_likes                   943
is_cast_likes_more           1065
dtype: int64

In [207]:
df2 = df2.drop(columns='total_likes')
df2

Unnamed: 0,color,director_name,...,movie_facebook_likes,is_cast_likes_more
0,Color,James Cameron,...,33000,True
1,Color,Gore Verbinski,...,0,True
2,Color,Sam Mendes,...,85000,True
3,Color,Christopher Nolan,...,164000,True
4,,Doug Walker,...,0,False
...,...,...,...,...,...
4911,Color,Scott Smith,...,84,True
4912,Color,,...,32000,False
4913,Color,Benjamin Roberds,...,16,True
4914,Color,Daniel Hsia,...,660,True


In [214]:
actor_sum = movies[[c for c in movies.columns if 'actor_' in c and '_likes' in c]].sum(axis='columns')

movies['cast_total_facebook_likes'] >= actor_sum
movies['cast_total_facebook_likes'].ge(actor_sum)
movies['cast_total_facebook_likes'].ge(actor_sum).all()

True

In [215]:
pct_like = actor_sum.div(movies['cast_total_facebook_likes'])

In [216]:
pct_like.describe()

count    4883.000000
mean        0.833279
std         0.140566
min         0.300767
25%         0.735284
50%         0.869289
75%         0.954774
max         1.000000
dtype: float64

In [218]:
pd.Series(pct_like.values, index=movies['movie_title'].values).head()

Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    1.000000
dtype: float64

In [225]:
profit_index = movies.columns.get_loc('gross') + 1
profit_index

9

In [226]:
movies.insert(loc=profit_index, column='profit', value=movies['gross'] - movies['budget'])

In [228]:
del movies['director_name']