# Chapter 1: Pandas Foundations

In [4]:
import pandas as pd
import numpy as np

## Introduction

## Dissecting the anatomy of a DataFrame

In [5]:
import os
print(os.getcwd())

# Use the fully qualified option names. The bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options in newer pandas, which
# makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 4, 'display.max_rows', 10)

D:\Code\python\master\pandas-cook-2nd\master\Chapter01


In [6]:
movies = pd.read_csv('../data/movie.csv')
movies.head()

Unnamed: 0,color,director_name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


### How it works...

## DataFrame Attributes

### How to do it... {#how-to-do-it-1}

In [9]:
# Pull apart the three components of the DataFrame: column labels,
# row labels and the underlying data.
movies = pd.read_csv('../data/movie.csv')
columns = movies.columns
index = movies.index
# to_numpy() is the accessor the pandas documentation recommends over
# .values; it returns a NumPy ndarray here.
data = movies.to_numpy()

In [10]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [11]:
index

RangeIndex(start=0, stop=4916, step=1)

In [12]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [13]:
type(index)

pandas.core.indexes.range.RangeIndex

In [14]:
type(columns)

pandas.core.indexes.base.Index

In [15]:
type(data)

numpy.ndarray

In [16]:
issubclass(pd.RangeIndex, pd.Index)

True

### How it works...

### There's more

In [None]:
# to_numpy() is the accessor pandas recommends over .values.
index.to_numpy()

In [None]:
# to_numpy() is the accessor pandas recommends over .values.
columns.to_numpy()

## Understanding data types

### How to do it... {#how-to-do-it-2}

In [None]:
# The notebook runs from the chapter directory; the dataset lives one level
# up, as in the read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')

In [None]:
movies.dtypes

In [None]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; counting the values of .dtypes is the documented replacement.
movies.dtypes.value_counts()

In [None]:
movies.info()

### How it works...

In [None]:
pd.Series(['Paul', np.nan, 'George']).dtype

### There's more...

### See also

## Selecting a Column

### How to do it... {#how-to-do-it-3}

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')
movies['director_name']

In [None]:
movies.director_name

In [None]:
movies.loc[:, 'director_name']

In [None]:
movies.iloc[:, 1]

In [None]:
movies['director_name'].index

In [None]:
movies['director_name'].dtype

In [None]:
movies['director_name'].size

In [None]:
movies['director_name'].name

In [None]:
type(movies['director_name'])

In [None]:
movies['director_name'].apply(type).unique()

### How it works...

### There's more

### See also

## Calling Series Methods

In [None]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

In [None]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

In [None]:
len(s_attr_methods & df_attr_methods)

### How to do it... {#how-to-do-it-4}

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [None]:
director.dtype

In [None]:
fb_likes.dtype

In [None]:
director.head()

In [None]:
director.sample(n=5, random_state=42)

In [None]:
fb_likes.head()

In [None]:
director.value_counts()

In [None]:
fb_likes.value_counts()

In [None]:
director.size

In [None]:
director.shape

In [None]:
len(director)

In [None]:
director.unique()

In [None]:
director.count()

In [None]:
fb_likes.count()

In [None]:
fb_likes.quantile()

In [None]:
fb_likes.min()

In [None]:
fb_likes.max()

In [None]:
fb_likes.mean()

In [None]:
fb_likes.median()

In [None]:
fb_likes.std()

In [None]:
fb_likes.describe()

In [None]:
director.describe()

In [None]:
fb_likes.quantile(.2)

In [None]:
fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

In [None]:
director.isna()

In [None]:
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

In [None]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

### How it works...

### There's more...

In [None]:
director.value_counts(normalize=True)

In [None]:
director.hasnans

In [None]:
director.notna()

### See also

## Series Operations

In [None]:
5 + 9    # plus operator example. Adds 5 and 9

### How to do it... {#how-to-do-it-5}

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

In [None]:
imdb_score + 1

In [None]:
imdb_score * 2.5

In [None]:
imdb_score // 7

In [None]:
imdb_score > 7

In [None]:
director = movies['director_name']
director == 'James Cameron'

### How it works...

### There's more...

In [None]:
imdb_score.add(1)   # imdb_score + 1

In [None]:
imdb_score.gt(7)   # imdb_score > 7

### See also

## Chaining Series Methods

### How to do it... {#how-to-do-it-6}

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [None]:
director.value_counts().head(3)

In [None]:
fb_likes.isna().sum()

In [None]:
fb_likes.dtype

In [None]:
(fb_likes.fillna(0)
         .astype(int)
         .head()
)

### How it works...

### There's more...

In [None]:
(fb_likes.fillna(0)
         #.astype(int)
         #.head()
)

In [None]:
(fb_likes.fillna(0)
         .astype(int)
         #.head()
)

In [None]:
fb_likes.isna().mean()

In [None]:
fb_likes.fillna(0) \
        .astype(int) \
        .head()

In [None]:
def debug_df(df):
    """Print ``df`` framed by BEFORE/AFTER marker lines and return it unchanged.

    Because the argument is returned as-is, the function can be placed in
    the middle of a method chain (for example via ``.pipe``) to inspect an
    intermediate value.
    """
    # One print call with a newline separator emits the same three lines.
    print("BEFORE", df, "AFTER", sep="\n")
    return df

In [None]:
(fb_likes.fillna(0)
         .pipe(debug_df)
         .astype(int) 
         .head()
)

In [None]:
# Module-level slot that get_intermediate() overwrites with the object it receives.
intermediate = None


def get_intermediate(df):
    """Stash ``df`` in the global ``intermediate`` and return it untouched."""
    globals()['intermediate'] = df
    return df

In [None]:
res = (fb_likes.fillna(0)
         .pipe(get_intermediate)
         .astype(int) 
         .head()
)

In [None]:
intermediate

## Renaming Column Names

### How to do it...

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')

In [None]:
# Mapping of existing column label -> replacement label.
col_map = dict(
    director_name='Director Name',
    num_critic_for_reviews='Critical Reviews',
)

In [None]:
movies.rename(columns=col_map).head()

### How it works... {#how-it-works-8}

### There's more {#theres-more-7}

In [None]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .set_index('movie_title')
   .rename(index=idx_map, columns=col_map)
   .head(3)
)

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv', index_col='movie_title')
# Plain Python lists of the row and column labels, so they can be edited
# by position.
ids = movies.index.tolist()
columns = movies.columns.tolist()

# rename the row and column labels with list assignments

In [None]:
# Overwrite selected labels by position in the plain Python lists ...
ids[0], ids[1], ids[2] = 'Ratava', 'POC', 'Ertceps'
columns[1], columns[-2], columns[-1] = 'director', 'aspect', 'fblikes'

# ... then assign the edited lists back as the frame's row and column labels.
movies.index = ids
movies.columns = columns

In [None]:
movies.head(3)

In [None]:
def to_clean(val):
    """Normalise a label: trim outer whitespace, lower-case it, and turn
    each space into an underscore."""
    trimmed = val.strip()
    lowered = trimmed.lower()
    # Splitting on a single space and re-joining with '_' replaces every
    # space one-for-one (consecutive spaces yield consecutive underscores).
    return '_'.join(lowered.split(' '))

In [None]:
movies.rename(columns=to_clean).head(3)

In [None]:
cols = [col.strip().lower().replace(' ', '_')
        for col in movies.columns]
movies.columns = cols
movies.head(3)

## Creating and Deleting columns

### How to do it... {#how-to-do-it-9}

In [None]:
# The dataset lives one level above the chapter directory, as in the
# read_csv calls at the top of the notebook.
movies = pd.read_csv('../data/movie.csv')
# Assigning a scalar to a new column label adds the column in place.
movies['has_seen'] = 0

In [None]:
idx_map = {'Avatar': 'Ratava', 'Spectre': 'Ertceps',
           "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
           'movie_facebook_likes': 'fblikes'}
# idx_map is keyed by movie title, so the titles must be the row labels
# before rename(index=...) can match anything. On the default integer index
# produced by read_csv the mapping would be silently ignored.
(movies
   .set_index('movie_title')
   .rename(index=idx_map, columns=col_map)
   .assign(has_seen=0)   # assign returns a new frame; `movies` is not modified
)

In [None]:
# Element-wise sum of the three actor columns and the director column.
# Series.add without a fill_value behaves like `+`.
total = (movies['actor_1_facebook_likes']
         .add(movies['actor_2_facebook_likes'])
         .add(movies['actor_3_facebook_likes'])
         .add(movies['director_facebook_likes']))

In [None]:
total.head(5)

In [None]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
    'actor_3_facebook_likes','director_facebook_likes']
sum_col = movies[cols].sum(axis='columns')
sum_col.head(5)

In [None]:
movies.assign(total_likes=sum_col).head(5)

In [None]:
def sum_likes(df):
    """Return the row-wise total of the actor and director like columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.Series
        For each row, the sum across the selected columns.
    """
    # Matching on 'like' alone would also select aggregate columns such as
    # 'cast_total_facebook_likes' and 'movie_facebook_likes', so the total
    # would no longer agree with the explicit actor + director sum.
    like_cols = [c for c in df.columns
                 if 'like' in c and ('actor' in c or 'director' in c)]
    return df[like_cols].sum(axis=1)

In [None]:
movies.assign(total_likes=sum_likes).head(5)

In [None]:
(movies
   .assign(total_likes=sum_col)
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
(movies
   .assign(total_likes=total)
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
(movies
   .assign(total_likes=total.fillna(0))
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
def cast_like_gt_actor_director(df):
    """Compare the cast like total with the ``total_likes`` column.

    Returns the element-wise result of
    ``df['cast_total_facebook_likes'] >= df['total_likes']``.
    """
    cast_likes = df['cast_total_facebook_likes']
    combined_likes = df['total_likes']
    return cast_likes >= combined_likes

In [None]:
df2 = (movies
   .assign(total_likes=total,
           is_cast_likes_more = cast_like_gt_actor_director)
)

In [None]:
df2['is_cast_likes_more'].all()

In [None]:
df2 = df2.drop(columns='total_likes')

In [None]:
# Columns whose label contains both 'actor_' and '_likes'.
actor_like_cols = [c for c in movies.columns
                   if 'actor_' in c and '_likes' in c]
# Row-wise total across those columns.
actor_sum = movies[actor_like_cols].sum(axis='columns')

In [None]:
actor_sum.head(5)

In [None]:
movies['cast_total_facebook_likes'] >= actor_sum

In [None]:
movies['cast_total_facebook_likes'].ge(actor_sum)

In [None]:
movies['cast_total_facebook_likes'].ge(actor_sum).all()

In [None]:
# Ratio of the summed actor likes to the cast total, element-wise.
pct_like = actor_sum / movies['cast_total_facebook_likes']

In [None]:
pct_like.describe()

In [None]:
pd.Series(pct_like.values,
    index=movies['movie_title'].values).head()

### How it works... {#how-it-works-9}

### There's more... {#theres-more-8}

In [None]:
profit_index = movies.columns.get_loc('gross') + 1
profit_index

In [None]:
# DataFrame.insert modifies `movies` in place and raises ValueError when the
# column already exists, so guard the call to keep this cell safe to re-run.
if 'profit' not in movies.columns:
    movies.insert(loc=profit_index,
                  column='profit',
                  value=movies['gross'] - movies['budget'])

In [None]:
del movies['director_name']

### See also