# Pandas Features

In [1]:
import pandas as pd

## Pandas Series

### Pandas Attributes

In [2]:
# define a list of values
sales_list = [107512, 103208, 99388, 103838, 104631]

# create a Pandas series
sales_series = pd.Series(sales_list)
sales_series

0    107512
1    103208
2     99388
3    103838
4    104631
dtype: int64

In [3]:
# The default indexing starts from zero
print(sales_series.index)
new_index = ['The Kissing Booth', 
            'Between Worlds', 
            'Sicario: Day of the Soldado', 
            'Spider-Man: Into the Spider-Verse', 
            'Ant-Man and the Wasp']

sales_series.index  = new_index
print(sales_series['Ant-Man and the Wasp'])


# Retrieve the values of the series
print(sales_series.values)

RangeIndex(start=0, stop=5, step=1)
104631
[107512 103208  99388 103838 104631]


### Series Methods (Computations)

In [4]:
#Average
print(sales_series.mean())

#Total sum
print(sales_series.sum())

103715.4
518577


In [5]:
# Create a series from a python dict
sales_dict = {'Dragon Ball Super: Origin of the Saiyans': 105982,
              'Animal World': 108293,
              'Avengers: Infinity War': 112178,
              'A Quiet Place': 103813,
              'Bumblebee': 106562}

sales_series_dict = pd.Series(sales_dict)
print(sales_series_dict)

Dragon Ball Super: Origin of the Saiyans    105982
Animal World                                108293
Avengers: Infinity War                      112178
A Quiet Place                               103813
Bumblebee                                   106562
dtype: int64


### Concatenate

In [6]:
# Vertically concatenate two series
sales_series = pd.concat([sales_series, sales_series_dict], axis='rows')
print(sales_series)

The Kissing Booth                           107512
Between Worlds                              103208
Sicario: Day of the Soldado                  99388
Spider-Man: Into the Spider-Verse           103838
Ant-Man and the Wasp                        104631
Dragon Ball Super: Origin of the Saiyans    105982
Animal World                                108293
Avengers: Infinity War                      112178
A Quiet Place                               103813
Bumblebee                                   106562
dtype: int64


### Loc and iloc (indicies)

In [7]:
# Slicing the series using a boolean array operation 
sales_series.loc[sales_series < 100000]

Sicario: Day of the Soldado    99388
dtype: int64

In [8]:
# Slicing the series using index range
sales_series.loc['Ant-Man and the Wasp':'A Quiet Place']

Ant-Man and the Wasp                        104631
Dragon Ball Super: Origin of the Saiyans    105982
Animal World                                108293
Avengers: Infinity War                      112178
A Quiet Place                               103813
dtype: int64

In [9]:
# Slicing the series using iloc
sales_series.iloc[0:5]

The Kissing Booth                    107512
Between Worlds                       103208
Sicario: Day of the Soldado           99388
Spider-Man: Into the Spider-Verse    103838
Ant-Man and the Wasp                 104631
dtype: int64

In [10]:
# Defining the column name
sales_series.name = 'Total tickets sold'

# Defining the name of the index
sales_series.index.name = 'Movie Name'

print(sales_series)

Movie Name
The Kissing Booth                           107512
Between Worlds                              103208
Sicario: Day of the Soldado                  99388
Spider-Man: Into the Spider-Verse           103838
Ant-Man and the Wasp                        104631
Dragon Ball Super: Origin of the Saiyans    105982
Animal World                                108293
Avengers: Infinity War                      112178
A Quiet Place                               103813
Bumblebee                                   106562
Name: Total tickets sold, dtype: int64


## Pandas Dataframe

In [11]:
tickets_sold_dict =  {'The Kissing Booth': 107512,
                        'Between Worlds': 103208,
                        'Sicario: Day of the Soldado': 99388,
                        'Spider-Man: Into the Spider-Verse': 103838,
                        'Ant-Man and the Wasp': 104631, 
                        'Dragon Ball Super: Origin of the Saiyans': 105982,
                        'Animal World': 108293,
                        'Avengers: Infinity War': 112178,
                        'A Quiet Place': 103813,
                        'Bumblebee': 106562}

tickets_sold = pd.Series(tickets_sold_dict)

In [12]:
max_capacity_dict = {'A Quiet Place': 427725,
                      'Animal World': 427300,
                      'Ant-Man and the Wasp': 429350,
                      'Avengers: Infinity War': 424325,
                      'Between Worlds': 423375,
                      'Bumblebee': 427950,
                      'Dragon Ball Super: Origin of the Saiyans': 423225,
                      'Sicario: Day of the Soldado': 427950,
                      'Spider-Man: Into the Spider-Verse': 428375,
                      'The Kissing Booth': 418750}

max_capacity = pd.Series(max_capacity_dict)

In [13]:
# create a DataFrame object from the series objects
sales_df = pd.DataFrame({'tickets_sold': tickets_sold, 
                         'max_capacity': max_capacity})
sales_df.head()

Unnamed: 0,tickets_sold,max_capacity
A Quiet Place,103813,427725
Animal World,108293,427300
Ant-Man and the Wasp,104631,429350
Avengers: Infinity War,112178,424325
Between Worlds,103208,423375


In [14]:
# access a specific column (like dict[key])
sales_df['tickets_sold'].head()

A Quiet Place             103813
Animal World              108293
Ant-Man and the Wasp      104631
Avengers: Infinity War    112178
Between Worlds            103208
Name: tickets_sold, dtype: int64

In [15]:
# find movies which did not sell 100k tickets
sales_df.loc[sales_df['tickets_sold'] < 100000]

Unnamed: 0,tickets_sold,max_capacity
Sicario: Day of the Soldado,99388,427950


### Reading and saving CSVs

In [16]:
# create a DataFrame from a csv file
total_sales = pd.read_csv('booking_summary.csv')

# save a DataFrame as a csv file
total_sales.tail(10).to_csv('last_ten.csv')

#Note: To ensure that Pandas do not need to save the index column, you can add the `index=False` to the `.to_csv()` method.
total_sales.tail(10).to_csv('last_ten.csv', index=False)

### Column count


In [17]:
total_sales['classification'].value_counts()

M        15
PG        8
MA15+     5
G         2
Name: classification, dtype: int64

### Create new column based on other columns

In [18]:
total_sales['occupancy_rate'] = round(total_sales['tickets_sold'] / total_sales['max_capacity'], 2)
total_sales.head()

Unnamed: 0,movie_name,classification,tickets_sold,max_capacity,occupancy_rate
0,A Quiet Place,M,103813,427725,0.24
1,Alpha,PG,103596,422525,0.25
2,An Interview with God,PG,104182,426575,0.24
3,Animal World,G,108293,427300,0.25
4,Ant-Man and the Wasp,PG,104631,429350,0.24


### Sorting

In [19]:
total_sales.sort_values('occupancy_rate', ascending=False).head(1)

Unnamed: 0,movie_name,classification,tickets_sold,max_capacity,occupancy_rate
29,Venom,M,110053,424200,0.26


### Filtering

In [20]:
total_sales[total_sales['classification'] == 'PG']
total_sales.head()

Unnamed: 0,movie_name,classification,tickets_sold,max_capacity,occupancy_rate
0,A Quiet Place,M,103813,427725,0.24
1,Alpha,PG,103596,422525,0.25
2,An Interview with God,PG,104182,426575,0.24
3,Animal World,G,108293,427300,0.25
4,Ant-Man and the Wasp,PG,104631,429350,0.24


### Sort over multiple columns

In [21]:
total_sales.sort_values(['classification', 'occupancy_rate', 'tickets_sold'],
                       ascending=[True, False, False]).drop(['max_capacity'], axis='columns')
total_sales.head()

Unnamed: 0,movie_name,classification,tickets_sold,max_capacity,occupancy_rate
0,A Quiet Place,M,103813,427725,0.24
1,Alpha,PG,103596,422525,0.25
2,An Interview with God,PG,104182,426575,0.24
3,Animal World,G,108293,427300,0.25
4,Ant-Man and the Wasp,PG,104631,429350,0.24


### Group by

In [22]:
total_sales.groupby('classification')['tickets_sold'].sum()

classification
G         215910
M        1578844
MA15+     521932
PG        834136
Name: tickets_sold, dtype: int64

In [23]:
total_sales.groupby('classification').size()

classification
G         2
M        15
MA15+     5
PG        8
dtype: int64

In [24]:
total_sales.groupby('classification').mean()

Unnamed: 0_level_0,tickets_sold,max_capacity,occupancy_rate
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G,107955.0,425612.5,0.25
M,105256.266667,424893.333333,0.248
MA15+,104386.4,424120.0,0.246
PG,104267.0,426875.0,0.2425


In [25]:
total_sales.groupby('classification').agg({'max_capacity': 'max', 'tickets_sold': 'mean', 'occupancy_rate': 'mean'})

Unnamed: 0_level_0,max_capacity,tickets_sold,occupancy_rate
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G,427300,107955.0,0.25
M,432225,105256.266667,0.248
MA15+,427950,104386.4,0.246
PG,430400,104267.0,0.2425
