## Most Useful Shortcuts in Jupyter Notebook

- shift + enter -> run current cell
- esc + m -> turn cell into markdown
- esc + y -> turn cell into code
- esc + a -> create cell above current
- esc + b -> create cell below current
- esc + dd -> delete current cell 

In [2]:
import pandas as pd

In [3]:
# show the installed pandas version
pd.__version__

'2.0.1'

In [4]:
# how many attributes and methods do the two main pandas objects expose?

for label, pandas_object in (("Series", pd.Series), ("DataFrame", pd.DataFrame)):
    # print() inserts its own separator after the label, hence the double space in the output
    print(f"{label}: ", len(dir(pandas_object)))

Series:  411
DataFrame:  427


### Documentation

[Series](https://pandas.pydata.org/docs/reference/series.html)

[DataFrame](https://pandas.pydata.org/docs/reference/frame.html)

In [5]:
# sample records from the pandas getting-started guide:
# https://pandas.pydata.org/docs/getting_started/index.html#getting-started

data = dict(
    Name=[
        "Braund, Mr. Owen Harris",
        "Allen, Mr. William Henry",
        "Bonnell, Miss. Elizabeth",
    ],
    Age=[22, 35, 58],
    Sex=["male", "male", "female"],
)

    

In [7]:
# create a DataFrame from the dict: each key becomes a column, each list its values

df = pd.DataFrame(data)

# a bare expression on the last line of a cell is rendered as the cell output
df

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22,male
1,"Allen, Mr. William Henry",35,male
2,"Bonnell, Miss. Elizabeth",58,female


In [8]:
# show the column labels (returned as an Index object)

df.columns

Index(['Name', 'Age', 'Sex'], dtype='object')

In [10]:
# select column "Name"
# (not displayed here: only the last expression of a cell is rendered)

df["Name"]

# check object type: a single selected column is a Series

type(df["Name"])


pandas.core.series.Series

In [12]:
# create a Series from a plain Python list

numbers = pd.Series([1,2,3])

# assign it to the DataFrame as a new column named "Numbers" (this mutates df in place)

df["Numbers"] = numbers


In [13]:
# show df again: it now carries the extra "Numbers" column

df

Unnamed: 0,Name,Age,Sex,Numbers
0,"Braund, Mr. Owen Harris",22,male,1
1,"Allen, Mr. William Henry",35,male,2
2,"Bonnell, Miss. Elizabeth",58,female,3


In [14]:
# import data from a CSV file; the path is relative to the notebook's working directory

netflix = pd.read_csv("netflix.csv")

In [15]:
# columns: list the column labels of the loaded data

netflix.columns

Index(['As of', 'Rank', 'Year to Date Rank', 'Last Week Rank', 'Title', 'Type',
       'Netflix Exclusive', 'Netflix Release Date', 'Days In Top 10',
       'Viewership Score'],
      dtype='object')

In [16]:
# shape: a (number of rows, number of columns) tuple

netflix.shape

(7100, 10)

In [18]:
# head: preview only the first 2 rows instead of dumping the whole frame

netflix.head(2)

Unnamed: 0,As of,Rank,Year to Date Rank,Last Week Rank,Title,Type,Netflix Exclusive,Netflix Release Date,Days In Top 10,Viewership Score
0,2020-04-01,1,1,1,"Tiger King: Murder, Mayhem …",TV Show,Yes,"Mar 20, 2020",9,90
1,2020-04-01,2,2,-,Ozark,TV Show,Yes,"Jul 21, 2017",5,45


In [19]:
# describe: summary statistics; with default arguments only the numeric
# columns (Rank, Days In Top 10, Viewership Score) are summarised

netflix.describe()

Unnamed: 0,Rank,Days In Top 10,Viewership Score
count,7100.0,7100.0,7100.0
mean,5.5,24.123662,122.790141
std,2.872484,58.473789,213.861642
min,1.0,1.0,1.0
25%,3.0,3.0,19.0
50%,5.5,7.0,50.0
75%,8.0,18.0,128.0
max,10.0,428.0,1474.0


In [20]:
# data types of each column
# NOTE(review): "Year to Date Rank" and "Last Week Rank" come back as object,
# presumably because of the "-" placeholders visible in head() - confirm

netflix.dtypes

As of                   object
Rank                     int64
Year to Date Rank       object
Last Week Rank          object
Title                   object
Type                    object
Netflix Exclusive       object
Netflix Release Date    object
Days In Top 10           int64
Viewership Score         int64
dtype: object

In [21]:
# info: dtypes plus non-null counts per column, a quick way to spot missing
# values (a non-null count below the number of entries means missing data)

netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7100 entries, 0 to 7099
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   As of                 7100 non-null   object
 1   Rank                  7100 non-null   int64 
 2   Year to Date Rank     7100 non-null   object
 3   Last Week Rank        7100 non-null   object
 4   Title                 7100 non-null   object
 5   Type                  7100 non-null   object
 6   Netflix Exclusive     4599 non-null   object
 7   Netflix Release Date  7100 non-null   object
 8   Days In Top 10        7100 non-null   int64 
 9   Viewership Score      7100 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 554.8+ KB


In [23]:
# max "As of": the column is stored as strings (dtype object), so this is a
# string comparison; it yields the latest date because the values are in
# YYYY-MM-DD format

netflix["As of"].max()


'2022-03-11'

In [24]:
# value_counts "Title": number of rows per title, most frequent first

netflix["Title"].value_counts()


Title
Cocomelon                       428
Ozark                            85
Cobra Kai                        81
Manifest                         80
The Queenâs Gambit             73
                               ... 
The Office                        1
Animals on the Loose: A You…      1
Dark                              1
The Secret Life of Pets 2         1
Step Up Revolution                1
Name: count, Length: 645, dtype: int64

In [27]:
# boolean filter or masking

# step 1: build a boolean mask that is True for the rows ranked number 1
is_rank_one = netflix["Rank"] == 1

# step 2: keep those rows and select column "Title" in a single .loc call
# step 3: value_counts -> how many times each title was ranked number 1
netflix.loc[is_rank_one, "Title"].value_counts()



Title
Cobra Kai                       27
Squid Game                      23
Virgin River                    20
Manifest                        19
Outer Banks                     19
                                ..
Hillbilly Elegy                  1
Fear City: New York vs. The…     1
Uncut Gems                       1
The Willoughbys                  1
The Last Kingdom                 1
Name: count, Length: 123, dtype: int64

In [28]:
# sort_values by "Title": the sorted frame is only displayed, not assigned,
# so netflix itself keeps its original row order

netflix.sort_values(by="Title")

Unnamed: 0,As of,Rank,Year to Date Rank,Last Week Rank,Title,Type,Netflix Exclusive,Netflix Release Date,Days In Top 10,Viewership Score
1655,2020-09-13,6,6,-,#Alive,Movie,Yes,"Sep 8, 2020",5,31
1645,2020-09-12,6,5,-,#Alive,Movie,Yes,"Sep 8, 2020",4,26
1613,2020-09-09,4,-,-,#Alive,Movie,Yes,"Sep 8, 2020",1,7
1634,2020-09-11,5,3,-,#Alive,Movie,Yes,"Sep 8, 2020",3,21
1667,2020-09-14,8,6,-,#Alive,Movie,Yes,"Sep 8, 2020",6,34
...,...,...,...,...,...,...,...,...,...,...
4465,2021-06-21,6,6,-,Ãlite,TV Show,Yes,"Oct 5, 2018",3,13
4474,2021-06-22,5,6,-,Ãlite,TV Show,Yes,"Oct 5, 2018",4,19
4447,2021-06-19,8,-,-,Ãlite,TV Show,Yes,"Oct 5, 2018",1,3
4455,2021-06-20,6,8,-,Ãlite,TV Show,Yes,"Oct 5, 2018",2,8


In [34]:
# replace the whitespace in every column label with underscores

netflix.columns = netflix.columns.map(lambda label: "_".join(label.split()))

netflix.columns


Index(['As_of', 'Rank', 'Year_to_Date_Rank', 'Last_Week_Rank', 'Title', 'Type',
       'Netflix_Exclusive', 'Netflix_Release_Date', 'Days_In_Top_10',
       'Viewership_Score'],
      dtype='object')

In [37]:
# apply method to a series: the lambda is called once per element

netflix["Days_In_Top_10"].apply(lambda x: x /2 )

# why you should be cautious about using it: time it and compare with the
# vectorised version in the next cell


%timeit netflix["Days_In_Top_10"].apply(lambda x: x /2 )


841 µs ± 49.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [38]:
# the vectorised form is faster: the division is applied to the whole column at once

%timeit netflix["Days_In_Top_10"] / 2


56.6 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [39]:
# do the maths of how much faster: vectorised time / apply time (both in µs,
# copied from the two %timeit results above); a ratio of about 0.07 means the
# vectorised version is roughly 15x faster

56.6 / 841

0.06730083234244946

In [46]:
# always look for an existing, more efficient method before writing your own function

netflix.As_of.apply(lambda x: int(x[:4]))

# the line above slices the first 4 characters of "As_of" (the year) and casts them to int

pd.to_datetime(netflix.As_of).dt.month

# cast with to_datetime, then use the vectorised .dt accessor; this line extracts the month
# NOTE(review): .dt.year would be the direct equivalent of the slice above



0       4
1       4
2       4
3       4
4       4
       ..
7095    3
7096    3
7097    3
7098    3
7099    3
Name: As_of, Length: 7100, dtype: int32

In [47]:
# slicing data with loc: label-based, and both ends of a slice are included
# (row labels 0 through 10, columns "As_of" through "Title")

netflix.loc[ 0:10 , "As_of":"Title"]

Unnamed: 0,As_of,Rank,Year_to_Date_Rank,Last_Week_Rank,Title
0,2020-04-01,1,1,1,"Tiger King: Murder, Mayhem …"
1,2020-04-01,2,2,-,Ozark
2,2020-04-01,3,3,2,All American
3,2020-04-01,4,4,-,Blood Father
4,2020-04-01,5,5,4,The Platform
5,2020-04-01,6,6,-,Car Masters: Rust to Riches
6,2020-04-01,7,10,-,Unorthodox
7,2020-04-01,8,7,5,Love is Blind
8,2020-04-01,9,8,-,Badland
9,2020-04-01,10,9,-,Uncorked


In [49]:
# slicing data with iloc: position-based, and the end of a slice is excluded
# (first 20 rows, columns at positions 3 to 6)

netflix.iloc[ :20 , 3:7]

Unnamed: 0,Last_Week_Rank,Title,Type,Netflix_Exclusive
0,1,"Tiger King: Murder, Mayhem …",TV Show,Yes
1,-,Ozark,TV Show,Yes
2,2,All American,TV Show,
3,-,Blood Father,Movie,
4,4,The Platform,Movie,Yes
5,-,Car Masters: Rust to Riches,TV Show,Yes
6,-,Unorthodox,TV Show,Yes
7,5,Love is Blind,TV Show,Yes
8,-,Badland,Movie,
9,-,Uncorked,Movie,Yes


In [52]:
# use chaining and wrap it in parentheses for readability:
# inside the parentheses each step can sit on its own line

(
  netflix.iloc[ :20 , 3:7]
    ["Title"]
    .value_counts()

)

Title
Tiger King: Murder, Mayhem …    2
Ozark                           2
All American                    2
Blood Father                    2
The Platform                    2
Car Masters: Rust to Riches     2
Unorthodox                      2
Love is Blind                   1
Badland                         1
Uncorked                        1
Nailed It!                      1
How to Fix a Drug Scandal       1
The Roommate                    1
Name: count, dtype: int64

In [57]:
# Pivot table (works like in Excel): average ranking by title,
# sorted so that the largest average Rank value comes first

mean_rank_by_title = pd.pivot_table(
    netflix, values="Rank", index="Title", aggfunc="mean"
)
mean_rank_by_title.sort_values("Rank", ascending=False)



Unnamed: 0_level_0,Rank
Title,Unnamed: 1_level_1
Scary Stories to Tell in th…,10.000000
Double Jeopardy,10.000000
Dare Me: Season 1,10.000000
Dark Shadows,10.000000
White Christmas,10.000000
...,...
Bad Trip,2.333333
Inventing Anna,2.071429
Below Zero,2.000000
The Woman in the Window,1.909091


In [62]:
# do the same with groupby: mean Rank per Title, largest average first

(
    netflix[["Title", "Rank"]]
    .groupby("Title")
    .mean()
    .sort_values("Rank", ascending=False)
)


Unnamed: 0_level_0,Rank
Title,Unnamed: 1_level_1
Scary Stories to Tell in th…,10.000000
Double Jeopardy,10.000000
Dare Me: Season 1,10.000000
Dark Shadows,10.000000
White Christmas,10.000000
...,...
Bad Trip,2.333333
Inventing Anna,2.071429
Below Zero,2.000000
The Woman in the Window,1.909091


In [63]:
# aggregate a column with more than one measure

# "min", "max", "mean": passing a list of names to agg yields one result
# column per measure, nested under the "Rank" header

(
    netflix.groupby(["Title"])
    [["Rank"]]
    .agg(["min","max","mean"])
)

Unnamed: 0_level_0,Rank,Rank,Rank
Unnamed: 0_level_1,min,max,mean
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
#Alive,3,8,5.333333
#blackAF,5,10,6.625000
(Un)Well,7,7,7.000000
13 Reasons Why,1,10,3.636364
17 Again,5,8,6.285714
...,...,...,...
Your Highness,9,9,9.000000
Zookeeper,7,9,8.000000
iCarly,2,10,5.190476
jeen-yuhs: A Kanye Trilogy,3,10,6.166667


In [65]:
# pivot table with more than one measure: name every aggregation as a string.
# The builtins min/max produce the same columns, but pandas 2.1+ emits a
# FutureWarning for builtin callables and recommends the string aliases.


(
    netflix.pivot_table(
        index="Title",
        values="Rank",
        aggfunc=["min", "max", "mean"],
    )
)

Unnamed: 0_level_0,min,max,mean
Unnamed: 0_level_1,Rank,Rank,Rank
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
#Alive,3,8,5.333333
#blackAF,5,10,6.625000
(Un)Well,7,7,7.000000
13 Reasons Why,1,10,3.636364
17 Again,5,8,6.285714
...,...,...,...
Your Highness,9,9,9.000000
Zookeeper,7,9,8.000000
iCarly,2,10,5.190476
jeen-yuhs: A Kanye Trilogy,3,10,6.166667


In [69]:
# aggregate more than one column, pivot table version:
# a dict maps each column to the list of functions to apply to it

# mean of "Rank" and count of "Title" per title, sorted by the mean
# NOTE(review): "Title" is both the index and an aggregated column while
# values= only lists "Rank"; the output shown was produced on pandas 2.0.1,
# verify the behaviour on other versions

(
     netflix.pivot_table(
    index="Title",
    values="Rank",
    aggfunc={
        "Rank": ["mean"],
        "Title": ["count"]
    }
    )
    .sort_values(by="mean")
)

Unnamed: 0_level_0,mean,count
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Pieces of Her,1.285714,7
The Woman in the Window,1.909091,11
Below Zero,2.000000,6
Inventing Anna,2.071429,28
Bad Trip,2.333333,9
...,...,...
Community,10.000000,2
Bo Burnham: Inside,10.000000,2
Black Island,10.000000,2
60 Days In,10.000000,4


In [70]:
# aggregate more than one column, groupby version:
# the dict passed to agg maps each selected column to its aggregation
# (mean of "Rank", count of "Title" rows per title)

(
    netflix.groupby(["Title"])
    [["Rank","Title"]]
    .agg(
    {
        "Rank": "mean",
        "Title": "count"
    }
    )
)


Unnamed: 0_level_0,Rank,Title
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
#Alive,5.333333,6
#blackAF,6.625000,8
(Un)Well,7.000000,6
13 Reasons Why,3.636364,22
17 Again,6.285714,7
...,...,...
Your Highness,9.000000,1
Zookeeper,8.000000,3
iCarly,5.190476,42
jeen-yuhs: A Kanye Trilogy,6.166667,12


In [71]:
# melting: start from a small "wide" table with one column per test

# NOTE(review): the name and teacher strings carry a trailing space - confirm
# whether that is intentional
wide_columns = dict(
    name=['Adam ', 'Bob ', 'Dave ', 'Fred '],
    age=[15, 16, 16, 15],
    test1=[95, 81, 89, None],  # the None becomes a missing value
    test2=[80, 82, 84, 88],
    teacher=['Ashby ', 'Ashby ', 'Jones ', 'Jones '],
)
wide_data = pd.DataFrame(wide_columns)

In [72]:
# view the wide data: one row per student, one column per test

wide_data

Unnamed: 0,name,age,test1,test2,teacher
0,Adam,15,95.0,80,Ashby
1,Bob,16,81.0,82,Ashby
2,Dave,16,89.0,84,Jones
3,Fred,15,,88,Jones


In [75]:
# melt to tidy (long) format: the id_vars columns are repeated on every row,
# while the value_vars columns are stacked into a "test" / "score" pair

tidy_data = wide_data.melt(
    id_vars=["name", "age", "teacher"],
    value_vars=["test1", "test2"],
    var_name="test",
    value_name="score",
)

tidy_data

Unnamed: 0,name,age,teacher,test,score
0,Adam,15,Ashby,test1,95.0
1,Bob,16,Ashby,test1,81.0
2,Dave,16,Jones,test1,89.0
3,Fred,15,Jones,test1,
4,Adam,15,Ashby,test2,80.0
5,Bob,16,Ashby,test2,82.0
6,Dave,16,Jones,test2,84.0
7,Fred,15,Jones,test2,88.0


In [78]:
# join

# load the two tables to be joined from CSV files


customer = pd.read_csv("customer.csv")
address = pd.read_csv("address.csv")
customer  # not rendered: only the last expression of a cell is displayed
address

Unnamed: 0,CustomerID,AddressID,AddressType
0,29485,1086,Main Office
1,29486,621,Main Office
2,29489,1069,Main Office
3,29490,887,Main Office
4,29492,618,Main Office
...,...,...,...
412,30113,653,Main Office
413,30115,499,Main Office
414,30116,1044,Main Office
415,30117,596,Main Office


In [80]:
# Left join on "CustomerID": keep every row of address and attach the matching customer columns
address.merge(customer, how="left", on="CustomerID")


Unnamed: 0,CustomerID,AddressID,AddressType,Title,FirstName,LastName
0,29485,1086,Main Office,Ms.,Catherine,Abel
1,29486,621,Main Office,Ms.,Kim,Abercrombie
2,29489,1069,Main Office,Ms.,Frances,Adams
3,29490,887,Main Office,Ms.,Margaret,Smith
4,29492,618,Main Office,Mr.,Jay,Adams
...,...,...,...,...,...,...
412,30113,653,Main Office,Mr.,Raja,Venugopal
413,30115,499,Main Office,Ms.,Dora,Verdad
414,30116,1044,Main Office,Ms.,Wanda,Vernon
415,30117,596,Main Office,Mr.,Robert,Vessa


In [83]:
# union: first split address by position into the first 100 rows and the remainder

address_top100 = address.iloc[:100]
address_rest = address.iloc[100:]
address_rest

Unnamed: 0,CustomerID,AddressID,AddressType
100,29643,899,Main Office
101,29644,643,Main Office
102,29645,850,Main Office
103,29646,578,Main Office
104,29649,544,Main Office
...,...,...,...
412,30113,653,Main Office
413,30115,499,Main Office
414,30116,1044,Main Office
415,30117,596,Main Office


In [84]:

# concat: stack the two pieces on top of each other; the original row labels are kept

pd.concat([address_top100, address_rest])

Unnamed: 0,CustomerID,AddressID,AddressType
0,29485,1086,Main Office
1,29486,621,Main Office
2,29489,1069,Main Office
3,29490,887,Main Office
4,29492,618,Main Office
...,...,...,...
412,30113,653,Main Office
413,30115,499,Main Office
414,30116,1044,Main Office
415,30117,596,Main Office


### Recommended Resources


- [data school](https://www.dataschool.io/author/kevin-markham/)
- [pandas documentation](https://pandas.pydata.org/docs/)
- [effective pandas - book](https://www.amazon.com.au/Effective-Pandas-Patterns-Manipulation-Treading/dp/B09MYXXSFM/ref=asc_df_B09MYXXSFM/?tag=googleshopdsk-22&linkCode=df0&hvadid=463879774783&hvpos=&hvnetw=g&hvrand=1208302731783382154&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9070881&hvtargid=pla-1599278295760&psc=1)
- [markdown cheat sheet](https://www.markdownguide.org/cheat-sheet/)