In [3]:
# link used for this task
# https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html

# Git hub --> https://github.com/pandas-dev/pandas

In [60]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", -1)

  pd.set_option("display.max_colwidth", -1)


In [4]:
url = (
    "https://raw.github.com/pandas-dev"
    "/pandas/master/pandas/tests/io/data/csv/tips.csv"
)

tips = pd.read_csv(url)

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### SELECT

In [9]:
# SELECT total_bill, tip, smoker, time FROM tips;

tips[['total_bill', 'tip', 'smoker', 'time']]  # or

# tips.loc[:, ['total_bill', 'tip', 'smoker', 'time'] ]

Unnamed: 0,total_bill,tip,smoker,time
0,16.99,1.01,No,Dinner
1,10.34,1.66,No,Dinner
2,21.01,3.50,No,Dinner
3,23.68,3.31,No,Dinner
4,24.59,3.61,No,Dinner
...,...,...,...,...
239,29.03,5.92,No,Dinner
240,27.18,2.00,Yes,Dinner
241,22.67,2.00,Yes,Dinner
242,17.82,1.75,No,Dinner


In [10]:
# SELECT *, tip/total_bill as tip_rate FROM tips;

tips.assign(tip_rate = tips['tip']/tips['total_bill'])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


#### WHERE

In [13]:
# SELECT * FROM tips WHERE time = 'Dinner';

tips[tips['time'] == 'Dinner']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [17]:
# SELECT *  FROM tips WHERE time = 'Dinner' AND tip > 5.00;

tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.0)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
116,29.93,5.07,Male,No,Sun,Dinner,4
155,29.85,5.14,Female,No,Sun,Dinner,5
170,50.81,10.0,Male,Yes,Sat,Dinner,3
172,7.25,5.15,Male,Yes,Sun,Dinner,2
181,23.33,5.65,Male,Yes,Sun,Dinner,2


In [19]:
# SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;

tips[(tips['size'] > 5) | (tips['total_bill'] > 45)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
59,48.27,6.73,Male,No,Sat,Dinner,4
125,29.8,4.2,Female,No,Thur,Lunch,6
141,34.3,6.7,Male,No,Thur,Lunch,6
143,27.05,5.0,Female,No,Thur,Lunch,6
156,48.17,5.0,Male,No,Sun,Dinner,6
170,50.81,10.0,Male,Yes,Sat,Dinner,3
182,45.35,3.5,Male,Yes,Sun,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4


In [20]:
# NULL checks

frame = pd.DataFrame(
    {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]}
)


In [21]:
frame

Unnamed: 0,col1,col2
0,A,F
1,B,
2,,G
3,C,H
4,D,I


In [23]:
# SELECT * FROM frame WHERE col2 IS NULL;

frame[frame['col2'].isna()]

Unnamed: 0,col1,col2
1,B,


In [25]:
# SELECT * FROM frame WHERE col1 IS NOT NULL;

frame[frame['col1'].notna()]

Unnamed: 0,col1,col2
0,A,F
1,B,
3,C,H
4,D,I


### GROUPBY

In [28]:
# SELECT sex, count(*) FROM tips GROUP BY sex;
# /*
# Female     87
# Male      157
# */

tips.groupby('sex').size().to_frame('count').reset_index()

Unnamed: 0,sex,count
0,Female,87
1,Male,157


In [32]:
# SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
# /*
# Fri   2.734737   19
# Sat   2.993103   87
# Sun   3.255132   76
# Thu  2.771452   62
# */

tips.groupby('day').agg({'tip':np.mean, 'day':np.size})

Unnamed: 0_level_0,tip,day
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.734737,19
Sat,2.993103,87
Sun,3.255132,76
Thur,2.771452,62


In [35]:
# SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
# /*
# smoker day
# No     Fri      4  2.812500
#        Sat     45  3.102889
#        Sun     57  3.167895
#        Thu    45  2.673778
# Yes    Fri     15  2.714000
#        Sat     42  2.875476
#        Sun     19  3.516842
#        Thu    17  3.030000
# */

tips.groupby(['smoker', 'day']).agg({'tip':[np.size, np.mean]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2
No,Fri,4,2.8125
No,Sat,45,3.102889
No,Sun,57,3.167895
No,Thur,45,2.673778
Yes,Fri,15,2.714
Yes,Sat,42,2.875476
Yes,Sun,19,3.516842
Yes,Thur,17,3.03


### JOIN

In [36]:
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})

df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})

In [39]:
print(df1)
print()
print(df2)

  key     value
0   A -0.439579
1   B  0.847060
2   C -0.463022
3   D  0.401283

  key     value
0   B  0.402584
1   D  1.518904
2   D  1.112925
3   E  0.431224


In [45]:
# Inner join

# SELECT * FROM df1 INNER JOIN df2  ON df1.key = df2.key;

pd.merge(df1, df2, on='key')

Unnamed: 0,key,value_x,value_y
0,B,0.84706,0.402584
1,D,0.401283,1.518904
2,D,0.401283,1.112925


In [42]:
indexed_df2 = df2.set_index('key')

In [43]:
indexed_df2

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
B,0.402584
D,1.518904
D,1.112925
E,0.431224


In [44]:
pd.merge(df1, indexed_df2, left_on='key', right_index=True)

Unnamed: 0,key,value_x,value_y
1,B,0.84706,0.402584
3,D,0.401283,1.518904
3,D,0.401283,1.112925


In [47]:
# how = 'outer' --> for full join
# how = 'right' --> for right join
## Left outer

# SELECT * FROM df1 LEFT OUTER JOIN df2   ON df1.key = df2.key;

pd.merge(df1, df2, on='key', how='left') 

Unnamed: 0,key,value_x,value_y
0,A,-0.439579,
1,B,0.84706,0.402584
2,C,-0.463022,
3,D,0.401283,1.518904
4,D,0.401283,1.112925


### UNION

In [48]:
df1 = pd.DataFrame(
    {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
)


df2 = pd.DataFrame(
    {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
)

In [50]:
print(df1)
print()
print(df2)

            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3

          city  rank
0      Chicago     1
1       Boston     4
2  Los Angeles     5


In [52]:
# SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
# /*
#          city  rank
#       Chicago     1
# San Francisco     2
# New York City     3
#       Chicago     1
#        Boston     4
#   Los Angeles     5
# */

pd.concat([df1, df2]).reset_index()

Unnamed: 0,index,city,rank
0,0,Chicago,1
1,1,San Francisco,2
2,2,New York City,3
3,0,Chicago,1
4,1,Boston,4
5,2,Los Angeles,5


In [57]:
# SELECT city, rank FROM df1 UNION SELECT city, rank FROM df2;
# -- notice that there is only one Chicago record this time
# /*
#          city  rank
#       Chicago     1
# San Francisco     2
# New York City     3
#        Boston     4
#   Los Angeles     5
# */

pd.concat([df1, df2]).drop_duplicates().reset_index(drop=True)

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3
3,Boston,4
4,Los Angeles,5


### Top n Rows per group

In [66]:
# SELECT * FROM (
#   SELECT
#     t.*,
#     ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
#   FROM tips t
# )
# WHERE rn < 3
# ORDER BY day, rn;

(
    tips.assign(
    rn = tips.sort_values(['total_bill'], ascending=False)
    .groupby(['day'])
    .cumcount() + 1
    )
    .query("rn < 3")
    .sort_values(['day', 'rn'])
    .reset_index(drop=True)
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,rn
0,40.17,4.73,Male,Yes,Fri,Dinner,4,1
1,28.97,3.0,Male,Yes,Fri,Dinner,2,2
2,50.81,10.0,Male,Yes,Sat,Dinner,3,1
3,48.33,9.0,Male,No,Sat,Dinner,4,2
4,48.17,5.0,Male,No,Sun,Dinner,6,1
5,45.35,3.5,Male,Yes,Sun,Dinner,3,2
6,43.11,5.0,Female,Yes,Thur,Lunch,4,1
7,41.19,5.0,Male,No,Thur,Lunch,5,2


In [88]:
# SELECT * FROM (
#   SELECT
#     t.*,
#     RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
#   FROM tips t
#   WHERE tip < 2
# )
# WHERE rnk < 3
# ORDER BY sex, rnk;

(
    tips[tips['tip']<2]
    .assign(rnk = tips.groupby(['sex'])['tip'].rank(method='min'))
    .query("rnk < 3")
    .sort_values(['sex', 'rnk'])
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,rnk
67,3.07,1.0,Female,Yes,Sat,Dinner,1,1.0
92,5.75,1.0,Female,Yes,Fri,Dinner,2,1.0
111,7.25,1.0,Female,No,Sat,Dinner,1,1.0
236,12.6,1.0,Male,Yes,Sat,Dinner,2,1.0
237,32.83,1.17,Male,Yes,Sat,Dinner,2,2.0


In [73]:
# select *, min(total_bill) over(partition by day) as min_total_bill from tips



In [79]:
(
    tips.assign(
        min_total_bill = tips.groupby('day').cumcount()+1
    )
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,min_total_bill
0,16.99,1.01,Female,No,Sun,Dinner,2,1
1,10.34,1.66,Male,No,Sun,Dinner,3,2
2,21.01,3.5,Male,No,Sun,Dinner,3,3
3,23.68,3.31,Male,No,Sun,Dinner,2,4
4,24.59,3.61,Female,No,Sun,Dinner,4,5
5,25.29,4.71,Male,No,Sun,Dinner,4,6
6,8.77,2.0,Male,No,Sun,Dinner,2,7
7,26.88,3.12,Male,No,Sun,Dinner,4,8
8,15.04,1.96,Male,No,Sun,Dinner,2,9
9,14.78,3.23,Male,No,Sun,Dinner,2,10
