In [1]:
import numpy as np
import pandas as pd
pd.set_option('float_format', '{:.2f}'.format)
# leave out the last line to display the default decimal points

In [5]:
df = pd.read_csv('./course-data/tips.csv')

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [8]:
# we want to grad the last in number of digits of the 'CC Number'
# problem int object in not subscriptable, i.e. it cannot be indexed
# -> we need to cast it to a string and use slice notation

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 21.1+ KB


In [9]:
def last_four(arg):
    return str(arg)[-4:]

# we cound recasted as an interger if neccesary int(str(arg)[-4:])

In [13]:
df['last_four'] = df['CC Number'].apply(last_four)

In [16]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221


In [17]:
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.79,3.0,2.57,7.89,2563495562019613.0
std,8.9,1.38,0.95,2.91,2369339882289543.5
min,3.07,1.0,1.0,2.88,60406789937.0
25%,13.35,2.0,2.0,5.8,30407308242440.0
50%,17.8,2.9,2.0,7.25,3525317610005860.5
75%,24.13,3.56,3.0,9.39,4553675399499020.0
max,50.81,10.0,6.0,20.27,6596453823950595.0


In [19]:
def yelp(price): 
    if price < 15:
        return '$'
    elif 15 <= price < 25: 
        return '$$'
    else: 
        return '$$$'

# we transform a continous variable X:S->[0,N] into a categorical feature Y:S->{$,$$,$$$}

In [21]:
df['yelp'] = df['total_bill'].apply(yelp)

In [22]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four,yelp
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410,$$
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230,$
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322,$$
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994,$$
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221,$$


In [23]:
df['total_bill'].apply(lambda bill: bill*2)

0     33.98
1     20.68
2     42.02
3     47.36
4     49.18
       ... 
239   58.06
240   54.36
241   45.34
242   35.64
243   37.56
Name: total_bill, Length: 244, dtype: float64

In [60]:
def quality_of_tip(total_bill, tip): 
    if tip/total_bill > 0.20: 
        return 'Generous'
    else:
        return 'Other'

In [70]:
df.drop('quality_of_bill', axis = 1)
df['quality'] = df[['total_bill','tip']].apply(lambda df: quality_of_tip(df['total_bill'], df['tip']), axis = 1)

In [71]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four,yelp,quality_of_bill,quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410,$$,Other,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230,$,Other,Other
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322,$$,Other,Other
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994,$$,Other,Other
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221,$$,Other,Other


In [72]:
len(df[df['quality']=='Generous']) / len(df) *100
# check what percentage of customers are considered generous

15.983606557377051

In [73]:
df['quality_vec'] = np.vectorize(quality_of_tip)(df['total_bill'],df['tip'])
# with the vectorize function make functions NumPy-aware 

In [74]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four,yelp,quality_of_bill,quality,quality_vec
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410,$$,Other,Other,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230,$,Other,Other,Other
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322,$$,Other,Other,Other
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994,$$,Other,Other,Other
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221,$$,Other,Other,Other


In [75]:
# we can compare the performance of both functions with the time-it module
import timeit

# code snippet to be executed only once 
setup = '''
import numpy as np
import pandas as pd
df = pd.read_csv('./course-data/tips.csv')
def quality(total_bill,tip):
    if tip/total_bill  > 0.25:
        return "Generous"
    else:
        return "Other"
'''

  
# code snippet whose execution time is to be measured 
stmt_one = ''' 
df['Tip Quality'] = df[['total_bill','tip']].apply(lambda df: quality(df['total_bill'],df['tip']),axis=1)
'''

stmt_two = '''
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])
'''

In [81]:
p1 = timeit.timeit(setup = setup, stmt = stmt_one, number = 1000) 
p1

4.271549565999521

In [83]:
p2 = timeit.timeit(setup = setup, stmt = stmt_two, number = 1000) 
p2

0.4384371679989272

In [86]:
p1/p2
# the second operation is factor 9.74 faster

9.742672103953499

In [88]:
df = pd.read_csv('./course-data/tips.csv')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [90]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.79,8.9,3.07,13.35,17.8,24.13,50.81
tip,244.0,3.0,1.38,1.0,2.0,2.9,3.56,10.0
size,244.0,2.57,0.95,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.89,2.91,2.88,5.8,7.25,9.39,20.27
CC Number,244.0,2563495562019613.0,2369339882289543.5,60406789937.0,30407308242440.0,3525317610005860.5,4553675399499020.0,6596453823950595.0


In [95]:
df.sort_values('tip', ascending = False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.00,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
...,...,...,...,...,...,...,...,...,...,...,...
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
111,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455


In [97]:
df.sort_values(['tip','size'])
# if there are duplicate tip values, then sort by size

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
111,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
92,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
...,...,...,...,...,...,...,...,...,...,...,...
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [98]:
df['total_bill']

0     16.99
1     10.34
2     21.01
3     23.68
4     24.59
       ... 
239   29.03
240   27.18
241   22.67
242   17.82
243   18.78
Name: total_bill, Length: 244, dtype: float64

In [99]:
df['total_bill'].max()

50.81

In [101]:
df['total_bill'].idxmax()
# returns only one value! 
# df[df['total_bill']==df['total_bill'].max()] might be a better alternative if 
# column variable shows up multiple times

170

In [104]:
df.iloc[170]

total_bill                     50.81
tip                            10.00
sex                             Male
smoker                           Yes
day                              Sat
time                          Dinner
size                               3
price_per_person               16.94
Payer Name             Gregory Clark
CC Number           5473850968388236
Payment ID                   Sat1954
Name: 170, dtype: object

In [106]:
df.loc[df['total_bill'].idxmin()]
# grabs the row where the total_bill value is the minimum

total_bill                      3.07
tip                             1.00
sex                           Female
smoker                           Yes
day                              Sat
time                          Dinner
size                               1
price_per_person                3.07
Payer Name             Tiffany Brock
CC Number           4359488526995267
Payment ID                   Sat3455
Name: 67, dtype: object

In [111]:
df.corr()
# only works with numberic variables

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.68,0.6,0.65,0.1
tip,0.68,1.0,0.49,0.35,0.11
size,0.6,0.49,1.0,-0.18,-0.03
price_per_person,0.65,0.35,-0.18,1.0,0.14
CC Number,0.1,0.11,-0.03,0.14,1.0


In [112]:
df['sex'].value_counts()
# only makes sense in categorical columns

Male      157
Female     87
Name: sex, dtype: int64

In [113]:
df['day'].unique()
# unique values in the column
# df['day'].nunique() return the number of unique values and is equivalent to
# len(df['day'].unique())
# nevertheless value count displays the most information for the user

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [114]:
df['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [115]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [117]:
df['sex'].replace(['Female','Male'],['F','M'])
# replace values in a column
# this is recommended if we're just replacing a single value

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [118]:
mymap = {'Female':'F', 'Male':'M'}
df['sex'].map(mymap)
# this is recommended if we're replacing multiple values

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [120]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

In [129]:
simple_df = pd.DataFrame([1,2,2,2],['a','b','c','d'])
simple_df

Unnamed: 0,0
a,1
b,2
c,2
d,2


In [130]:
simple_df.duplicated()
# this can be used as a filter

a    False
b    False
c     True
d     True
dtype: bool

In [131]:
simple_df.drop_duplicates()

Unnamed: 0,0
a,1
b,2


In [134]:
df['total_bill'].between(10,20,inclusive = True)
# this can be used as a filter

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

In [136]:
df[df['total_bill'].between(10,20,inclusive = True)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
8,15.04,1.96,Male,No,Sun,Dinner,2,7.52,Joseph Mcdonald,3522866365840377,Sun6820
9,14.78,3.23,Male,No,Sun,Dinner,2,7.39,Jerome Abbott,3532124519049786,Sun3775
10,10.27,1.71,Male,No,Sun,Dinner,2,5.14,William Riley,566287581219,Sun2546
...,...,...,...,...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2,7.76,Tracy Douglas,4097938155941930,Sat7220
235,10.07,1.25,Male,No,Sat,Dinner,2,5.04,Sean Gonzalez,3534021246117605,Sat4615
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


In [138]:
df.nlargest(10,'tip')
# sorting & head
# equivalent to df.sort_value('tip', ascending=False).iloc[0:10]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
141,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
183,23.17,6.5,Male,Yes,Sun,Dinner,4,5.79,Dr. Michael James,4718501859162,Sun6059
214,28.17,6.5,Female,Yes,Sat,Dinner,3,9.39,Marissa Jackson,4922302538691962,Sat3374
47,32.4,6.0,Male,No,Sun,Dinner,4,8.1,James Barnes,3552002592874186,Sun9677
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
88,24.71,5.85,Male,No,Thur,Lunch,2,12.36,Roger Taylor,4410248629955,Thur9003


In [139]:
df.nsmallest(10,'tip')
# equivalent to df.sort_value('tip', ascending=True).iloc[0:10]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
111,7.25,1.0,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
236,12.6,1.0,Male,Yes,Sat,Dinner,2,6.3,Matthew Myers,3543676378973965,Sat5032
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
215,12.9,1.1,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Sat6983
237,32.83,1.17,Male,Yes,Sat,Dinner,2,16.42,Thomas Brown,4284722681265508,Sat2929
75,10.51,1.25,Male,No,Sat,Dinner,2,5.26,Kenneth Hayes,213142079731108,Sat5056
135,8.51,1.25,Female,No,Thur,Lunch,2,4.26,Rebecca Harris,4320272020376174,Thur6600
235,10.07,1.25,Male,No,Sat,Dinner,2,5.04,Sean Gonzalez,3534021246117605,Sat4615


In [141]:
df.sample(5)
# return file random points of the DataFrame

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
11,35.26,5.0,Female,No,Sun,Dinner,4,8.82,Diane Macias,4577817359320969,Sun6686
6,8.77,2.0,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344,Sun5985
114,25.71,4.0,Female,No,Sun,Dinner,3,8.57,Katie Smith,5400160161311292,Sun6492
81,16.66,3.4,Male,No,Thur,Lunch,2,8.33,William Martin,4550549048402707,Thur8232
205,16.47,3.23,Female,Yes,Thur,Lunch,3,5.49,Carly Reyes,4787787236486,Thur8084


In [142]:
df.sample(frac=0.1)
# samples 10% of the rows in the DataFrame

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
177,14.48,2.0,Male,Yes,Sun,Dinner,2,7.24,John Dudley,4565183162071073,Sun6203
130,19.08,1.5,Male,No,Thur,Lunch,2,9.54,Seth Sexton,213113680829581,Thur1446
204,20.53,4.0,Male,Yes,Thur,Lunch,4,5.13,Scott Kim,3570611756827620,Thur2160
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
142,41.19,5.0,Male,No,Thur,Lunch,5,8.24,Eric Andrews,4356531761046453,Thur3621
152,17.26,2.74,Male,No,Sun,Dinner,3,5.75,Gregory Smith,4292362333741,Sun5205
118,12.43,1.8,Female,No,Thur,Lunch,2,6.22,Dr. Caroline Tucker,502047186908,Thur8084
205,16.47,3.23,Female,Yes,Thur,Lunch,3,5.49,Carly Reyes,4787787236486,Thur8084
121,13.42,1.68,Female,No,Thur,Lunch,2,6.71,Laura Garcia,5181484390945653,Thur2158
197,43.11,5.0,Female,Yes,Thur,Lunch,4,10.78,Brooke Soto,5544902205760175,Thur9313
