# Getting and Knowing Data

In [20]:
import pandas as pd
import numpy as np

## Step 1: Import Dataset



In [21]:
# Load the order data; despite the .csv extension the file is tab-delimited.
df = pd.read_csv("chipotle.csv", delimiter="\t")

In [22]:
# Plain-text preview of the first and last five rows, one frame after the other.
print(df.head(5), df.tail(5), sep="\n")

   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   

                                  choice_description item_price  
0                                                NaN     $2.39   
1                                       [Clementine]     $3.39   
2                                            [Apple]     $3.39   
3                                                NaN     $2.39   
4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...    $16.98   
      order_id  quantity           item_name  \
4617      1833         1       Steak Burrito   
4618      1833         1       Steak Burrito   
4619      1834         1  Chicken Salad Bowl   
4620      1834         1  C

## Step 2: Dataset Overview

In [23]:
# Dimensions of the frame as (rows, columns).
df.shape

(4622, 5)

In [24]:
# Concise summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [25]:
# Column labels as a plain Python list.
df.columns.tolist()

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

In [26]:
# Summary statistics for the numeric columns only (order_id and quantity here;
# item_price is still an object column at this point).
df.describe()

Unnamed: 0,order_id,quantity
count,4622.0,4622.0
mean,927.254868,1.075725
std,528.890796,0.410186
min,1.0,1.0
25%,477.25,1.0
50%,926.0,1.0
75%,1393.0,1.0
max,1834.0,15.0


In [27]:
# include="all" also covers the object columns, adding unique/top/freq rows
# alongside the numeric statistics.
df.describe(include="all")

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
count,4622.0,4622.0,4622,3376,4622
unique,,,50,1043,78
top,,,Chicken Bowl,[Diet Coke],$8.75
freq,,,726,134,730
mean,927.254868,1.075725,,,
std,528.890796,0.410186,,,
min,1.0,1.0,,,
25%,477.25,1.0,,,
50%,926.0,1.0,,,
75%,1393.0,1.0,,,


### loc vs iloc

In [28]:
# Preview the first five rows before the selection examples.
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [29]:
# Label-based selection: rows with quantity of at least 5 OR any
# 'Nantucket Nectar' line, restricted to three columns.
df.loc[
    df['quantity'].ge(5) | df['item_name'].eq('Nantucket Nectar'),
    ['order_id', 'quantity', 'item_price'],
]

Unnamed: 0,order_id,quantity,item_price
2,1,1,$3.39
22,11,1,$3.39
105,46,1,$3.39
173,77,1,$3.39
205,91,1,$3.39
436,189,1,$3.39
601,247,2,$6.78
925,381,1,$3.39
1356,553,1,$3.39
1585,641,1,$3.39


In [30]:
# Position-based lookup of the row at integer position 9; passing a list ([9])
# returns a one-row DataFrame rather than a Series.
df.iloc[[9]]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [31]:
# Position-based slice: rows at positions 3 through 10 (end is exclusive),
# every column except the last one.
df.iloc[slice(3, 11), slice(None, -1)]

Unnamed: 0,order_id,quantity,item_name,choice_description
3,1,1,Chips and Tomatillo-Green Chili Salsa,
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans..."
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou..."
6,3,1,Side of Chips,
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables..."
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch..."
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto..."
10,5,1,Chips and Guacamole,


# Data Manipulation

In [32]:
# item_price holds strings such as '$2.39', so pandas reports the generic
# object dtype ('O') rather than a numeric one.
df['item_price'].dtype

dtype('O')

### Apply

In [33]:
# Convert item_price from '$x.xx' strings to floats using vectorised string ops.
# Casting to str first keeps this cell idempotent: re-running it on an
# already-converted float column no longer raises AttributeError
# (float has no .replace), it simply reproduces the same floats.
df['item_price'] = (
    df['item_price']
    .astype(str)
    .str.replace('$', '', regex=False)  # literal '$', not the regex end-anchor
    .astype(float)
)

In [34]:
# Check the conversion: item_price should now show plain numbers without '$'.
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [35]:
# item_price is already the total for the order line, not a unit price:
# e.g. a quantity-2 Nantucket Nectar line is priced 6.78 versus 3.39 for
# quantity 1. Multiplying by quantity again would double-count every
# multi-quantity line and inflate the revenue figure.
df['total_price'] = df['item_price']

# Total revenue across all order lines.
df['total_price'].sum()

### Which was the most-ordered item?

## Group By

In [36]:
# Total quantity ordered per item; `c` holds the unsorted per-item totals.
c = df["quantity"].groupby(df["item_name"]).sum()
# Show the totals with the most-ordered item first.
c.sort_values(ascending=False)

item_name
Chicken Bowl                             761
Chicken Burrito                          591
Chips and Guacamole                      506
Steak Burrito                            386
Canned Soft Drink                        351
Chips                                    230
Steak Bowl                               221
Bottled Water                            211
Chips and Fresh Tomato Salsa             130
Canned Soda                              126
Chicken Salad Bowl                       123
Chicken Soft Tacos                       120
Side of Chips                            110
Veggie Burrito                            97
Barbacoa Burrito                          91
Veggie Bowl                               87
Carnitas Bowl                             71
Barbacoa Bowl                             66
Carnitas Burrito                          60
Steak Soft Tacos                          56
6 Pack Soft Drink                         55
Chips and Tomatillo Red Chili Salsa       50


In [37]:
# Number of distinct items: value_counts() yields one row per unique
# item_name, so counting those rows gives the number of unique values.
(
    df["item_name"]
    .value_counts()
    .count()
)

50

In [38]:
# Direct count of distinct item names.
df['item_name'].nunique()

50

In [39]:
# NOTE(review): scratch cell unrelated to the analysis above - candidate for removal.
print(1 + 1)

2
