In [1]:
import pandas as pd

#### Series
A pandas series is a one-dimensional data structure that comprises of a key-value pair. It is similar to a python dictionary, except it provides more freedom to manipulate and edit the data.

To initialize a series, use pd.Series():

In [31]:
factors_of_12 = pd.Series([1,2,4,6,12], name = "factors of 12")

print("FACTORS OF 12 SERIES")
print(factors_of_12, "\n")

FACTORS OF 12 SERIES
0     1
1     2
2     4
3     6
4    12
Name: factors of 12, dtype: int64 



In [30]:
#FLOAT SERIES
temperature = pd.Series([32.6, 34.1, 28.0, 35.9], index = ["a","b","c","d"])

print("TEMPERATURE IN CELSIUS")
print(temperature, "\n")

TEMPERATURE IN CELSIUS
a    32.6
b    34.1
c    28.0
d    35.9
dtype: float64 



In [41]:
fruits = pd.Series(["apples", "oranges", "bananas"])
temperature = pd.Series([32.6, 34.1, 28.0, 35.9], index = ["a","b","c","d"])

##### QUERY #####

#USING INDEX
print ("2nd fruit: ", fruits.loc[1])    # iloc: location at integer key=1  
#OR
print ("2nd fruit: ", fruits[1], "\n")

#USING KEY
print ("temperature at key \"b\": ", temperature.loc["b"])    # loc: location at key='b'

2nd fruit:  oranges
2nd fruit:  oranges 

temperature at key "b":  34.1


#### Dataframe
A pandas dataframe is a two-dimensional data-structure that can be thought of as a spreadsheet. A dataframe can also be thought of as a combination of two or more series.

To initialize a dataframe, use pd.DataFrame:

In [47]:
fruits_jack = ["apples", "oranges", "bananas"]
fruits_john = ["guavas", "kiwis", "strawberries"]
index = ["a", "b", "c"]
all_fruits = {"Jack's": fruits_jack, "John's": fruits_john}

fruits = pd.DataFrame(all_fruits, index = index)
print(fruits,"\n")

new_fruits = fruits.reset_index(drop = True)
print(new_fruits)
#The indexes can be changed back to conventional integer indexes using .reset_index() method. 
#The drop = True argument is passed to drop the alphabetical index column; otherwise, the alphabetical index column is added 
#to the dataframe as a new column by default.

    Jack's        John's
a   apples        guavas
b  oranges         kiwis
c  bananas  strawberries 

  index   Jack's        John's
0     a   apples        guavas
1     b  oranges         kiwis
2     c  bananas  strawberries


In [64]:
df = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", sep='\t')

In [65]:
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


In [5]:
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [18]:
df.dtypes == 'object'

order_id              False
quantity              False
item_name              True
choice_description     True
item_price             True
dtype: bool

In [25]:
df.dtypes[df.dtypes == 'object'].index

Index(['item_name', 'choice_description', 'item_price'], dtype='object')

In [12]:
m = df.dtypes[df.dtypes == 'object'].index
df[m].describe()

Unnamed: 0,item_name,choice_description,item_price
count,4622,3376,4622
unique,50,1043,78
top,Chicken Bowl,[Diet Coke],$8.75
freq,726,134,730


In [13]:
df.item_name.value_counts()

Chicken Bowl                             726
Chicken Burrito                          553
Chips and Guacamole                      479
Steak Burrito                            368
Canned Soft Drink                        301
Steak Bowl                               211
Chips                                    211
Bottled Water                            162
Chicken Soft Tacos                       115
Chips and Fresh Tomato Salsa             110
Chicken Salad Bowl                       110
Canned Soda                              104
Side of Chips                            101
Veggie Burrito                            95
Barbacoa Burrito                          91
Veggie Bowl                               85
Carnitas Bowl                             68
Barbacoa Bowl                             66
Carnitas Burrito                          59
Steak Soft Tacos                          55
6 Pack Soft Drink                         54
Chips and Tomatillo Red Chili Salsa       48
Chicken Cr

In [14]:
df.describe()

Unnamed: 0,order_id,quantity
count,4622.0,4622.0
mean,927.254868,1.075725
std,528.890796,0.410186
min,1.0,1.0
25%,477.25,1.0
50%,926.0,1.0
75%,1393.0,1.0
max,1834.0,15.0


In [27]:
# single square bracket will give you series while double square bracket gives you DataFrame.
df[['order_id','quantity','item_name']]

Unnamed: 0,order_id,quantity,item_name
0,1,1,Chips and Fresh Tomato Salsa
1,1,1,Izze
2,1,1,Nantucket Nectar
3,1,1,Chips and Tomatillo-Green Chili Salsa
4,2,2,Chicken Bowl
...,...,...,...
4617,1833,1,Steak Burrito
4618,1833,1,Steak Burrito
4619,1834,1,Chicken Salad Bowl
4620,1834,1,Chicken Salad Bowl


In [28]:
df.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

In [51]:
sorted(df["item_name"])[5:3000:500]    # sorted in alphabetical order

['6 Pack Soft Drink',
 'Canned Soda',
 'Carnitas Soft Tacos',
 'Chicken Bowl',
 'Chicken Burrito',
 'Chicken Soft Tacos']

In [50]:
len(df['item_name'])

4622

In [53]:
df['choice_description'].describe()

count            3376
unique           1043
top       [Diet Coke]
freq              134
Name: choice_description, dtype: object

In [58]:
df[['item_price']][:3000:400]

Unnamed: 0,item_price
0,$2.39
400,$11.25
800,$10.98
1200,$9.25
1600,$4.30
2000,$9.25
2400,$9.25
2800,$9.25


In [74]:
df.describe(include = "all")

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1
count,4622.0,4622.0,4622,3376,4622,4622,4622
unique,,,50,1043,78,1,50
top,,,Chicken Bowl,[Diet Coke],$8.75,irfan,Chicken Bowlirfan
freq,,,726,134,730,4622,726
mean,927.254868,1.075725,,,,,
std,528.890796,0.410186,,,,,
min,1.0,1.0,,,,,
25%,477.25,1.0,,,,,
50%,926.0,1.0,,,,,
75%,1393.0,1.0,,,,,


In [75]:
df["mohd"] = "irfan"
df.head(3)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,Chips and Fresh Tomato Salsairfan
1,1,1,Izze,[Clementine],$3.39,irfan,Izzeirfan
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,Nantucket Nectarirfan


In [76]:
df["col1"] = df["item_name"] + df["mohd"]
df.head(3)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,Chips and Fresh Tomato Salsairfan
1,1,1,Izze,[Clementine],$3.39,irfan,Izzeirfan
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,Nantucket Nectarirfan


In [73]:
type(df['item_price'][0])

str

In [80]:
cat = df.dtypes[df.dtypes == object].index
cat

Index(['item_name', 'choice_description', 'item_price', 'mohd', 'col1'], dtype='object')

In [82]:
ctg = pd.Categorical(df['choice_description'])
type(ctg)

pandas.core.arrays.categorical.Categorical

In [85]:
df['item_name'].unique()

array(['Chips and Fresh Tomato Salsa', 'Izze', 'Nantucket Nectar',
       'Chips and Tomatillo-Green Chili Salsa', 'Chicken Bowl',
       'Side of Chips', 'Steak Burrito', 'Steak Soft Tacos',
       'Chips and Guacamole', 'Chicken Crispy Tacos',
       'Chicken Soft Tacos', 'Chicken Burrito', 'Canned Soda',
       'Barbacoa Burrito', 'Carnitas Burrito', 'Carnitas Bowl',
       'Bottled Water', 'Chips and Tomatillo Green Chili Salsa',
       'Barbacoa Bowl', 'Chips', 'Chicken Salad Bowl', 'Steak Bowl',
       'Barbacoa Soft Tacos', 'Veggie Burrito', 'Veggie Bowl',
       'Steak Crispy Tacos', 'Chips and Tomatillo Red Chili Salsa',
       'Barbacoa Crispy Tacos', 'Veggie Salad Bowl',
       'Chips and Roasted Chili-Corn Salsa',
       'Chips and Roasted Chili Corn Salsa', 'Carnitas Soft Tacos',
       'Chicken Salad', 'Canned Soft Drink', 'Steak Salad Bowl',
       '6 Pack Soft Drink', 'Chips and Tomatillo-Red Chili Salsa', 'Bowl',
       'Burrito', 'Crispy Tacos', 'Carnitas Crispy Tacos

In [86]:
# Any column where we want to implement order, we can convert it to categorical variable and define the order.
pd.Categorical(["high", "higher", "high","highest","higher","high"], categories=["high", "higher","highest"], ordered=True)

['high', 'higher', 'high', 'highest', 'higher', 'high']
Categories (3, object): ['high' < 'higher' < 'highest']

In [88]:
df['quantity'].unique()
# OR
#df.quantity.unique()

array([ 1,  2,  3,  4,  5, 15,  7,  8, 10], dtype=int64)

In [94]:
cat_col = pd.Categorical(df['quantity'])
cat_col

[1, 1, 1, 1, 2, ..., 1, 1, 1, 1, 1]
Length: 4622
Categories (9, int64): [1, 2, 3, 4, ..., 7, 8, 10, 15]

In [101]:
df['Category'] = cat_col
df[0:2000:30]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1,Category
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,Chips and Fresh Tomato Salsairfan,1
30,15,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,Chips and Tomatillo-Green Chili Salsairfan,1
60,28,1,Chips and Guacamole,,$4.45,irfan,Chips and Guacamoleirfan,1
90,40,1,Steak Bowl,"[Fresh Tomato Salsa, [Rice, Black Beans, Chees...",$11.75,irfan,Steak Bowlirfan,1
120,53,1,Chicken Burrito,"[Tomatillo Red Chili Salsa, [Rice, Black Beans...",$8.75,irfan,Chicken Burritoirfan,1
...,...,...,...,...,...,...,...,...
1860,754,1,6 Pack Soft Drink,[Diet Coke],$6.49,irfan,6 Pack Soft Drinkirfan,1
1890,763,1,Steak Bowl,"[Fresh Tomato Salsa, Rice]",$9.25,irfan,Steak Bowlirfan,1
1920,776,1,Chicken Bowl,"[Tomatillo Green Chili Salsa, [Rice, Black Bea...",$8.75,irfan,Chicken Bowlirfan,1
1950,789,1,Steak Burrito,"[Fresh Tomato Salsa (Mild), [Pinto Beans, Rice...",$8.99,irfan,Steak Burritoirfan,1


In [92]:
df['quantity'].value_counts()

1     4355
2      224
3       28
4       10
8        1
5        1
10       1
7        1
15       1
Name: quantity, dtype: int64

In [103]:
df.isnull().sum()

order_id                 0
quantity                 0
item_name                0
choice_description    1246
item_price               0
mohd                     0
col1                     0
Category                 0
dtype: int64

In [105]:
df.notnull().sum()

order_id              4622
quantity              4622
item_name             4622
choice_description    3376
item_price            4622
mohd                  4622
col1                  4622
Category              4622
dtype: int64

In [107]:
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1,Category
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,Chips and Fresh Tomato Salsairfan,1
1,1,1,Izze,[Clementine],$3.39,irfan,Izzeirfan,1
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,Nantucket Nectarirfan,1
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,Chips and Tomatillo-Green Chili Salsairfan,1
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,Chicken Bowlirfan,2
...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,Steak Burritoirfan,1
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,Steak Burritoirfan,1
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,Chicken Salad Bowlirfan,1
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,Chicken Salad Bowlirfan,1


In [112]:
# Dropping rows
df.drop([0,2,5])

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1,Category
1,1,1,Izze,[Clementine],$3.39,irfan,Izzeirfan,1
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,Chips and Tomatillo-Green Chili Salsairfan,1
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,Chicken Bowlirfan,2
6,3,1,Side of Chips,,$1.69,irfan,Side of Chipsirfan,1
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75,irfan,Steak Burritoirfan,1
...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,Steak Burritoirfan,1
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,Steak Burritoirfan,1
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,Chicken Salad Bowlirfan,1
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,Chicken Salad Bowlirfan,1


In [114]:
#Dropping columns  (axis = 0 for rows and axis=1 for colummns)
df.drop(['col1'], axis=1)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,1
1,1,1,Izze,[Clementine],$3.39,irfan,1
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,1
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,1
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,2
...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,1
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,1
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,1
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,1


In [None]:
The above drop operations will not affect the original dataframe df. To apply these changes in our original dataframe we
need to give another parameter in the drop, inplace = True.

In [118]:
cho = df['choice_description'][df['choice_description'].isnull()].index

In [121]:
df.iloc[cho]   #iloc: index location, cho is list of row index of missing values

#iloc accepts 2 parameters row index and column index. If we don't give second parameter i.e. column index or use : then it 
#gives all the columns.

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,col1,Category
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,Chips and Fresh Tomato Salsairfan,1
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,Chips and Tomatillo-Green Chili Salsairfan,1
6,3,1,Side of Chips,,$1.69,irfan,Side of Chipsirfan,1
10,5,1,Chips and Guacamole,,$4.45,irfan,Chips and Guacamoleirfan,1
14,7,1,Chips and Guacamole,,$4.45,irfan,Chips and Guacamoleirfan,1
...,...,...,...,...,...,...,...,...
4600,1827,1,Chips and Guacamole,,$4.45,irfan,Chips and Guacamoleirfan,1
4605,1828,1,Chips and Guacamole,,$4.45,irfan,Chips and Guacamoleirfan,1
4613,1831,1,Chips,,$2.15,irfan,Chipsirfan,1
4614,1831,1,Bottled Water,,$1.50,irfan,Bottled Waterirfan,1


In [122]:
df.iloc[cho,:4]  # gives columns 0 to 3.

Unnamed: 0,order_id,quantity,item_name,choice_description
0,1,1,Chips and Fresh Tomato Salsa,
3,1,1,Chips and Tomatillo-Green Chili Salsa,
6,3,1,Side of Chips,
10,5,1,Chips and Guacamole,
14,7,1,Chips and Guacamole,
...,...,...,...,...
4600,1827,1,Chips and Guacamole,
4605,1828,1,Chips and Guacamole,
4613,1831,1,Chips,
4614,1831,1,Bottled Water,


In [123]:
df.iloc[0:100,0:3]   # first 100 rows and 3 columns
# both row & column indexes in form of integer

Unnamed: 0,order_id,quantity,item_name
0,1,1,Chips and Fresh Tomato Salsa
1,1,1,Izze
2,1,1,Nantucket Nectar
3,1,1,Chips and Tomatillo-Green Chili Salsa
4,2,2,Chicken Bowl
...,...,...,...
95,42,1,Barbacoa Bowl
96,42,1,Chips and Guacamole
97,43,1,Carnitas Bowl
98,43,1,Chicken Burrito


In [127]:
df.drop(['col1'], axis=1, inplace=True)  # permanently drops column col1.

In [131]:
df.loc[0:5,('order_id','quantity','item_name')] # here column name cannot be integer, it should be actual names as present.

Unnamed: 0,order_id,quantity,item_name
0,1,1,Chips and Fresh Tomato Salsa
1,1,1,Izze
2,1,1,Nantucket Nectar
3,1,1,Chips and Tomatillo-Green Chili Salsa
4,2,2,Chicken Bowl
5,3,1,Chicken Bowl


In [133]:
df.loc[(1,3,4),('item_name', 'order_id')]  # it doesn't accepts one column name in parentheses.

Unnamed: 0,item_name,order_id
1,Izze,1
3,Chips and Tomatillo-Green Chili Salsa,1
4,Chicken Bowl,2


In [135]:
df.loc[(3,45,677,788),['item_name']]  # in square bracket it also accepts only one column name

Unnamed: 0,item_name
3,Chips and Tomatillo-Green Chili Salsa
45,Chicken Burrito
677,Veggie Bowl
788,Canned Soft Drink


In [136]:
# Let's change the row indexes.
df.index = df.item_name

In [137]:
df

Unnamed: 0_level_0,order_id,quantity,item_name,choice_description,item_price,mohd,Category
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Chips and Fresh Tomato Salsa,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,1
Izze,1,1,Izze,[Clementine],$3.39,irfan,1
Nantucket Nectar,1,1,Nantucket Nectar,[Apple],$3.39,irfan,1
Chips and Tomatillo-Green Chili Salsa,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,1
Chicken Bowl,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,2
...,...,...,...,...,...,...,...
Steak Burrito,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,1
Steak Burrito,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,1
Chicken Salad Bowl,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,1
Chicken Salad Bowl,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,1


In [142]:
df.loc[('Izze','Steak Burrito'),('order_id','item_price')]

Unnamed: 0_level_0,order_id,item_price
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Izze,1,$3.39
Izze,12,$3.39
Izze,21,$3.39
Izze,30,$3.39
Izze,155,$3.39
...,...,...
Steak Burrito,1807,$9.25
Steak Burrito,1829,$11.75
Steak Burrito,1830,$11.75
Steak Burrito,1833,$11.75


In [141]:
df.loc[['Izze','Steak Burrito'],['order_id']]

Unnamed: 0_level_0,order_id
item_name,Unnamed: 1_level_1
Izze,1
Izze,12
Izze,21
Izze,30
Izze,155
...,...
Steak Burrito,1807
Steak Burrito,1829
Steak Burrito,1830
Steak Burrito,1833


In [143]:
# If we want to reset our indexes back to integers.
df.reset_index(drop=True, inplace=True)

In [144]:
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,1
1,1,1,Izze,[Clementine],$3.39,irfan,1
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,1
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,1
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,2
...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,1
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,1
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,1
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,1


In [145]:
df.loc[[i for i in range(0,10)],['item_name','item_price','Category']]

Unnamed: 0,item_name,item_price,Category
0,Chips and Fresh Tomato Salsa,$2.39,1
1,Izze,$3.39,1
2,Nantucket Nectar,$3.39,1
3,Chips and Tomatillo-Green Chili Salsa,$2.39,1
4,Chicken Bowl,$16.98,2
5,Chicken Bowl,$10.98,1
6,Side of Chips,$1.69,1
7,Steak Burrito,$11.75,1
8,Steak Soft Tacos,$9.25,1
9,Steak Burrito,$9.25,1


In [164]:
df.loc[df.index[0:5], df.columns[0:5]]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [147]:
# Let's generate a new column
# But the length of the lsit should be same as original column length

df['square'] = [i**2 for i in range(0,4622)]

In [148]:
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category,square
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,1,0
1,1,1,Izze,[Clementine],$3.39,irfan,1,1
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,1,4
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,1,9
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,2,16
...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,1,21316689
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,1,21325924
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,1,21335161
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,1,21344400


In [149]:
#Filling missing values
# Fill all missing values of column choice_description with "Pizza"
df[['choice_description']].fillna('Pizza')

Unnamed: 0,choice_description
0,Pizza
1,[Clementine]
2,[Apple]
3,Pizza
4,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans..."
...,...
4617,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ..."
4618,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese..."
4619,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto..."
4620,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu..."


In [151]:
df[['choice_description']].fillna(min(df.choice_description))

TypeError: '<' not supported between instances of 'str' and 'float'

In [160]:
df['Category'][df['Category'].isnull() == True].index

Int64Index([], dtype='int64')

In [163]:
df.item_price.count()

4622

In [172]:
# Replace
df.replace({'$2.39' : '2.39'})

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category,square
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,irfan,1,0
1,1,1,Izze,[Clementine],$3.39,irfan,1,1
2,1,1,Nantucket Nectar,[Apple],$3.39,irfan,1,4
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,irfan,1,9
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,irfan,2,16
...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75,irfan,1,21316689
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75,irfan,1,21325924
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25,irfan,1,21335161
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75,irfan,1,21344400


##### Let's find the total cost of each item by multiplying quantity with item_price. 
- First we need to remove all the $ symbols from item_price column.
- Then convert the string data type to float.
- Multiply columns quantity & item_price and assign a new column 'total_cost'.

In [179]:
# Removing $ symbol using apply function and lambda function.
df['item_price'] = df.item_price.apply(lambda x: x[1:])

In [186]:
# Type-casting from str to float.
df['item_price'] = df['item_price'].astype(float)

In [188]:
# Checking
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category,square
0,1,1,Chips and Fresh Tomato Salsa,,2.39,irfan,1,0
1,1,1,Izze,[Clementine],3.39,irfan,1,1
2,1,1,Nantucket Nectar,[Apple],3.39,irfan,1,4
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,irfan,1,9
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,irfan,2,16
...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75,irfan,1,21316689
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75,irfan,1,21325924
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25,irfan,1,21335161
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75,irfan,1,21344400


In [192]:
# Creating a new column by multiplying.
df['total_cost'] = df['quantity'] * df['item_price']

In [193]:
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,mohd,Category,square,total_cost
0,1,1,Chips and Fresh Tomato Salsa,,2.39,irfan,1,0,2.39
1,1,1,Izze,[Clementine],3.39,irfan,1,1,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,irfan,1,4,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,irfan,1,9,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,irfan,2,16,33.96
...,...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75,irfan,1,21316689,11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75,irfan,1,21325924,11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25,irfan,1,21335161,11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75,irfan,1,21344400,8.75


In [194]:
# Rename column names.
df.rename(columns={'item_name':'items','item_price':'price'})

Unnamed: 0,order_id,quantity,items,choice_description,price,mohd,Category,square,total_cost
0,1,1,Chips and Fresh Tomato Salsa,,2.39,irfan,1,0,2.39
1,1,1,Izze,[Clementine],3.39,irfan,1,1,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,irfan,1,4,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,irfan,1,9,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,irfan,2,16,33.96
...,...,...,...,...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75,irfan,1,21316689,11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75,irfan,1,21325924,11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25,irfan,1,21335161,11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75,irfan,1,21344400,8.75
