## Pandas

- Pandas is a high level data manipulation tool.
- It is built on the NumPy package, and its key data structure is called the DataFrame.
- Data frames allow you to store and manipulate tabular data in rows of observations and columns of variables.

In [7]:
# Show the value of every top-level expression in a cell, not only the last one,
# so that a single cell can display the results of several statements.
# NOTE(review): np does not appear to be used in the cells visible here.

import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
# There are several ways to create a DataFrame. One way is to start from a dictionary
# that maps each column name to the list of values for that column.
# NOTE(review): the name `dict` shadows Python's built-in `dict` type; it is kept
# because a later cell reads it when building df1.

dict = {
    "country": ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
    "capital": ['Brasilia', 'Moscow', 'New Delhi', 'Beijing', 'Pretoria'],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98],
}
print(dict)

{'country': ['Brazil', 'Russia', 'India', 'China', 'South Africa'], 'capital': ['Brasilia', 'Moscow', 'New Delhi', 'Beijing', 'Pretoria'], 'area': [8.516, 17.1, 3.286, 9.597, 1.221], 'population': [200.4, 143.5, 1252, 1357, 52.98]}


In [2]:
# A second column-name -> values dictionary with the same four columns,
# used to build another DataFrame.
dict2 = {
    "country": ['USA', 'Singapore'],
    "capital": ['Washington', 'Singapore'],
    "area": [12, 0.643],
    "population": [400, 1.25],
}
print(dict2)

{'country': ['USA', 'Singapore'], 'capital': ['Washington', 'Singapore'], 'area': [12, 0.643], 'population': [400, 1.25]}


In [4]:
import pandas
# Build a DataFrame from the dictionary: each key becomes a column label and
# the rows receive a default integer index starting at 0.
df2 = pandas.DataFrame(dict2)
df2

Unnamed: 0,country,capital,area,population
0,USA,Washington,12.0,400.0
1,Singapore,Singapore,0.643,1.25


In [7]:
df1 = pandas.DataFrame(dict)
df1

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [42]:
# By default a DataFrame gets an integer index (0 to 4 here); as a bare expression
# in the cell, df1 is rendered with the notebook's HTML table formatting.
# NOTE(review): the saved output shows country-code labels instead, which suggests this
# cell was last executed after df1.index was reassigned in a later cell. Re-run the
# notebook top to bottom to get output that matches the description.
df1

Unnamed: 0,country,capital,area,population
BZL,Brazil,Brasilia,8.516,200.4
RS,Russia,Moscow,17.1,143.5
IND,India,New Delhi,3.286,1252.0
CHN,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [8]:
frames = [df1, df2]

In [10]:
# Stack the frames row-wise. Each frame keeps its own row labels, so labels 0 and 1
# appear twice in the result; pass ignore_index=True to pandas.concat for a fresh 0..n-1 index.
new_df = pandas.concat(frames)

In [11]:
new_df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98
0,USA,Washington,12.0,400.0
1,Singapore,Singapore,0.643,1.25


In [12]:
type(df1)

pandas.core.frame.DataFrame

In [13]:
df1.index # extracting the range of the index

RangeIndex(start=0, stop=5, step=1)

In [12]:
new_index = ['BZL','RS','IND','CHN','SA'] # country codes to use as row labels, one per row of df1

df1.index = new_index # replace the default integer index with the country codes

df1.index

Index(['BZL', 'RS', 'IND', 'CHN', 'SA'], dtype='object')

In [13]:
df1

Unnamed: 0,country,capital,area,population
BZL,Brazil,Brasilia,8.516,200.4
RS,Russia,Moscow,17.1,143.5
IND,India,New Delhi,3.286,1252.0
CHN,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


### Load data from a file

In [19]:
cars_df = pandas.read_csv(r'car_data.csv') # pass the path as raw string

In [16]:
# Equivalent alternative if pandas is imported with the conventional alias (import pandas as pd):
# cars_df = pd.read_csv('car_data.csv')

In [15]:
cars_df

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,13495
1,2,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
2,3,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,ohcv,six,152,mpfi,9.0,154,5000,19,26,16500
3,4,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950
4,5,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,ohc,five,136,mpfi,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,23,28,16845
201,202,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,8.7,160,5300,19,25,19045
202,203,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohcv,six,173,mpfi,8.8,134,5500,18,23,21485
203,204,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,six,145,idi,23.0,106,4800,26,27,22470


### Data frame properties

In [20]:
cars_df.shape

(205, 23)

In [21]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         205 non-null    int64  
 1   make               205 non-null    object 
 2   fuel_type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num_of_doors       205 non-null    object 
 5   body_style         205 non-null    object 
 6   drive_wheels       205 non-null    object 
 7   engine_location    205 non-null    object 
 8   wheel_base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb_weight        205 non-null    int64  
 13  engine_type        205 non-null    object 
 14  num_of_cylinders   205 non-null    object 
 15  engine_size        205 non-null    int64  
 16  fuel_system        205 non

In [26]:
# head() displays the first 5 rows by default; pass a number to head() to get that many rows instead.
# NOTE(review): the saved output lists rows 200-204 (the last five rows), which is not what
# head() returns. The cell output looks stale; re-run the notebook in order.
cars_df.head()

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
200,201,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,23,28,16845
201,202,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,8.7,160,5300,19,25,19045
202,203,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohcv,six,173,mpfi,8.8,134,5500,18,23,21485
203,204,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,six,145,idi,23.0,106,4800,26,27,22470
204,205,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,19,25,22625


In [14]:
cars_df.tail(3)

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
202,203,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohcv,six,173,mpfi,8.8,134,5500,18,23,21485
203,204,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,six,145,idi,23.0,106,4800,26,27,22470
204,205,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,19,25,22625


In [15]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         205 non-null    int64  
 1   make               205 non-null    object 
 2   fuel_type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num_of_doors       205 non-null    object 
 5   body_style         205 non-null    object 
 6   drive_wheels       205 non-null    object 
 7   engine_location    205 non-null    object 
 8   wheel_base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb_weight        205 non-null    int64  
 13  engine_type        205 non-null    object 
 14  num_of_cylinders   205 non-null    object 
 15  engine_size        205 non-null    int64  
 16  fuel_system        205 non

In [16]:
# access columns

cars_df.columns

Index(['Unnamed: 0', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'price'],
      dtype='object')

In [27]:
cars_df.describe() # summarises only the numeric columns; object (i.e. text) columns are not considered

Unnamed: 0.1,Unnamed: 0,wheel_base,length,width,height,curb_weight,engine_size,compression_ratio,city_mpg,highway_mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,59.322565,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,1.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,52.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,103.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,154.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,205.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [18]:
cars_df.dtypes # access dtype properties

Unnamed: 0             int64
make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
price                 object
dtype: object

### Accessing the dataframe content : Indexing and selecting data



- `DataFrame[ ]` : [ ] indexing operator
- `DataFrame.loc[ ]` : used for labels
- `DataFrame.iloc[ ]` : used for integer position-based selection

### Using Data Frame [  ]

In [32]:
# Access a single column of the DataFrame

cars_df['make'].head() # single brackets with one column name return a pandas Series

0    alfa-romero
1    alfa-romero
2    alfa-romero
3           audi
4           audi
Name: make, dtype: object

In [20]:
cars_df[['make']].head() # double brackets (a list of column names) return a pandas DataFrame

Unnamed: 0,make
0,alfa-romero
1,alfa-romero
2,alfa-romero
3,audi
4,audi


In [21]:
cars_df[['make','fuel_type','aspiration','num_of_doors']].head()

Unnamed: 0,make,fuel_type,aspiration,num_of_doors
0,alfa-romero,gas,std,two
1,alfa-romero,gas,std,two
2,alfa-romero,gas,std,two
3,audi,gas,std,four
4,audi,gas,std,four


In [22]:
# get the make column using attribute-style access

cars_df.make

0      alfa-romero
1      alfa-romero
2      alfa-romero
3             audi
4             audi
          ...     
200          volvo
201          volvo
202          volvo
203          volvo
204          volvo
Name: make, Length: 205, dtype: object

In [29]:
# get all unique values from make column

cars_df.make.unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
       'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',
       'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

### Using DataFrame.loc[  ]


In [33]:
df1.loc['IND']

country           India
capital       New Delhi
area              3.286
population         1252
Name: IND, dtype: object

In [31]:
# To select multiple rows, put all the row labels in a list. All columns are returned because no column selector is given.

df1.loc[['IND','SA']]



Unnamed: 0,country,capital,area,population
IND,India,New Delhi,3.286,1252.0
SA,South Africa,Pretoria,1.221,52.98


## Note: either the method above or the one below can be used

In [32]:
# Selecting specific rows together with a column selector. When both selectors are passed, the columns must be stated explicitly:

df1.loc[['IND','SA'],['country','capital','area','population']] # either list every column by name ...

df1.loc[['IND','SA'] ,:] # ... or use ':' to take all columns

Unnamed: 0,country,capital,area,population
IND,India,New Delhi,3.286,1252.0
SA,South Africa,Pretoria,1.221,52.98


Unnamed: 0,country,capital,area,population
IND,India,New Delhi,3.286,1252.0
SA,South Africa,Pretoria,1.221,52.98


### Using DataFrame.iloc[  ]

In [36]:
cars_df.iloc[[200]]

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
200,201,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,23,28,16845


In [34]:
cars_df.iloc[[1,3,5,7]]

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
1,2,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
3,4,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950
5,6,audi,gas,std,two,sedan,fwd,front,99.8,177.3,...,ohc,five,136,mpfi,8.5,110,5500,19,25,15250
7,8,audi,gas,std,four,wagon,fwd,front,105.8,192.7,...,ohc,five,136,mpfi,8.5,110,5500,19,25,18920


In [35]:
cars_df.iloc[[4,6],[2,9]]

Unnamed: 0,fuel_type,length
4,gas,176.6
6,gas,192.7


In [36]:
cars_df.iloc[:, [5,6]]  

Unnamed: 0,body_style,drive_wheels
0,convertible,rwd
1,convertible,rwd
2,hatchback,rwd
3,sedan,fwd
4,sedan,4wd
...,...,...
200,sedan,rwd
201,sedan,rwd
202,sedan,rwd
203,sedan,rwd


In [37]:
cars_df.iloc[1:4,2:4] # using slices: rows at positions 1-3 and columns at positions 2-3 (the end position is excluded)

Unnamed: 0,fuel_type,aspiration
1,gas,std
2,gas,std
3,gas,std


In [38]:
# Use a list to pick specific positions.

# For a contiguous run of positions a slice is enough; no list is needed.

# iloc[: , :] ------------- slices: all rows, all columns
# iloc[1:4 , 2:4] ----------------- also slices: rows 1-3, columns 2-3
# iloc[[2,3] , [2,6]] ----------------- lists: exactly the listed rows and columns
# iloc[: , [5,6]] ------------------ all rows via a slice (no list needed),
#                       ... and only the listed columns, hence a list for the columns

In [39]:
# Boolean mask: True for every row whose manufacturer is audi, False otherwise.
# Passing this mask to cars_df[...] keeps only the True rows.
cars_df.make == 'audi'



0      False
1      False
2      False
3       True
4       True
       ...  
200    False
201    False
202    False
203    False
204    False
Name: make, Length: 205, dtype: bool

In [40]:
cars_df[cars_df.make == 'audi']  # df[df.col == 'audi']

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,4,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950
4,5,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,ohc,five,136,mpfi,8.0,115,5500,18,22,17450
5,6,audi,gas,std,two,sedan,fwd,front,99.8,177.3,...,ohc,five,136,mpfi,8.5,110,5500,19,25,15250
6,7,audi,gas,std,four,sedan,fwd,front,105.8,192.7,...,ohc,five,136,mpfi,8.5,110,5500,19,25,17710
7,8,audi,gas,std,four,wagon,fwd,front,105.8,192.7,...,ohc,five,136,mpfi,8.5,110,5500,19,25,18920
8,9,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,...,ohc,five,131,mpfi,8.3,140,5500,17,20,23875
9,10,audi,gas,turbo,two,hatchback,4wd,front,99.5,178.2,...,ohc,five,131,mpfi,7.0,160,5500,16,22,?


In [41]:
# display all rows where wheelbase is >115

cars_df[cars_df.wheel_base > 115]

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
70,71,mercedes-benz,diesel,turbo,four,sedan,rwd,front,115.6,202.6,...,ohc,five,183,idi,21.5,123,4350,22,25,31600
71,72,mercedes-benz,gas,std,four,sedan,rwd,front,115.6,202.6,...,ohcv,eight,234,mpfi,8.3,155,4750,16,18,34184
73,74,mercedes-benz,gas,std,four,sedan,rwd,front,120.9,208.1,...,ohcv,eight,308,mpfi,8.0,184,4500,14,16,40960


In [42]:
cars_df[(cars_df.aspiration == 'turbo') & (cars_df.num_of_cylinders == 'five')]

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
8,9,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,...,ohc,five,131,mpfi,8.3,140,5500,17,20,23875
9,10,audi,gas,turbo,two,hatchback,4wd,front,99.5,178.2,...,ohc,five,131,mpfi,7.0,160,5500,16,22,?
67,68,mercedes-benz,diesel,turbo,four,sedan,rwd,front,110.0,190.9,...,ohc,five,183,idi,21.5,123,4350,22,25,25552
68,69,mercedes-benz,diesel,turbo,four,wagon,rwd,front,110.0,190.9,...,ohc,five,183,idi,21.5,123,4350,22,25,28248
69,70,mercedes-benz,diesel,turbo,two,hardtop,rwd,front,106.7,187.5,...,ohc,five,183,idi,21.5,123,4350,22,25,28176
70,71,mercedes-benz,diesel,turbo,four,sedan,rwd,front,115.6,202.6,...,ohc,five,183,idi,21.5,123,4350,22,25,31600


In [43]:
cars_df[(cars_df.aspiration == 'turbo') & (cars_df.num_of_cylinders == 'five')].iloc[:,[2,5,7,9]]

Unnamed: 0,fuel_type,body_style,engine_location,length
8,gas,sedan,front,192.7
9,gas,hatchback,front,178.2
67,diesel,sedan,front,190.9
68,diesel,wagon,front,190.9
69,diesel,hardtop,front,187.5
70,diesel,sedan,front,202.6


In [44]:
# display all rows where the manufacturer is bmw or the wheel base is > 110

cars_df[(cars_df.make == 'bmw') | (cars_df.wheel_base > 110)]

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
10,11,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,...,ohc,four,108,mpfi,8.8,101,5800,23,29,16430
11,12,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,...,ohc,four,108,mpfi,8.8,101,5800,23,29,16925
12,13,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,...,ohc,six,164,mpfi,9.0,121,4250,21,28,20970
13,14,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,...,ohc,six,164,mpfi,9.0,121,4250,21,28,21105
14,15,bmw,gas,std,four,sedan,rwd,front,103.5,189.0,...,ohc,six,164,mpfi,9.0,121,4250,20,25,24565
15,16,bmw,gas,std,four,sedan,rwd,front,103.5,189.0,...,ohc,six,209,mpfi,8.0,182,5400,16,22,30760
16,17,bmw,gas,std,two,sedan,rwd,front,103.5,193.8,...,ohc,six,209,mpfi,8.0,182,5400,16,22,41315
17,18,bmw,gas,std,four,sedan,rwd,front,110.0,197.0,...,ohc,six,209,mpfi,8.0,182,5400,15,20,36880
47,48,jaguar,gas,std,four,sedan,rwd,front,113.0,199.6,...,dohc,six,258,mpfi,8.1,176,4750,15,19,32250
48,49,jaguar,gas,std,four,sedan,rwd,front,113.0,199.6,...,dohc,six,258,mpfi,8.1,176,4750,15,19,35550


### Few more operations on Data frames

In [45]:
#Renaming columns

cars_df.columns

Index(['Unnamed: 0', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'price'],
      dtype='object')

In [46]:
# Rename the leftover 'Unnamed: 0' column and the price column; cars_df is modified in place.
column_renames = {'Unnamed: 0': 'Index', 'price': 'model_price'}
cars_df.rename(columns=column_renames, inplace=True)

In [47]:
cars_df.columns

Index(['Index', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'model_price'],
      dtype='object')

In [38]:
# Create a backup copy of the DataFrame (an independent in-memory copy, not a file).
# NOTE(review): the copy is stored as an attribute named `bk` on cars_df rather than in
# its own variable. The text fragment in this cell's saved output looks like the tail of
# a warning raised by this assignment. A separate name (e.g. cars_df_bk) would be cleaner,
# but later cells read cars_df.bk, so they would all have to change together.

cars_df.bk = cars_df.copy()

  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
cars_df.bk

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,13495
1,2,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
2,3,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,ohcv,six,152,mpfi,9.0,154,5000,19,26,16500
3,4,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950
4,5,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,ohc,five,136,mpfi,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,9.5,114,5400,23,28,16845
201,202,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,four,141,mpfi,8.7,160,5300,19,25,19045
202,203,volvo,gas,std,four,sedan,rwd,front,109.1,188.8,...,ohcv,six,173,mpfi,8.8,134,5500,18,23,21485
203,204,volvo,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,ohc,six,145,idi,23.0,106,4800,26,27,22470


In [50]:
# When there are only a few columns, all of them can be renamed at once by assigning
# a new list of names to .columns; first look at the current names.

df1.columns

Index(['country', 'capital', 'area', 'population'], dtype='object')

In [51]:
newcol = ['Country', 'Capital', 'Area', 'Population']

df1.columns = newcol

In [52]:
df1.columns

Index(['Country', 'Capital', 'Area', 'Population'], dtype='object')

In [53]:
# Drop the columns named in list_drop from cars_df, modifying it in place.

list_drop = ['Index']
cars_df.drop(columns=list_drop, inplace=True)


In [54]:
cars_df.columns

Index(['make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style',
       'drive_wheels', 'engine_location', 'wheel_base', 'length', 'width',
       'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'model_price'],
      dtype='object')

In [55]:
cars_df.bk.columns

Index(['Index', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'model_price'],
      dtype='object')

In [56]:
cars_df.head(4)

Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,model_price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,ohcv,six,152,mpfi,9.0,154,5000,19,26,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950


In [57]:
# Remove the rows labelled 2 and 3 from cars_df, modifying it in place.
cars_df.drop(index=[2, 3], inplace=True)

In [58]:
cars_df.head(4)

Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,model_price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,ohc,five,136,mpfi,8.0,115,5500,18,22,17450
5,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,...,ohc,five,136,mpfi,8.5,110,5500,19,25,15250


In [59]:
cars_sort = cars_df[['make', 'fuel_type', 'aspiration','wheel_base']]
cars_sort

Unnamed: 0,make,fuel_type,aspiration,wheel_base
0,alfa-romero,gas,std,88.6
1,alfa-romero,gas,std,88.6
4,audi,gas,std,99.4
5,audi,gas,std,99.8
6,audi,gas,std,105.8
...,...,...,...,...
200,volvo,gas,std,109.1
201,volvo,gas,turbo,109.1
202,volvo,gas,std,109.1
203,volvo,diesel,turbo,109.1


In [60]:
# Sort by make, then fuel_type, aspiration and wheel_base, all ascending. A new sorted
# frame is returned for display; cars_sort itself is left unchanged.
cars_sort.sort_values(by=['make', 'fuel_type', 'aspiration', 'wheel_base'])

Unnamed: 0,make,fuel_type,aspiration,wheel_base
0,alfa-romero,gas,std,88.6
1,alfa-romero,gas,std,88.6
4,audi,gas,std,99.4
5,audi,gas,std,99.8
6,audi,gas,std,105.8
...,...,...,...,...
202,volvo,gas,std,109.1
198,volvo,gas,turbo,104.3
199,volvo,gas,turbo,104.3
201,volvo,gas,turbo,109.1


In [61]:
cars_df.columns

Index(['make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style',
       'drive_wheels', 'engine_location', 'wheel_base', 'length', 'width',
       'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'model_price'],
      dtype='object')

In [62]:
cars_df.dtypes

make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
model_price           object
dtype: object

In [63]:
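# A direct conversion does not work yet: the column still holds '?' placeholders
# (see the unique values in the next cell), so they must be replaced first.
#cars_df.model_price.astype(int)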

In [64]:
cars_df.model_price.unique()

array(['13495', '16500', '17450', '15250', '17710', '18920', '23875', '?',
       '16430', '16925', '20970', '21105', '24565', '30760', '41315',
       '36880', '5151', '6295', '6575', '5572', '6377', '7957', '6229',
       '6692', '7609', '8558', '8921', '12964', '6479', '6855', '5399',
       '6529', '7129', '7295', '7895', '9095', '8845', '10295', '12945',
       '10345', '6785', '11048', '32250', '35550', '36000', '5195',
       '6095', '6795', '6695', '7395', '10945', '11845', '13645', '15645',
       '8495', '10595', '10245', '10795', '11245', '18280', '18344',
       '25552', '28248', '28176', '31600', '34184', '35056', '40960',
       '45400', '16503', '5389', '6189', '6669', '7689', '9959', '8499',
       '12629', '14869', '14489', '6989', '8189', '9279', '5499', '7099',
       '6649', '6849', '7349', '7299', '7799', '7499', '7999', '8249',
       '8949', '9549', '13499', '14399', '17199', '19699', '18399',
       '11900', '13200', '12440', '13860', '15580', '16900', '16695',


In [65]:
# Show the rows whose price is the '?' placeholder. str.contains treats the pattern as a
# regular expression, so the question mark must be escaped; a raw string keeps the
# backslash from being read as an (invalid) Python string escape.
cars_df[cars_df.model_price.str.contains(r'\?')]

Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,model_price
9,audi,gas,turbo,two,hatchback,4wd,front,99.5,178.2,67.9,...,ohc,five,131,mpfi,7.0,160,5500,16,22,?
44,isuzu,gas,std,two,sedan,fwd,front,94.5,155.9,63.6,...,ohc,four,90,2bbl,9.6,70,5400,38,43,?
45,isuzu,gas,std,four,sedan,fwd,front,94.5,155.9,63.6,...,ohc,four,90,2bbl,9.6,70,5400,38,43,?
129,porsche,gas,std,two,hatchback,rwd,front,98.4,175.7,72.3,...,dohcv,eight,203,mpfi,10.0,288,5750,17,28,?


In [66]:
# Replace the '?' placeholder with '100' so the column can be converted to integers.
# The match is done literally (regex=False) on a plain '?'. The previous pattern '\?'
# only worked when str.replace interpreted it as a regular expression; where the pattern
# is taken literally it looks for a backslash followed by '?', replaces nothing, and the
# later astype(int) fails on the remaining '?' values.
cars_df.model_price = cars_df.model_price.str.replace('?', '100', regex=False)

In [67]:
# Check that no '?' placeholders remain (an empty frame is expected).
# cars_df[...] filters rows only when the expression inside yields boolean values;
# otherwise it raises an error. That is why str.replace could not be used inside the
# brackets: it returns strings, not booleans. The raw string keeps the regex escape
# '\?' from being read as an (invalid) Python string escape.
cars_df[cars_df.model_price.str.contains(r'\?')]

Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,model_price


In [68]:
cars_df.model_price = cars_df.model_price.astype(int)

In [69]:
cars_df.dtypes

make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
model_price            int64
dtype: object