# Pandas (Python Data Analysis Library)
- Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
- Using pandas we can replicate the results of most SQL queries (selecting, filtering, joining, grouping) on DataFrames.
- Data wrangling, sometimes referred to as data munging, is the process of transforming and mapping data from one "raw" data form into another format with the intent of making it more appropriate and valuable for a variety of downstream purposes such as analytics.


## 1. Load CSV files into a DataFrame (DF)

In [67]:
import os

# Directory to switch into, so that relative file names used afterwards
# are resolved against it.
data_dir = "E:/code/5.DataAnalysisOfficial/data/pandas"

print(os.getcwd())   # working directory before the switch
os.chdir(data_dir)
print(os.getcwd())   # working directory after the switch

E:\code\5.DataAnalysisOfficial\data\pandas
E:\code\5.DataAnalysisOfficial\data\pandas


In [68]:
import pandas as pd
df = pd.read_csv("pandas_sales.csv")
df

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [69]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [135]:
df = pd.read_csv("pandas_sales.csv",index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [136]:
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

In [137]:
df.columns

Index(['eggs', 'salt', 'spam'], dtype='object')

In [139]:
df.size

18

In [142]:
df.memory_usage()   #in bytes

Index    48
eggs     48
salt     48
spam     48
dtype: int64

In [73]:
type(df)

pandas.core.frame.DataFrame

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Jan to Jun
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   eggs    6 non-null      int64  
 1   salt    5 non-null      float64
 2   spam    6 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 192.0+ bytes


## 2. Reading DataFrame

In [75]:
import pandas as pd
df=pd.read_csv("pandas_sales.csv",index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


### Method 1: Read DF by index -- [column][row]

In [76]:
#print 47
df["eggs"]["Jan"]

47

In [77]:
#print 52
df["spam"]["May"]

52

#### Note: df["Jan"]["eggs"] ([row][column]) is not possible, because df[...] selects a column and "Jan" is a row label, not a column name

### Method 2: Read DF by column attribute and row position

In [78]:
import pandas as pd
df=pd.read_csv("pandas_sales.csv",index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [79]:
df.columns

Index(['eggs', 'salt', 'spam'], dtype='object')

In [80]:
#df.salt
df['salt']

month
Jan    12.0
Feb    50.0
Mar    89.0
Apr    87.0
May     NaN
Jun    60.0
Name: salt, dtype: float64

In [81]:
# print 47: the first value of the eggs column, selected by position.
# The index holds month labels, so an integer position must go through .iloc;
# plain df.eggs[0] relies on a positional fallback that pandas has deprecated.
df.eggs.iloc[0]

47

In [82]:
# Positional slice: the first three rows of the eggs column.
# The result is a Series.
df["eggs"].iloc[:3]

month
Jan     47
Feb    110
Mar    221
Name: eggs, dtype: int64

In [83]:
# Fancy indexing by position: pick the rows at positions 1, 3 and 5.
# A list of integers on a label-indexed Series must go through .iloc (the
# plain [] positional fallback is deprecated); the result is still a Series.
df.eggs.iloc[[1, 3, 5]]

month
Feb    110
Apr     77
Jun    205
Name: eggs, dtype: int64

In [84]:
# Positional selection of several rows of one column (via .iloc, since the
# index holds labels, not integers) yields a Series, not a DataFrame.
a = df.eggs.iloc[[1, 3, 5]]
type(a)

pandas.core.series.Series

### Method 3: Read DF by loc (label-based)
- df.loc[row]   --> Series
- df.loc[[row]] --> DF
- df.loc[['row1','row2']]  --> DF
- df.loc['row1':'row2']  -->  DF
- df.loc[:,'col1']   # to select columns with loc, a row selector (e.g. `:` for all rows) must be given first
- df.loc[:,['col1','col2']]
- df.loc[:,'col1':'col2']

In [85]:
#df.loc?

#### Example1:

In [86]:
import pandas as pd
df=pd.read_csv("pandas_sales.csv",index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [87]:
df.loc['Jan','eggs']

47

In [88]:
df.loc["Jan":"Mar","salt"]      #slice

month
Jan    12.0
Feb    50.0
Mar    89.0
Name: salt, dtype: float64

In [89]:
df.loc["Jan":"Mar","eggs":"spam"]   

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72


In [90]:
df.loc[["Jan","Mar"],["eggs","spam"]] #fancy

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Mar,221,72


#### Example2

In [91]:
# Small demo frame: three labelled rows and two integer columns,
# built column by column.
df = pd.DataFrame(
    {"max_speed": [10, 30, 50], "shield": [20, 40, 60]},
    index=["cobra", "viper", "sidewinder"],
)
df

Unnamed: 0,max_speed,shield
cobra,10,20
viper,30,40
sidewinder,50,60


In [92]:
#Single label. Note: this returns the row as a Series.
#if we use single label it returns Series
s1 = df.loc["cobra"]
print(type(s1))
s1

<class 'pandas.core.series.Series'>


max_speed    10
shield       20
Name: cobra, dtype: int64

In [93]:
 #if we use list of label it returns DataFrame
df1=df.loc[["cobra"]] 
print(type(df1))
df1


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,max_speed,shield
cobra,10,20


In [94]:
df['max_speed']        #column wise

cobra         10
viper         30
sidewinder    50
Name: max_speed, dtype: int64

In [95]:
 df[['max_speed']]     

Unnamed: 0,max_speed
cobra,10
viper,30
sidewinder,50


#### 2.2

In [96]:
#print value 30 (viper's max_speed; 40 is viper's shield)
df.loc["viper","max_speed"]      #df.loc["row","column"]

30

In [97]:
df.loc["viper"]           #a single label is looked up in the row index only; passing a column
                             #name instead, e.g. df.loc["max_speed"], raises KeyError

max_speed    30
shield       40
Name: viper, dtype: int64

In [98]:
df.loc["cobra":"sidewinder","max_speed"]

cobra         10
viper         30
sidewinder    50
Name: max_speed, dtype: int64

In [99]:
df.loc["cobra","max_speed":"shield"]

max_speed    10
shield       20
Name: cobra, dtype: int64

#### Boolean list with the same length as the row axis

In [100]:
df.loc[[False, True, True]]

Unnamed: 0,max_speed,shield
viper,30,40
sidewinder,50,60


#### Conditional that returns a boolean Series

In [101]:
df.loc[df['shield'] > 20]

Unnamed: 0,max_speed,shield
viper,30,40
sidewinder,50,60


### Method 4: Read DF by iloc (integer position) -- prefer this when possible

In [102]:
#df.iloc?

In [103]:
import pandas as pd

# Despite the name, my_dict is a list of dicts: one dict per record, all
# sharing the keys a-d, with each record ten times the previous one.
my_dict = [
    {key: unit * scale for key, unit in zip("abcd", (1, 2, 3, 4))}
    for scale in (1, 10, 100)
]

In [104]:
df = pd.DataFrame(my_dict)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,10,20,30,40
2,100,200,300,400


In [105]:
df.iloc[0]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [106]:
type(df.iloc[0])

pandas.core.series.Series

In [107]:
df.iloc[[0]]

Unnamed: 0,a,b,c,d
0,1,2,3,4


In [108]:
type(df.iloc[[0]])

pandas.core.frame.DataFrame

In [109]:
df.iloc[[0,1]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,10,20,30,40


In [110]:
df.iloc[0:3]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,10,20,30,40
2,100,200,300,400


In [111]:
df.iloc[[True, False, True]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,100,200,300,400


In [112]:
df.iloc[lambda x: x.index%2 == 0]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,100,200,300,400


In [113]:
df.iloc[0,:]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [114]:
df.iloc[[0,2],:]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,100,200,300,400


## 3. Create a sub-DataFrame from another DF

In [115]:
#sql : create table sub_df as select eggs,salt from df
df = pd.read_csv("pandas_sales.csv",index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [116]:
# Selecting a list of columns returns a new DataFrame holding a copy of those
# columns (not a view of df), in the order requested: salt first, then eggs.
sub_df = df.loc[:, ["salt", "eggs"]]
sub_df

Unnamed: 0_level_0,salt,eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,47
Feb,50.0,110
Mar,89.0,221
Apr,87.0,77
May,,132
Jun,60.0,205


## 4. Create a Series (1 column + index) from a DataFrame using []

In [117]:
Series_eggs = df["eggs"]
Series_eggs

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

## Understanding: ML needs an input X (a set of feature columns) and an output y (a single target column)

#### Example1

In [118]:
import pandas as pd

df = pd.read_csv("pima-indians-diabetes.txt")
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [119]:
df.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [120]:
#X = df[["pregnancies","glucose","diastolic","triceps","insulin","bmi","dpf","age"]]  or     X = df.iloc[:,:-1].values 
X = df.loc[:,df.columns!="diabetes"]      #df.iloc[:,:-1]
type(X)

pandas.core.frame.DataFrame

In [121]:
X = X.values
type(X)

numpy.ndarray

In [122]:
y=df["diabetes"]          # y = df.iloc[:,-1].values
type(y)

pandas.core.series.Series

In [123]:
y = y.values
type(y)

numpy.ndarray

In [124]:
y[:10]

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1], dtype=int64)

#### Example2

In [125]:
import pandas as pd

df = pd.read_csv("iris.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [126]:
X = df.loc[:,df.columns!="species"]           # X = df.iloc[:,:-1].values
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [127]:
type(X)

pandas.core.frame.DataFrame

In [128]:
y=df["species"]                                  # y=df.iloc[:,-1].values
y.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [129]:
type(y)

pandas.core.series.Series

In [130]:
y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [131]:
#pd.DataFrame?

### Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html
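- Refer to the pandas documentation on `loc` (label-based selection)
- Refer to the pandas documentation on `iloc` (position-based selection)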