In [1]:
import numpy as np
import pandas as pd

In [2]:
# Index labels reused by the Series examples below.
label = list('abc')
label

['a', 'b', 'c']

In [3]:
# Plain Python list of ints used as Series data below.
mylist = list(range(10, 40, 10))
mylist

[10, 20, 30]

In [4]:
arr = np.array(mylist)
arr

array([10, 20, 30])

In [5]:
# Label -> value mapping with the same labels and values as `label` / `mylist`.
my_dict = dict(a=10, b=20, c=30)
my_dict

{'a': 10, 'b': 20, 'c': 30}

In [6]:
data = pd.Series(mylist) # a list of ints becomes an int64 Series with a default 0..n-1 index
data

0    10
1    20
2    30
dtype: int64

In [7]:
data = pd.Series(arr) # a NumPy int array converts the same way: int64 values here, default 0..n-1 index
data

0    10
1    20
2    30
dtype: int64

In [8]:
# Passing index= replaces the default integer positions with custom labels,
# so values can be looked up by label name.
# Labelled indexing like this is the major difference between pandas and NumPy.
data = pd.Series(data=arr, index=label) 
data 

a    10
b    20
c    30
dtype: int64

In [9]:
# Mixing str, int and float values gives the Series the generic object dtype.
data = pd.Series(data=['apple',2,4.5], index=label) 
data

a    apple
b        2
c      4.5
dtype: object

In [10]:
data['a']

'apple'

In [11]:
# Integer values labelled by region; the second argument of pd.Series is the index.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['US', 'IN', 'Japan', 'EU'])
ser1

US       1
IN       2
Japan    3
EU       4
dtype: int64

In [12]:
# Second labelled Series; it shares US/IN/EU with ser1 but has RU instead of Japan.
ser2 = pd.Series(data=[1, 3, 5, 2], index=['US', 'IN', 'RU', 'EU'])
ser2

US    1
IN    3
RU    5
EU    2
dtype: int64

In [13]:
ser1 + ser2 # adds by matching index labels; a label present in only one Series (Japan, RU) gives NaN (Not a Number), and the result is float64

EU       6.0
IN       5.0
Japan    NaN
RU       NaN
US       2.0
dtype: float64

## Converting a NumPy array into a DataFrame

In [14]:
# Seed NumPy's legacy global generator so the 5x4 standard-normal sample is reproducible.
np.random.seed(101)
arr = np.random.standard_normal(size=(5, 4))
arr

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [15]:
df = pd.DataFrame(data=arr)
df

Unnamed: 0,0,1,2,3
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [16]:
df = pd.DataFrame(data=arr,index='A B C D E'.split())
df

Unnamed: 0,0,1,2,3
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [17]:
df = pd.DataFrame(data=arr,index='A B C D E'.split(),columns='one two three four'.split())
df

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [18]:
df['one']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: one, dtype: float64

In [19]:
df[['one','two']]

Unnamed: 0,one,two
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [20]:
df['five']=df['one']+df['two']
df

Unnamed: 0,one,two,three,four,five
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [21]:
df.drop('five',axis=1) # axis=1 means 'five' is a column label; this returns a new DataFrame and leaves df itself unchanged

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [22]:
df # df.drop does not remove it from data frame 

Unnamed: 0,one,two,three,four,five
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [23]:
df.drop('five',axis=1,inplace=True) # inplace - True actually removes the column
df

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [24]:
df.drop('A')

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
df

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [26]:
df.drop('A',inplace=True)
df

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
df.loc['B'] # .loc selects by index label; a single label returns that row as a Series

one      0.651118
two     -0.319318
three   -0.848077
four     0.605965
Name: B, dtype: float64

In [28]:
df.iloc[1] # .iloc selects by 0-based integer position; position 1 is row 'C' here because 'A' was dropped above

one     -2.018168
two      0.740122
three    0.528813
four    -0.589001
Name: C, dtype: float64

In [29]:
df.loc[['B','C']]

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


In [30]:
df.iloc[[0,2]]

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [31]:
df.iloc[[0,2],[1,2]]

Unnamed: 0,two,three
B,-0.319318,-0.848077
D,-0.758872,-0.933237


In [32]:
df.loc[['B','C']]

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


In [33]:
# Select rows B, C and columns three, four in a single .loc call instead of
# chaining two indexing operations (which builds an intermediate DataFrame).
df.loc[['B', 'C'], ['three', 'four']]

Unnamed: 0,three,four
B,-0.848077,0.605965
C,0.528813,-0.589001


In [34]:
df

Unnamed: 0,one,two,three,four
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
df = pd.DataFrame(data=arr,index='A B C D E'.split(),columns='one two three four'.split())
df

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
df_bool = df > 0
df_bool

Unnamed: 0,one,two,three,four
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [37]:
df[df_bool] # Accessing only positive numbers from dataframe

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
df[df>0.5] # another way

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,,,,0.955057
E,,1.978757,2.605967,0.683509


In [39]:
df[df['one']>0]

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [40]:
# Values of column 'one' for the rows where it is positive, selected with one
# .loc[row_mask, column] call rather than chained indexing.
df.loc[df['one'] > 0, 'one']

A    2.706850
B    0.651118
D    0.188695
E    0.190794
Name: one, dtype: float64

In [41]:
cond1 = df['one']>0
cond1

A     True
B     True
C    False
D     True
E     True
Name: one, dtype: bool

In [42]:
cond2 = df['three']>0
cond2

A     True
B    False
C     True
D    False
E     True
Name: three, dtype: bool

In [43]:
df[ cond1 & cond2 ] # Note: Python's `and`/`or` do not work element-wise on pandas Series; use the & and | operators instead

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509


In [44]:
# Rows where both conditions hold, restricted to columns two and three,
# in a single .loc call instead of chained indexing.
df.loc[cond1 & cond2, ['two', 'three']]

Unnamed: 0,two,three
A,0.628133,0.907969
E,1.978757,2.605967


In [45]:
# First matching row (as a Series) of the two/three columns; the row/column
# filter is one .loc call, then .iloc[0] picks the first row by position.
df.loc[cond1 & cond2, ['two', 'three']].iloc[0]

two      0.628133
three    0.907969
Name: A, dtype: float64

In [46]:
# Scalar at row 0 / column 0 of the filtered frame. Both positions go through
# .iloc: a plain [0] on a label-indexed Series is a label lookup, and its
# fallback to positional access is deprecated in pandas.
df.loc[cond1 & cond2, ['two', 'three']].iloc[0, 0]

0.6281327087844596

In [47]:
df = pd.DataFrame(data=arr,index='A B C D E'.split(),columns='one two three four'.split())
df

Unnamed: 0,one,two,three,four
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [48]:
df.reset_index(inplace=True) # moves the current index (A..E) into a new 'index' column and installs a default 0..4 integer index
df

Unnamed: 0,index,one,two,three,four
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [49]:
# Replace the 0..4 integer index with region codes; no existing column is consumed.
df.index = 'US JP IN EU CH'.split()
df

Unnamed: 0,index,one,two,three,four
US,A,2.70685,0.628133,0.907969,0.503826
JP,B,0.651118,-0.319318,-0.848077,0.605965
IN,C,-2.018168,0.740122,0.528813,-0.589001
EU,D,0.188695,-0.758872,-0.933237,0.955057
CH,E,0.190794,1.978757,2.605967,0.683509


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, US to CH
Data columns (total 5 columns):
index    5 non-null object
one      5 non-null float64
two      5 non-null float64
three    5 non-null float64
four     5 non-null float64
dtypes: float64(4), object(1)
memory usage: 240.0+ bytes


In [51]:
df.dtypes

index     object
one      float64
two      float64
three    float64
four     float64
dtype: object

In [52]:
df.describe()

Unnamed: 0,one,two,three,four
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


In [53]:
df2 = df['one'] >0
df2

US     True
JP     True
IN    False
EU     True
CH     True
Name: one, dtype: bool

In [54]:
df2.value_counts() # gives counts in each category

True     4
False    1
Name: one, dtype: int64

In [55]:
df2.values # returns the Series' values as a NumPy array

array([ True,  True, False,  True,  True])

## Grouping with groupby

In [97]:
# Toy sales records (two people per company) used by the groupby examples.
data = {
    'Company': ['AA', 'AA', 'FB', 'FB', 'MS', 'MS'],
    'Person': ['Sara', 'Jon', 'Math', 'Roy', 'Jini', 'Joy'],
    'Sales': list(range(100, 700, 100)),
}
data

{'Company': ['AA', 'AA', 'FB', 'FB', 'MS', 'MS'],
 'Person': ['Sara', 'Jon', 'Math', 'Roy', 'Jini', 'Joy'],
 'Sales': [100, 200, 300, 400, 500, 600]}

In [98]:
df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales
0,AA,Sara,100
1,AA,Jon,200
2,FB,Math,300
3,FB,Roy,400
4,MS,Jini,500
5,MS,Joy,600


In [101]:
df['Sales']

0    100
1    200
2    300
3    400
4    500
5    600
Name: Sales, dtype: int64

In [66]:
df.groupby('Company') # on its own this only yields a DataFrameGroupBy object; apply an aggregation (e.g. .describe()) to see results

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f42410091d0>

In [102]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AA,2.0,150.0,70.710678,100.0,125.0,150.0,175.0,200.0
FB,2.0,350.0,70.710678,300.0,325.0,350.0,375.0,400.0
MS,2.0,550.0,70.710678,500.0,525.0,550.0,575.0,600.0


In [103]:
df.groupby('Company').describe().transpose()

Unnamed: 0,Company,AA,FB,MS
Sales,count,2.0,2.0,2.0
Sales,mean,150.0,350.0,550.0
Sales,std,70.710678,70.710678,70.710678
Sales,min,100.0,300.0,500.0
Sales,25%,125.0,325.0,525.0
Sales,50%,150.0,350.0,550.0
Sales,75%,175.0,375.0,575.0
Sales,max,200.0,400.0,600.0


In [87]:
df

Unnamed: 0,Company,Person,Sales
0,AA,Sara,100
1,AA,Jon,200
2,FB,Math,300
3,FB,Roy,400
4,MS,Jini,500
5,MS,Joy,600


In [88]:
df['Company'].unique() # array of the distinct values in the 'Company' column

array(['AA', 'FB', 'MS'], dtype=object)

In [89]:
df['Company'].nunique() # number of distinct values in the 'Company' column

3

In [90]:
df['Company'].value_counts()

AA    2
MS    2
FB    2
Name: Company, dtype: int64

In [107]:
# Keep only the AA rows whose Sales exceed 100.
newdf = df.loc[df['Company'].eq('AA') & df['Sales'].gt(100)]
newdf

Unnamed: 0,Company,Person,Sales
1,AA,Jon,200


In [111]:
# Helper used below to demonstrate applying a function to a pandas column.
def doubleit(number):
    """Return ``number`` multiplied by 2."""
    doubled = number * 2
    return doubled
doubleit(5)

10

In [112]:
df['Sales'].apply(doubleit) # note: pass the function itself, without parentheses; apply calls it on each value

0     200
1     400
2     600
3     800
4    1000
5    1200
Name: Sales, dtype: int64

In [113]:
df['double']=df['Sales'].apply(doubleit) # store the doubled Sales values as a new 'double' column (pass the function itself, no parentheses)
df

Unnamed: 0,Company,Person,Sales,double
0,AA,Sara,100,200
1,AA,Jon,200,400
2,FB,Math,300,600
3,FB,Roy,400,800
4,MS,Jini,500,1000
5,MS,Joy,600,1200


In [114]:
del df['double']
df

Unnamed: 0,Company,Person,Sales
0,AA,Sara,100
1,AA,Jon,200
2,FB,Math,300
3,FB,Roy,400
4,MS,Jini,500
5,MS,Joy,600


In [115]:
df.columns # gives columns names

Index(['Company', 'Person', 'Sales'], dtype='object')

In [116]:
df.index # gives index names

RangeIndex(start=0, stop=6, step=1)

In [118]:
df.info() # summary: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
Company    6 non-null object
Person     6 non-null object
Sales      6 non-null int64
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


In [120]:
df.sort_values('Sales',ascending=False)

Unnamed: 0,Company,Person,Sales
5,MS,Joy,600
4,MS,Jini,500
3,FB,Roy,400
2,FB,Math,300
1,AA,Jon,200
0,AA,Sara,100


In [125]:
df.sort_values('Person',ascending=True,inplace=True)

In [126]:
df

Unnamed: 0,Company,Person,Sales
4,MS,Jini,500
1,AA,Jon,200
5,MS,Joy,600
2,FB,Math,300
3,FB,Roy,400
0,AA,Sara,100


In [139]:
# Renumber the rows 0..5 after the sort by 'Person' above. drop=True discards
# the old index labels instead of inserting them as a column.
# The original cells deleted 'level_0' and 'index' columns that no visible cell
# creates (presumably left over from reset_index calls in cells that are no
# longer in the notebook), so they raised KeyError on a fresh kernel.
df = df.reset_index(drop=True)

In [142]:
df

Unnamed: 0,Company,Person,Sales
0,MS,Jini,500
1,AA,Jon,200
2,MS,Joy,600
3,FB,Math,300
4,FB,Roy,400
5,AA,Sara,100


In [143]:
pwd

'/home/naitik147896/Naitik/Pytorch for deep learning/Excercise'

In [145]:
# Load the example CSV via a path relative to the notebook's working directory
# (the `Excercise` folder reported by `pwd` above) instead of an absolute
# home-directory path that only exists on the original author's machine.
df = pd.read_csv('../PYTORCH_NOTEBOOKS/00-Crash-Course-Topics/01-Crash-Course-Pandas/example.csv')
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
