In [3]:
import pandas as pd
groceries = pd.Series(data=[30,5,'Yes','No'],index=['eggs','apples','milk','bread']) 
groceries
#a panda series is a one-D array like object that can hold many data types. this is different from 
#a numpy array which can only hold one data type. Another difference is that you can assign an index in a panda series.

eggs       30
apples      5
milk      Yes
bread      No
dtype: object

In [4]:
groceries.shape

(4,)

In [5]:
groceries.ndim #gives the number of dimension

1

In [6]:
groceries.size

4

In [8]:
groceries.index

Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')

In [9]:
groceries.values

array([30, 5, 'Yes', 'No'], dtype=object)

In [11]:
'guava' in groceries

False

In [12]:
#Pandas allows to access data in multiple ways
groceries['eggs']

30

In [13]:
groceries[['milk','bread']]

milk     Yes
bread     No
dtype: object

In [14]:
#Another way
groceries[0]

30

In [15]:
#In order to remove ambuiguity from the way of referring, it provides two attributes.
groceries.loc['milk'] #stands for location and its used to explicitly state we are using a labelled index

'Yes'

In [None]:
groceries.iloc[1] #iloc stands for integer location, used to explicitly state that we are using a numerical index.

In [16]:
#panda series are also mutable like numpy
groceries['eggs'] = 5
groceries

eggs        5
apples      5
milk      Yes
bread      No
dtype: object

In [19]:
groceries.drop('apples')

eggs       5
milk     Yes
bread     No
dtype: object

In [20]:
groceries #the drop is out of place

eggs        5
apples      5
milk      Yes
bread      No
dtype: object

In [21]:
groceries.drop('apples',inplace = True)
groceries

eggs       5
milk     Yes
bread     No
dtype: object

### Arithmetic Operations

In [22]:
fruits = pd.Series([10,3,4],['apples','mangoes','banana'])
fruits

apples     10
mangoes     3
banana      4
dtype: int64

In [23]:
fruits + 2

apples     12
mangoes     5
banana      6
dtype: int64

In [24]:
fruits - 2

apples     8
mangoes    1
banana     2
dtype: int64

In [25]:
fruits * 2

apples     20
mangoes     6
banana      8
dtype: int64

In [26]:
fruits / 2

apples     5.0
mangoes    1.5
banana     2.0
dtype: float64

In [27]:
fruits['mangoes'] + 2

5

In [28]:
fruits.iloc[0] - 2

8

In [29]:
#Arithmetic operations can also be applied here, provided the condition that the operation must be perform on each data type
groceries * 2

eggs         10
milk     YesYes
bread      NoNo
dtype: object

In [30]:
groceries + 2

TypeError: can only concatenate str (not "int") to str

### Second Main DataStructure: DataFrame

In [31]:
items = {'Bob' : pd.Series([20,30,40],index=['bike','pants','watch']),
        'Alice':pd.Series([40,110,500,60],index=['books','glasses','bike','pants'])}

In [32]:
type(items)

dict

In [33]:
shopping = pd.DataFrame(items)
shopping
#NaN is a panda way of specifying that there is no any value

Unnamed: 0,Bob,Alice
bike,20.0,500.0
books,,40.0
glasses,,110.0
pants,30.0,60.0
watch,40.0,


In [34]:
shopping.values

array([[ 20., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 30.,  60.],
       [ 40.,  nan]])

In [35]:
shopping.shape

(5, 2)

In [36]:
shopping.ndim

2

In [37]:
bob_shopping = pd.DataFrame(items,columns=['Bob'])
bob_shopping

Unnamed: 0,Bob
bike,20
pants,30
watch,40


In [38]:
# We create a list of Python dictionaries
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]

# We create a DataFrame 
store_items = pd.DataFrame(items2)

# We display the DataFrame
store_items

Unnamed: 0,bikes,pants,watches,glasses
0,20,30,35,
1,15,5,10,50.0


In [40]:
store_items[['bikes']]

Unnamed: 0,bikes
0,20
1,15


In [41]:
store_items['shirts'] = [15,2]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts
0,20,30,35,,15
1,15,5,10,50.0,2


In [42]:
store_items['suits'] = store_items['shirts'] + store_items['pants']
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
0,20,30,35,,15,45
1,15,5,10,50.0,2,7


In [43]:
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]

new_store = pd.DataFrame(new_items, index = ['store 3'])

new_store

Unnamed: 0,bikes,pants,watches,glasses
store 3,20,30,35,4


In [44]:
store_items = store_items.append(new_store)
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
0,20,30,35,,15.0,45.0
1,15,5,10,50.0,2.0,7.0
store 3,20,30,35,4.0,,


In [45]:
store_items['new_watches'] = store_items['watches'][1:]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits,new_watches
0,20,30,35,,15.0,45.0,
1,15,5,10,50.0,2.0,7.0,10.0
store 3,20,30,35,4.0,,,35.0


In [46]:
store_items.insert(5,'shoes',[8,9,10])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits,new_watches
0,20,30,35,,15.0,8,45.0,
1,15,5,10,50.0,2.0,9,7.0,10.0
store 3,20,30,35,4.0,,10,,35.0


In [53]:
#to delete, both pop and drop is used.
#pop is used to delete the columns, while drop is used for both
store_items.pop('new_watches')
store_items

KeyError: 'new_watches'

In [52]:
store_items = store_items.drop(['watches','shoes'],axis=1)
store_items #axis=0 for row

KeyError: "['watches' 'shoes'] not found in axis"

In [55]:
store_items = store_items.rename(columns={'bikes':'hats'})
store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
0,20,30,,15.0,45.0
1,15,5,50.0,2.0,7.0
store 3,20,30,4.0,,


In [56]:
store_items = store_items.rename(index={'store 3':'last store'})
store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
0,20,30,,15.0,45.0
1,15,5,50.0,2.0,7.0
last store,20,30,4.0,,


In [57]:
store_items = store_items.set_index('pants')
store_items

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,,15.0,45.0
5,15,50.0,2.0,7.0
30,20,4.0,,


### Dealing with Null Values

In [58]:
x = store_items.isnull()
print(x)

        hats  glasses  shirts  suits
pants                               
30     False     True   False  False
5      False    False   False  False
30     False    False    True   True


In [59]:
x = store_items.isnull().sum().sum()
print(x)

3


In [60]:
x = store_items.count() #it will count the non-null values
print(x)

hats       3
glasses    2
shirts     2
suits      2
dtype: int64


In [61]:
#In general, we can either replace or remove null values
store_items.dropna(axis=0) #this will drop na values from row
#this works out of place, for inplace, Inplace = true

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,15,50.0,2.0,7.0


In [62]:
#replacing
store_items.fillna(0)

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,0.0,15.0,45.0
5,15,50.0,2.0,7.0
30,20,4.0,0.0,0.0


In [64]:
store_items.fillna(method='ffill',axis=0) #this is called forward filling
#where the nan values will get replace to previous values

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,,15.0,45.0
5,15,50.0,2.0,7.0
30,20,4.0,2.0,7.0


In [65]:
#also, backward filling can be used
store_items.fillna(method='backfill',axis=0) #this method also works out of place

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,50.0,15.0,45.0
5,15,50.0,2.0,7.0
30,20,4.0,,


In [66]:
#we can also fill nan values by using interpolation methods
store_items.interpolate(method='linear',axis=0) #out of place.

Unnamed: 0_level_0,hats,glasses,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,20,,15.0,45.0
5,15,50.0,2.0,7.0
30,20,4.0,2.0,7.0


### Loading data into pandas data frame

In [None]:
google_stock = pd.read_csv('./GOOG.csv')
print(type(google_stock))


In [None]:
google_Stock.head() #to print first 5 data

In [None]:
google_stock.tail()

In [None]:
google_Stock.isnull.any()

In [None]:
google_stock.describe()

In [None]:
google_stock['Column'].describe()

In [None]:
google_stock.max()
google_stock.mean()

In [None]:
google_stock.corr() #to get the co-relations between different columns

In [None]:
#we can also use group by function here
data.groupby(['year'])['salary'].sum() #to know the total salary in each year

In [None]:
#similarly, for the average salary
data.groupby(['year'])['salary'].mean() 