In [35]:
import pandas as pd
import numpy as np                      # Numerical Python
from numpy.random import randn          # To generate random number

### DataFrames : 2-D array

In [31]:
### pd.DataFrame(data=, index=, columns=, dtype=, copy)

In [11]:
np.random.seed(101)                     

# We get same set of numbers
# Here 101 is any random number
# 0, 7, 42, 100, 101, 200 etc are more frequently used but no difference behind the scene

In [15]:
# randn(row, column)
# Creates random values with Guassian Distribution

df = pd.DataFrame(data=randn(5,4),index=['A','B','C','D','E'],columns=['W','X','Y','Z'])   # 5 rows and 4 columns
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [23]:
# Create Dataframe from Lists

list_1 = [1,2,3,4]

df_list = pd.DataFrame(data = list_1, index=['1','2','3','4'], columns=['col_1'])
df_list

Unnamed: 0,col_1
1,1
2,2
3,3
4,4


In [28]:
# Create Dataframes from Dictionaries

dict_1 = {'K1':[10,20,30],'K2':[40,50,60]}

df_dict = pd.DataFrame(data = dict_1)          # Keys will be column names if not provided explicitly

df_dict

Unnamed: 0,K1,K2
0,10,40
1,20,50
2,30,60


### Series : 1-D array

In [36]:
# pd.Series(data=, index=, columns=, dtype=, copy)

In [33]:
pd.Series(data = list_1)

0    1
1    2
2    3
3    4
dtype: int64

In [30]:
pd.Series(data = dict_1)                       # Subset of Dataframe with only one Column

K1    [10, 20, 30]
K2    [40, 50, 60]
dtype: object

### Appending Dataframes

In [46]:
df1 = pd.DataFrame(data=randn(3,2),columns=['A','B'])
df2 = pd.DataFrame(data=randn(2,2),columns=['B','C'])
df3 = pd.DataFrame(data=randn(2,2),columns=['C','D'])


print(df1.append(df3))
print('\n')
print(df1.append(df2))

                                              

          A         B         C         D
0  0.327845  0.674485       NaN       NaN
1 -0.174057  0.780140       NaN       NaN
2 -0.383258 -0.409318       NaN       NaN
0       NaN       NaN -0.971393 -1.522333
1       NaN       NaN  1.133703  0.528187


          A         B         C
0  0.327845  0.674485       NaN
1 -0.174057  0.780140       NaN
2 -0.383258 -0.409318       NaN
0       NaN  0.343539  0.196275
1       NaN -0.982776  2.231555


### Subsetting the data using Conditions

In [50]:
df1[df1['A']>0]                          # A greater than 0

Unnamed: 0,A,B
0,0.327845,0.674485


In [51]:
df1[df1['A']>0]['B']                     # A greater than 0 and returns only B column

0    0.674485
Name: B, dtype: float64

In [53]:
df1[(df1['A']<0) & (df1['B']>0)]         # AND - Multiple conditions while subsetting

Unnamed: 0,A,B
1,-0.174057,0.78014


In [55]:
df1[(df1['A']<0) | (df1['B']>0)]         # OR - Multiple conditions while subsetting - OR

Unnamed: 0,A,B
0,0.327845,0.674485
1,-0.174057,0.78014
2,-0.383258,-0.409318


### String Handling 

In [68]:
str = 'CA IN SL'

country = str.split(' ')
country

['CA', 'IN', 'SL']

In [74]:
df1['Country'] = country                                      # Length of list should match the shape of Dataframe
df1

Unnamed: 0,A,B,Country
0,0.327845,0.674485,CA
1,-0.174057,0.78014,IN
2,-0.383258,-0.409318,SL


In [75]:
df1.drop('Country', axis = 1, inplace = True)
df1

Unnamed: 0,A,B
0,0.327845,0.674485
1,-0.174057,0.78014
2,-0.383258,-0.409318


# Merging Joining and Concatenation

### Concatenation

In [79]:
"""
pandas.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, 
          levels=None, names=None, verify_integrity=False, sort=None, copy=True)
"""

"\nconcat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, \n          levels=None, names=None, verify_integrity=False, sort=None, copy=True)\n"

In [80]:
# pd.concat([LIST])                     

# By default axis = 0 and it concatenates row-vise
# Similar to APPEND but in Append, we can pass only 1 Dataframe append to other
# Better option we have different column names in the Dataframe

### Merge

In [81]:
# pd.merge(left,right,how = 'inner', on = 'key')

In [91]:
left_df = pd.DataFrame({'key':['K0','K1','K2','K2'],
       'A':['A0','A1','A2','A3'],
       'B':['B0','B1','B2','B3']})

right_df = pd.DataFrame({'key':['K0','K0','K0','K0'],
       'C':['C0','C1','C2','C3'],
       'D':['D0','D1','D2','D3']})

In [92]:
pd.merge(left_df, right_df, how = 'inner', on = ['key'])

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K0,A0,B0,C1,D1
2,K0,A0,B0,C2,D2
3,K0,A0,B0,C3,D3


In [93]:
pd.merge(left_df, right_df, how = 'outer', on = ['key'])

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K0,A0,B0,C1,D1
2,K0,A0,B0,C2,D2
3,K0,A0,B0,C3,D3
4,K1,A1,B1,,
5,K2,A2,B2,,
6,K2,A3,B3,,


### JOINING Dataframes

In [99]:
# Join works on Indexes
# 2 Dataframe can't be joined if they any common column name

In [101]:
left_df = pd.DataFrame(
    {
       'A':['A0','A1','A2','A3'],
       'B':['B0','B1','B2','B3']
    })

right_df = pd.DataFrame(
    {
       'C':['C0','C1','C2','C3'],
       'D':['D0','D1','D2','D3']
    })

In [103]:
left_df.join(right_df)                       # Default is Inner Join (using Indxes)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [105]:
left_df.join(right_df, how = 'outer')

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


### Arrays and Series

In [107]:
np_array = np.array(['India','US'])
np_array

array(['India', 'US'], dtype='<U5')

In [108]:
pd.Series(np_array)

0    India
1       US
dtype: object

In [113]:
pd.Series(5 , index = ['a','b','c'])

a    5
b    5
c    5
dtype: int64

In [117]:
first_vector_series = pd.Series([10,20,30],index = ['a','b','c'])
second_vector_series = pd.Series([1,2,3],index = ['b','c','d'])

In [118]:
# Sum of vectors
first_vector_series + second_vector_series

a     NaN
b    21.0
c    32.0
d     NaN
dtype: float64

### Imputing and Removing NULL

In [120]:
sum = first_vector_series.add(second_vector_series, fill_value=0)
sum

a    10.0
b    21.0
c    32.0
d     3.0
dtype: float64

### GROUP BY

In [121]:
# grouped = df.groupby('col_name')
# grp_data = grouped.get_group('data_element')

### Data Standardization

In [132]:
# Data should be standardize when we have very distorted data or we have any wiered Histogram

df = pd.DataFrame(randn(5,2),columns = ['A','scores'])
df

Unnamed: 0,A,scores
0,-0.380104,-1.666059
1,-2.736995,1.522562
2,0.178009,-0.626805
3,-0.391089,1.743477
4,1.130018,0.897796


In [133]:
def standardize_data(test):
    return (test.mean()/test.std())

standardize_data(df['scores'])

0.2547177877876271

In [134]:
def standardize_data_scores(d_f):
    return d_f.apply(standardize_data)

standardize_data_scores(df)

A        -0.308670
scores    0.254718
dtype: float64