In [48]:
# Series - From - List, Dictionary, Array
# Operation on Series
# Add

# Dataframes - From - Array, List, Dictionary
# Operations on DF
# Append, Concatenate, Merge, Merge_Time_Series, Join, GroupBy, Subset, Data Standardization

In [3]:
import pandas as pd
import numpy as np                      # Numerical Python
from numpy.random import randn          # To generate random number

## Create Series (1-D array)

In [4]:
# pd.Series(data=, index=, dtype=, name=, copy=)
# A scalar `data` value is repeated for every label given in `index`.

pd.Series(data=5, index=list('abc'))

a    5
b    5
c    5
dtype: int64

#### Using List

In [5]:
# Build a Series from a plain Python list; a default 0..n-1 index is assigned.
list_1 = list(range(1, 5))

pd.Series(list_1)

0    1
1    2
2    3
3    4
dtype: int64

#### Using Dictionary

In [6]:
# Dictionary keys become the Series index; each list is stored whole as one
# object-dtype element (it is not expanded into rows or columns).
dict_1 = dict(K1=[10, 20, 30], K2=[40, 50, 60])

pd.Series(dict_1)

K1    [10, 20, 30]
K2    [40, 50, 60]
dtype: object

#### Using Arrays

In [7]:
# Build a Series from a NumPy array of country names.
np_array = np.array('India US'.split())

print('Country Array : ', np_array)

pd.Series(data=np_array)

Country Array :  ['India' 'US']


0    India
1       US
dtype: object

#### Adding Series

In [8]:
# Two Series whose indexes only partly overlap (labels 'b' and 'c' are shared).
first_vector_series = pd.Series(data=[10, 20, 30], index=list('abc'))
second_vector_series = pd.Series(data=[1, 2, 3], index=list('bcd'))

# The + operator aligns on index labels; a label present on one side only gives NaN.
aligned_sum = first_vector_series + second_vector_series
print('With NULLs \n', aligned_sum)

print('\n')

# add(fill_value=0) treats a label missing on one side as 0, so no NaN is produced.
filled_sum = first_vector_series.add(second_vector_series, fill_value=0)
print('Without NULLs \n', filled_sum)

With NULLs 
 a     NaN
b    21.0
c    32.0
d     NaN
dtype: float64


Without NULLs 
 a    10.0
b    21.0
c    32.0
d     3.0
dtype: float64


## Create DataFrames (2-D array)

#### Using Random Numbers

In [9]:
# Seed NumPy's global random generator so the random numbers drawn below are the same on every run.
# 101 is an arbitrary choice; any fixed integer (0, 7, 42, 100, 200, ...) works the same way.
np.random.seed(seed=101)

In [10]:
# randn(rows, columns) returns an array of that shape filled with random values

np.random.randn(5, 4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [11]:
# randn creates random values with a Gaussian distribution
# pd.DataFrame(data=, index=, columns=, dtype=, copy=)

# 5 rows (A-E) by 4 columns (W-Z)
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


#### Using List

In [12]:
# Each list element becomes one row of the single column 'col_1'.
df_from_list = list(range(1, 5))

df_list = pd.DataFrame(df_from_list, index=list('1234'), columns=['col_1'])
df_list

Unnamed: 0,col_1
1,1
2,2
3,3
4,4


#### Using Dictionary

In [13]:
# Dictionary keys become the column names unless `columns` is provided explicitly.
dict_1 = dict(K1=[10, 20, 30], K2=[40, 50, 60])

df_dict = pd.DataFrame(dict_1)

df_dict

Unnamed: 0,K1,K2
0,10,40
1,20,50
2,30,60


## Operation on DF

### Basic

#### Add Column to DF

In [17]:
cnt = 'CA IN SL'
country = cnt.split()          # -> ['CA', 'IN', 'SL']

# The list length must match the number of rows in the DataFrame.
df1 = pd.DataFrame(randn(3, 2)).assign(Country=country)
df1

Unnamed: 0,0,1,Country
0,-0.755325,-0.346419,CA
1,0.147027,-0.479448,IN
2,0.558769,1.02481,SL


#### Drop Column from DF

In [18]:
# axis = 0 / 'index'   -> remove the row(s) with the given label
# axis = 1 / 'columns' -> remove the column(s) with the given label
# inplace = True       -> df1 itself is modified instead of a new DataFrame being returned

df1.drop(labels='Country', axis='columns', inplace=True)
df1

Unnamed: 0,0,1
0,-0.755325,-0.346419
1,0.147027,-0.479448
2,0.558769,1.02481


### Advanced Operations

#### Append

In [20]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the same row-wise stacking is done here with pd.concat([first, second]).
# Works like stacking - the second frame's rows are placed below the first frame's rows
# (like appending data at the end of a file).
# Columns with the same name are combined into one column in the result;
# a column that is missing from one frame is filled with NaN for that frame's rows.
# Original row labels are kept, so index values repeat in the result.
# sort=False keeps the columns in order of first appearance.

df1 = pd.DataFrame(data=randn(3,2),columns=['A','B'])
df2 = pd.DataFrame(data=randn(2,2),columns=['B','C'])
df3 = pd.DataFrame(data=randn(2,2),columns=['C','D'])


print(pd.concat([df1, df3], sort=False))

print('\n')

print(pd.concat([df1, df2], sort=False))

print('\n')

print(pd.concat([df2, df3], sort=False))

                                              

          A         B         C         D
0  0.187125 -0.732845       NaN       NaN
1 -1.382920  1.482495       NaN       NaN
2  0.961458 -2.141212       NaN       NaN
0       NaN       NaN -1.467514 -0.494095
1       NaN       NaN -0.162535  0.485809


          A         B         C
0  0.187125 -0.732845       NaN
1 -1.382920  1.482495       NaN
2  0.961458 -2.141212       NaN
0       NaN  0.992573  1.192241
1       NaN -1.046780  1.292765


          B         C         D
0  0.992573  1.192241       NaN
1 -1.046780  1.292765       NaN
0       NaN -1.467514 -0.494095
1       NaN -0.162535  0.485809


#### Concatenate

In [21]:
# pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True)   # note: 'join_axes' was removed in pandas 1.0

In [22]:
# pd.concat([LIST of Dataframes])

# Similar to APPEND, but more than TWO DataFrames can be concatenated in one call
# By default axis = 0 and it concatenates row-wise
# Concatenation stacks frames vertically (adds rows) when axis = 0 and horizontally (adds columns) when axis = 1
# The concatenated DataFrame has more rows for axis = 0 and more columns for axis = 1

In [23]:
# First frame: columns A-D, each cell labelled <column><number> for numbers 0-3
df1 = pd.DataFrame(
    {col: [col + str(n) for n in range(0, 4)] for col in 'ABCD'},
    index=[0, 1, 2, 3],
)

# Second frame: same layout, numbers 4-7
df2 = pd.DataFrame(
    {col: [col + str(n) for n in range(4, 8)] for col in 'ABCD'},
    index=[0, 1, 2, 3],
)

# Third frame: same layout, numbers 8-11
df3 = pd.DataFrame(
    {col: [col + str(n) for n in range(8, 12)] for col in 'ABCD'},
    index=[0, 1, 2, 3],
)

# Fourth frame: shares only column 'A' with the others and ends with a missing value in each column
df4 = pd.DataFrame({
    'A': ['AA', 'AB', 'AC', float('nan')],
    'E': ['E0', 'E1', 'E2', float('nan')],
})

In [24]:
# Stack the three frames row-wise; each frame keeps its own 0-3 row labels.
pd.concat(objs=[df1, df2, df3], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7
0,A8,B8,C8,D8
1,A9,B9,C9,D9


In [25]:
# Stack row-wise and replace the repeated row labels with a fresh 0..n-1 index.
pd.concat([df1, df2, df3], ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [29]:
# Frames with different columns: the result has the union of the columns, and
# cells a frame has no data for are left empty (NaN). sort=False keeps column order.
pd.concat(objs=[df1, df4], axis=0, sort=False)

Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
0,AA,,,,E0
1,AB,,,,E1
2,AC,,,,E2
3,,,,,


In [30]:
# keys= labels each input frame, giving a two-level row index (key, original row label).
df_keys = pd.concat(objs=[df1, df2], keys=list('XY'))

print(df_keys)

# Select only the rows that came from the frame labelled 'Y'.
df_keys.xs('Y')

      A   B   C   D
X 0  A0  B0  C0  D0
  1  A1  B1  C1  D1
  2  A2  B2  C2  D2
  3  A3  B3  C3  D3
Y 0  A4  B4  C4  D4
  1  A5  B5  C5  D5
  2  A6  B6  C6  D6
  3  A7  B7  C7  D7


Unnamed: 0,A,B,C,D
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [33]:
# pd.concat([df1, df2], axis = 0)   -> default, concatenates rows
# axis = 1 / 'columns'              -> concatenates columns (frames placed side by side)

pd.concat(objs=[df1, df2], axis='columns')

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


#### Merge

In [34]:
# Merge matches rows on column values and, by default, does not use the indexes
# At least one column should be common between the Left and Right DataFrames (unless left_on / right_on are given)

# pd.merge(left,right,how = 'inner', on = 'key')

In [39]:
# Left frame: key 'K2' appears twice; 'K1' and 'K2' do not occur in the right frame.
left_df = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K2'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))

# Right frame: every row has key 'K0'; column 'A' also exists in the left frame.
right_df = pd.DataFrame(dict(
    key=['K0'] * 4,
    A=['C0', 'C1', 'C2', 'C3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))

In [40]:
# INNER MERGE
# Keeps only the key values present in BOTH frames - like an inner join in SQL on the key column

left_df.merge(right_df, how='inner', on='key')

Unnamed: 0,key,A_x,B,A_y,C,D
0,K0,A0,B0,C0,C0,D0
1,K0,A0,B0,C1,C1,D1
2,K0,A0,B0,C2,C2,D2
3,K0,A0,B0,C3,C3,D3


In [41]:
# OUTER MERGE
# Keeps the key values present in EITHER frame - like a full outer join in SQL on the key column
# suffixes are appended to the names of non-key columns that exist in both frames

left_df.merge(right_df, how='outer', on='key', suffixes=('_left', '_right'))

Unnamed: 0,key,A_left,B,A_right,C,D
0,K0,A0,B0,C0,C0,D0
1,K0,A0,B0,C1,C1,D1
2,K0,A0,B0,C2,C2,D2
3,K0,A0,B0,C3,C3,D3
4,K1,A1,B1,,,
5,K2,A2,B2,,,
6,K2,A3,B3,,,


#### Merge TimeSeries

In [42]:
# merge_asof is similar to an ordered left-join, except that it matches 'on' the nearest key rather than on equal keys
# Optionally, the 'by' option gives a group-wise merge: the 'by' keys must be equal (not merely nearest) for rows to match

In [43]:
# Trade executions: one row per trade, listed in ascending time order.
trades = pd.DataFrame(
    dict(
        time=pd.to_datetime([
            '20160525 13:30:00.023',
            '20160525 13:30:00.038',
            '20160525 13:30:00.048',
            '20160525 13:30:00.048',
            '20160525 13:30:00.048',
        ]),
        ticker=['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
        price=[51.95, 51.95, 720.77, 720.92, 98.00],
        quantity=[75, 155, 100, 100, 100],
    ),
    columns=['time', 'ticker', 'price', 'quantity'],
)

In [44]:
# Bid/ask quotes: one row per quote, listed in ascending time order.
quotes = pd.DataFrame(
    dict(
        time=pd.to_datetime([
            '20160525 13:30:00.023',
            '20160525 13:30:00.023',
            '20160525 13:30:00.030',
            '20160525 13:30:00.041',
            '20160525 13:30:00.048',
            '20160525 13:30:00.049',
            '20160525 13:30:00.072',
            '20160525 13:30:00.075',
        ]),
        ticker=['GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL', 'GOOG', 'MSFT'],
        bid=[720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        ask=[720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
    ),
    columns=['time', 'ticker', 'bid', 'ask'],
)

In [45]:
# Display the trades DataFrame
trades

Unnamed: 0,time,ticker,price,quantity
0,2016-05-25 13:30:00.023,MSFT,51.95,75
1,2016-05-25 13:30:00.038,MSFT,51.95,155
2,2016-05-25 13:30:00.048,GOOG,720.77,100
3,2016-05-25 13:30:00.048,GOOG,720.92,100
4,2016-05-25 13:30:00.048,AAPL,98.0,100


In [46]:
# Display the quotes DataFrame
quotes

Unnamed: 0,time,ticker,bid,ask
0,2016-05-25 13:30:00.023,GOOG,720.5,720.93
1,2016-05-25 13:30:00.023,MSFT,51.95,51.96
2,2016-05-25 13:30:00.030,MSFT,51.97,51.98
3,2016-05-25 13:30:00.041,MSFT,51.99,52.0
4,2016-05-25 13:30:00.048,GOOG,720.5,720.93
5,2016-05-25 13:30:00.049,AAPL,97.99,98.01
6,2016-05-25 13:30:00.072,GOOG,720.5,720.88
7,2016-05-25 13:30:00.075,MSFT,52.01,52.03


In [47]:
# For every trade, attach the most recent quote of the same ticker whose time is
# at or before the trade time; a trade with no such quote gets empty bid/ask.
df_merge_asof = pd.merge_asof(left=trades, right=quotes, on='time', by='ticker')

df_merge_asof

Unnamed: 0,time,ticker,price,quantity,bid,ask
0,2016-05-25 13:30:00.023,MSFT,51.95,75,51.95,51.96
1,2016-05-25 13:30:00.038,MSFT,51.95,155,51.97,51.98
2,2016-05-25 13:30:00.048,GOOG,720.77,100,720.5,720.93
3,2016-05-25 13:30:00.048,GOOG,720.92,100,720.5,720.93
4,2016-05-25 13:30:00.048,AAPL,98.0,100,,


#### Join

In [None]:
# Join combines DataFrames on their indexes
# Two DataFrames that have no common column name can't be combined with a plain merge()
# But two DataFrames can still be JOINED on their indexes

In [None]:
# Two frames with no column name in common; both get the default 0-3 row index.
left_df = pd.DataFrame(dict(
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))

right_df = pd.DataFrame(dict(
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
    E=['E0', 'E1', 'E2', 'E3'],
))

In [None]:
# left_df.merge(right_df) won't work here because the two frames have no common column

# join() matches rows on the indexes; its default is how='left' (a LEFT join, not an inner join)
left_df.join(other=right_df)

In [None]:
# Outer join on the indexes: keeps the row labels found in either frame
left_df.join(other=right_df, how='outer')

#### Group By

In [None]:
# grouped = df.groupby('col_name')
# grp_data = grouped.get_group('data_element')

In [None]:
# df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

In [None]:
# NOTE(review): absolute, machine-specific path - this cell only runs where this exact file exists.
file = 'C:/Users/G331623/OneDrive - Principal Financial Group/Data Science/Github Python/Pandas/Data/wiki_movie_plots.csv'

# Load the CSV and discard its 'Plot' column.
df_file = pd.read_csv(file).drop(labels='Plot', axis='columns')

In [None]:
# Creating groups

# One group per distinct value of a single column
gp = df_file.groupby(by='Origin/Ethnicity')

# One group per distinct combination of the two columns
gp_mult = df_file.groupby(by=['Origin/Ethnicity', 'Release Year'])

In [None]:
# gp.first()                   -  first entry of each 'Origin/Ethnicity' group
gp_mult.first()  # first entry of each ('Origin/Ethnicity', 'Release Year') group

# gp.get_group('American')     -  all elements of one particular group

#### Subset in DF

In [None]:
# Small 3 x 2 frame of random values used by the subsetting examples below
df1 = pd.DataFrame(data=randn(3, 2), columns=list('AB'))

df1

In [None]:
# Rows where column 'A' is greater than 0
df1.loc[df1['A'] > 0]

In [None]:
# Rows where column 'A' is greater than 0, returning only column 'B'
df1.loc[df1['A'] > 0, 'B']

In [None]:
# AND - both conditions must hold; each condition needs its own parentheses
df1.loc[(df1['A'] < 0) & (df1['B'] > 0)]

In [None]:
# OR - at least one of the conditions must hold
df1.loc[(df1['A'] < 0) | (df1['B'] > 0)]

#### Data Standardization

In [None]:
# Data should be standardized when it is very distorted or its histogram looks weird

df = pd.DataFrame(data=randn(5, 2), columns=['A', 'Scores'])
df

In [None]:
# Mean of the 'Scores' column as computed by pandas
print('Mean of Scores        : ', df['Scores'].mean())

# Cross-check of the mean: sum of the values divided by the number of rows
print('Manual Mean of Scores : ', sum(df['Scores']) / len(df.index))


# Standard deviation of the 'Scores' column
print('Standard Deviation    : ', df['Scores'].std())

In [None]:
# Standardized value (z-score) = (value - mean) / standard deviation
# After standardizing, each column has mean 0 and standard deviation 1.

print(df, '\n')

def standardize_data(data):
    """Return the z-scores of one column.

    data    : pandas Series of numeric values.
    returns : Series with the same index, where each value is
              (value - column mean) / column standard deviation.

    The previous version returned mean / std, a single number per column,
    which does not standardize the data.
    Note: a constant column has a standard deviation of 0, so its result is NaN.
    """
    return (data - data.mean()) / data.std()


def standardize_data_scores(d_f):
    """Standardize every column of the DataFrame d_f independently.

    returns : DataFrame with the same index and columns as d_f.
    """
    return d_f.apply(standardize_data)


std_data = standardize_data_scores(df)
print('Standardized : \n', std_data)