### 1. Comparison between Python Sequences and NumPy Sequences

In [None]:
# importing numpy package
import numpy as np

# importing system module
import sys

# declaring a list of 1000 elements
# (range() is lazy in Python 3, so it has to be materialised with list();
# sys.getsizeof() on a bare range reports the range object, not a list)
S = list(range(1000))

# printing size of each element of the list
# measured on an actual element (an int object), not on the container
print("[Python] Size of each element of list in bytes: ", sys.getsizeof(S[-1]))

# printing size of the whole list
# = the list object itself plus every int object it references
print("[Python] Size of the whole list in bytes: ",
      sys.getsizeof(S) + sum(sys.getsizeof(item) for item in S))

# declaring a Numpy array of 1000 elements
D = np.arange(1000)

# printing size of each element of the Numpy array
print("[NumPy] Size of each element of the Numpy array in bytes: ", D.itemsize)

# printing size of the whole Numpy array (nbytes == size * itemsize)
print("[NumPy] Size of the whole Numpy array in bytes: ", D.nbytes)

In [None]:
# importing required packages
import numpy
import time

# size of arrays and lists
size = 1000000

# declaring lists
# list() is required: a bare range() is a lazy sequence, so without it the
# benchmark would time iteration over range objects rather than lists
list1 = list(range(size))
list2 = list(range(size))

# declaring arrays
# int64 is requested explicitly: the largest product is (size - 1) ** 2 (~1e12),
# which silently wraps around where the default integer is 32-bit
array1 = numpy.arange(size, dtype=numpy.int64)
array2 = numpy.arange(size, dtype=numpy.int64)

# capturing time before the multiplication of Python lists
# perf_counter() is the monotonic, high-resolution clock meant for timing code
initialTime = time.perf_counter()

# multiplying elements of both the lists and storing them in another list
resultantList = [(a * b) for a, b in zip(list1, list2)]

# calculating execution time
print("Time taken by Lists to perform multiplication:",
      (time.perf_counter() - initialTime),
      "seconds")

# capturing time before the multiplication of Numpy arrays
initialTime = time.perf_counter()

# multiplying elements of both the Numpy arrays and storing them in another Numpy array
resultantArray = array1 * array2

# calculating execution time
print("Time taken by NumPy Arrays to perform multiplication:",
      (time.perf_counter() - initialTime),
      "seconds")

### 2. Basics of Pandas Library

In [None]:
import pandas as pd

In [None]:
# NOTE(review): absolute, machine-specific Windows path. The notebook only runs
# where this exact file exists; consider a path relative to the notebook.
path = r"C:\Users\Purushotham\Desktop\deloitte\2022\July\week 01\python\13_case_study\students.csv"

In [None]:
# Load the CSV at `path` into a DataFrame and display it
df = pd.read_csv(path)
df

##### Working with Columns

In [None]:
# Column labels of the frame
df.columns

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# First rows of the frame (5 by default)
df.head()

In [None]:
# Last rows of the frame (5 by default)
df.tail()

In [None]:
# describe is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary statistics
df.describe()

In [None]:
# Selecting a single column by label returns a Series
df['name']

In [None]:
# Selecting with a list of labels returns a DataFrame with just those columns
# NOTE(review): assumes the CSV already contains 'rank' and 'avg' columns; a later
# section of this notebook computes both from the marks, so confirm they exist here
df1 = df[['rank', 'name', 'age', 'avg']]
df1

In [None]:
# Creating a new column
# 'dummy' is a copy of 'name'; 'total' is the row-wise sum of the four subject marks
df['dummy'] = df['name']
df['total'] = df['phy'] + df['chem'] + df['bio'] + df['math']
df

In [None]:
# Delete a column: drop() returns a new frame (df2); df itself still has 'dummy'
df2 = df.drop('dummy', axis = 1)
df2

In [None]:
# df is unchanged by the drop above
df.head()

In [None]:
# Dropping a column permanently from the dataframe
# inplace=True mutates df, so re-running this cell raises KeyError ('dummy' is already gone)
df.drop('dummy', axis = 1, inplace = True)
df

##### Working with the Rows

In [None]:
# Row whose index label is 1 (label-based lookup)
df.loc[1]

In [None]:
# Several rows by index label
df.loc[[1, 5, 8]]

In [None]:
# Rows by integer position, returned in the order requested
df.iloc[[3,1,5,9]]

In [None]:
# Returns a new frame without the rows labelled 1, 2 and 3; df is not modified
df.drop([1, 2, 3]) # inplace = True to permanently delete

In [None]:
# df still contains every row
df

In [None]:
# Boolean mask: keep only the rows whose total is below 300
df[df['total'] < 300]

### 3. Solve the student report project using pandas

In [None]:
# Goal: calculate the average and the rank for each student in the data,
# then write the result to a file

In [None]:
# Reload the student data from `path` (defined in an earlier cell)
import pandas as pd
df = pd.read_csv(path)
df.head()

In [None]:
# Row-wise mean (axis=1) of the four subject columns
df['avg'] = df[['phy', 'chem', 'math', 'bio']].mean(axis=1)
df.head()

In [None]:
# Dense ranking on avg, highest average first: ties share a rank and no rank is skipped
df['rank'] = df['avg'].rank(method = 'dense', ascending = False)
df

In [None]:
# Write the report to an Excel file in the current working directory
# NOTE(review): needs an Excel writer engine (e.g. openpyxl) installed; confirm environment
df.to_excel('finalreport.xlsx')

### 4. Reading from Different Sources

In [None]:
# Web page whose HTML tables are parsed below
url = r"https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/"

In [None]:
# read_html returns a list of DataFrames, one per table found (requires network access)
df_fdic = pd.read_html(url)
df_fdic[0]

In [None]:
# Confirms that df_fdic is a list, not a single DataFrame
type(df_fdic)

### 5. Pandas Series

In [None]:
# Sample inputs for building a Series: index labels, a list, a NumPy array and a dict
labels = ['a','b','c']
my_list = [10,20,30]
arr = np.array([10,20,30])
d = {'a':10,'b':20,'c':30}

In [None]:
# Using lists
# no index is given, so a default integer index is used
pd.Series(data=my_list)

In [None]:
# Same data with explicit index labels
pd.Series(data=my_list, index = labels)

In [None]:
# Using a numpy array
# positional arguments are (data, index)
pd.Series(arr, labels)

In [None]:
# Using a dictionary
# the dict keys become the index
pd.Series(d)

In [None]:
# A Series can hold arbitrary Python objects, here built-in functions
pd.Series([sum, len, print])

In [None]:
# Two Series sharing the labels USA, Germany and Japan;
# 'USSR' exists only in ser1 and 'Italy' only in ser2
ser1 = pd.Series([1,2,3,4],index = ['USA', 'Germany','USSR', 'Japan'])
ser2 = pd.Series([1,2,5,4],index = ['USA', 'Germany','Italy', 'Japan'])

In [None]:
ser1

In [None]:
ser2

In [None]:
# Label-based lookup of a single value
ser1['USA']

In [None]:
# Addition aligns on index labels; labels present in only one Series give NaN
ser1 + ser2

### 6. Pandas DataFrames

In [None]:
from numpy.random import randn
# Seed NumPy's global random generator so the values below are reproducible
np.random.seed(101)

In [None]:
# 5x4 frame of random normal samples with row labels A-E and column labels W-Z
df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())
df

In [None]:
# Single column label -> Series
df['W']

In [None]:
# List of column labels -> DataFrame
df[['W', 'Z']]

In [None]:
# Attribute-style access to column W (same result as df['W'])
df.W
In [None]:
# Every column in a dataframe is nothing but a pandas series

##### Create a new column

In [None]:
# New column holding the element-wise sum of W and Y
df['new'] = df['W'] + df['Y']
df

##### Removing a Column

In [None]:
# axis=1 targets columns; inplace=True mutates df and returns None, so this cell shows no output
df.drop('new', axis=1, inplace=True)

In [None]:
# 'new' is gone from df
df

##### Selecting rows

In [None]:
# Row labelled 'A', returned as a Series
df.loc['A']

In [None]:
# Several rows by label
df.loc[['A', "B"]]

In [None]:
df.loc['B', 'Y'] # [row, column]

In [None]:
# Sub-frame: rows A and B, columns W and Y
df.loc[['A', 'B'], ['W', 'Y']]

In [None]:
df.iloc[1] # Access using indices (integer positions)

In [None]:
# Single value at row position 1, column position 3
df.iloc[1, 3]

##### Conditional Selection

In [None]:
df

In [None]:
# Element-wise comparison -> boolean DataFrame of the same shape
df > 0

In [None]:
# Masking with a boolean frame keeps the shape; entries where the mask is False become NaN
df[df > 0]

In [None]:
# Masking with a boolean Series filters rows: keep the rows where W is positive
df[df['W'] > 0]

In [None]:
# Filter the rows first, then pick columns X and Y from the result
df[df['W'] > 0][['X', 'Y']]

In [None]:
# Combine conditions with & (element-wise and); each condition needs its own parentheses
df[(df['W'] > 0) & (df['Y'] > 1)]

##### Working with indexes

In [None]:
df

In [None]:
# Returns a new frame with the old index moved into a column; df is not modified
df.reset_index()

In [None]:
# One label per row (five rows), stored as an ordinary column
newind = ['CA', 'NY', 'WY', 'TX', 'NO']
df['States'] = newind

In [None]:
df

In [None]:
# Returns a new frame indexed by States; df keeps its original index
df.set_index('States')

In [None]:
# Index Levels
# Pair each outer group label with an inner number: six (group, number) tuples
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
# Turn the tuples into a two-level MultiIndex
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

In [None]:
# 6x2 frame of random values indexed by the two-level MultiIndex
df = pd.DataFrame(np.random.randn(6, 2), index=hier_index, columns=['A', "B"])
df

In [None]:
# All rows under the outer label 'G1'
df.loc['G1']

In [None]:
# Chained lookup: outer label 'G1', then inner label 1
df.loc['G1'].loc[1]

In [None]:
# Current names of the index levels
df.index.names

In [None]:
# Name the two index levels
df.index.names = ['Group', 'Num']
df

In [None]:
# Cross-section: the rows under 'G1'
df.xs('G1')

In [None]:
# A multi-level key must be a tuple: list keys for xs() were deprecated and are
# rejected by pandas 2.x
df.xs(('G1', 1))

In [None]:
# Cross-section on the inner level: the rows with Num == 1 from every group
df.xs(1, level='Num')

### 7. Handling Missing Values

In [None]:
# Small frame with gaps: A has one NaN, B has two, C is complete
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan],
        B=[5, np.nan, np.nan],
        C=[1, 2, 3],
    )
)

In [None]:
df

In [None]:
# Drop every row that contains at least one NaN
df.dropna()

In [None]:
# Drop every column that contains at least one NaN
df.dropna(axis=1)

In [None]:
# Keep only the rows that have at least 2 non-NaN values
df.dropna(thresh=2)

In [None]:
# Replace every NaN with 50
df.fillna(value=50)

In [None]:
# Fill the NaN in column A with the mean of its non-missing values
df['A'].fillna(value=df['A'].mean())

### 8. Merging, Joining and Concatenating DataFrames

In [None]:
# Rows labelled 0-3; every cell is its column letter followed by its row label
df1 = pd.DataFrame(
    {letter: [f'{letter}{row}' for row in range(0, 4)] for letter in 'ABCD'},
    index=list(range(0, 4)),
)

In [None]:
# Rows labelled 4-7; every cell is its column letter followed by its row label
df2 = pd.DataFrame(
    {letter: [f'{letter}{row}' for row in range(4, 8)] for letter in 'ABCD'},
    index=list(range(4, 8)),
)

In [None]:
# Rows labelled 8-11; every cell is its column letter followed by its row label
df3 = pd.DataFrame(
    {letter: [f'{letter}{row}' for row in range(8, 12)] for letter in 'ABCD'},
    index=list(range(8, 12)),
)

In [None]:
# Display the three input frames (index 0-3, 4-7 and 8-11) before combining them
df1

In [None]:
df2

In [None]:
df3

##### Concatenation

In [None]:
# Default axis=0 stacks the frames vertically; their index labels do not overlap
pd.concat([df1, df2, df3])

In [None]:
# axis=1 places the frames side by side, aligned on the index; no labels are
# shared between the frames, so the cells without a source value are NaN
pd.concat([df1, df2, df3], axis = 1)

##### Merging

In [None]:
# Two frames sharing a 'key' column (K0-K3): left carries A and B, right carries C and D
left = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})

right = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})

In [None]:
left

In [None]:
right

In [None]:
# Inner join on 'key': keeps only the keys present in both frames (here all four)
pd.merge(left, right, how='inner', on='key')

In [None]:
# Frames keyed by the pair (key1, key2); only some pairs occur on both sides
left = pd.DataFrame(
    dict(
        key1=['K0', 'K0', 'K1', 'K2'],
        key2=['K0', 'K1', 'K0', 'K1'],
        A=['A0', 'A1', 'A2', 'A3'],
        B=['B0', 'B1', 'B2', 'B3'],
    )
)

right = pd.DataFrame(
    dict(
        key1=['K0', 'K1', 'K1', 'K2'],
        key2=['K0', 'K0', 'K0', 'K0'],
        C=['C0', 'C1', 'C2', 'C3'],
        D=['D0', 'D1', 'D2', 'D3'],
    )
)

In [None]:
right

In [None]:
left

In [None]:
# Default how='inner': only the (key1, key2) pairs present in both frames
pd.merge(left, right, on=['key1', 'key2'])

In [None]:
# Outer join: union of the key pairs from both sides, NaN where one side has no match
pd.merge(left, right, on=['key1', 'key2'], how='outer')

In [None]:
# Right join: every row of right, with left's columns filled in where a match exists
pd.merge(left, right, on=['key1', 'key2'], how='right')

In [None]:
# Left join: every row of left, with right's columns filled in where a match exists
pd.merge(left, right, on=['key1', 'key2'], how='left')

##### Joining

In [None]:
# Frames that share index labels K0 and K2; K1 is only in left, K3 only in right
left = pd.DataFrame(
    dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)

right = pd.DataFrame(
    dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)

In [None]:
left

In [None]:
right

In [None]:
# join() matches on the index; the default how='left' keeps left's labels (K0, K1, K2)
left.join(right)

In [None]:
# Same operation from the other side: keeps right's labels (K0, K2, K3)
right.join(left)

In [None]:
# Outer join: union of both indexes
left.join(right, how='outer')

### 9. Grouping

In [None]:
# Two sales records per company
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)

In [None]:
# Build the frame from the dict defined in the previous cell
df = pd.DataFrame(data)
df

In [None]:
# groupby returns a DataFrameGroupBy object; nothing is aggregated until a method is called on it
df.groupby('Company')

In [None]:
# Keep the grouped object so it can be reused for several aggregations
by_company = df.groupby('Company')
by_company

In [None]:
# numeric_only=True restricts the mean to Sales; without it pandas >= 2 raises
# TypeError because the string column 'Person' cannot be averaged
by_company.mean(numeric_only=True)

In [None]:
# numeric_only=True keeps the result to Sales on every pandas version; the
# pandas >= 2 default would also string-concatenate the 'Person' names
by_company.sum(numeric_only=True)

In [None]:
# numeric_only=True restricts the standard deviation to Sales; without it
# pandas >= 2 fails on the string column 'Person'
by_company.std(numeric_only=True)

In [None]:
# Per-group minimum of every column (strings are compared lexicographically)
by_company.min()

In [None]:
# Per-group maximum of every column
by_company.max()

In [None]:
# Number of non-null entries per column in each group
by_company.count()

In [None]:
# Summary statistics per group
by_company.describe()

In [None]:
# Transposed so that the companies become the columns
by_company.describe().transpose()

In [None]:
# Pick the GOOG column from the transposed summary
by_company.describe().transpose()['GOOG']

### 10. Miscellaneous Operations

In [None]:
# Four-row demo frame: integers, integers with a repeated value, and short strings
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)
df.head()

##### Information on unique values

In [None]:
# Distinct values in col2, returned as an array
df['col2'].unique()

In [None]:
# Number of distinct values in col2
df['col2'].nunique()

In [None]:
# How many times each value occurs in col2
df['col2'].value_counts()

##### Selecting data

In [None]:
# Keep the rows where col1 is greater than 2 and col2 equals 444
newdf =  df[(df['col1'] > 2) & (df['col2'] == 444)]
newdf

##### Applying Functions

In [None]:
def times2(x):
    """Return ``x`` multiplied by 2 (``x * 2``)."""
    doubled = x * 2
    return doubled

In [None]:
# Apply the user-defined function to every element of col1
df['col1'].apply(times2)

In [None]:
# Built-ins work as well: the length of each string in col3
df['col3'].apply(len)

In [None]:
# Total of col1
df['col1'].sum()

##### Sorting

In [None]:
# Returns a new frame sorted by col2 in ascending order; df itself is not reordered
df.sort_values(by = 'col2')

##### Checking for null values

In [None]:
# Boolean frame of the same shape: True where a value is missing
df.isnull()

##### Pivot

In [None]:
# Long-format records: A and B identify a row group, C a category, D the value
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=['x', 'y', 'x', 'y', 'x', 'y'],
    D=[1, 3, 2, 5, 4, 1],
)

df = pd.DataFrame(data)

In [None]:
df

In [None]:
# Rows are the (A, B) pairs, columns are the values of C, cells are the mean of D
# (pivot_table's default aggregation); combinations absent from the data show NaN
df.pivot_table(values = 'D', index = ['A', "B"], columns=['C'])