# Module 7 Intro to **NDarray**

In [None]:
import numpy as np
# Build a 2x3 array of 32-bit integers from nested lists, then display it.
x = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    dtype=np.int32,
)
x

array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)

In [None]:
# Inspect the Python type of the array object created in the first cell.
type(x)

numpy.ndarray

In [None]:
# Size of the array along each axis, as a tuple (rows, columns).
x.shape

(2, 3)

In [None]:
# Data type shared by every element of the array.
x.dtype

dtype('int32')

In [None]:
# Index a single element: row 1, column 2 (both zero-based).
x[1,2]

6

In [None]:
# Take column 1 from every row of x.
# The following cells show that writing to y also changes x.
y = x[:,1]
y

array([2, 5], dtype=int32)

In [None]:
# Overwrite the first element of the column slice taken from x.
y[0] = 9
y

array([9, 5], dtype=int32)

In [None]:
# Re-display x: the value assigned through y now appears at row 0, column 1.
x

array([[1, 9, 3],
       [4, 5, 6]], dtype=int32)

# **Module 7 NumPy Basic Operations**

In [None]:
# Two 2x2 integer matrices reused by the cells that follow.
A = np.array([
    [3, 2],
    [0, 1],
])
B = np.array([
    [3, 1],
    [2, 1],
])
# Element-wise sum of the two matrices.
A + B

array([[6, 3],
       [2, 2]])

In [None]:
# Element-wise product: each entry of A times the matching entry of B
# (this is not matrix multiplication).
A*B

array([[9, 2],
       [0, 1]])

In [None]:
# Matrix product using the @ operator.
A@B

array([[13,  5],
       [ 2,  1]])

In [None]:
# The same matrix product, written with the ndarray.dot method.
A.dot(B)

array([[13,  5],
       [ 2,  1]])

In [None]:
# Transpose of A: rows become columns.
A.transpose()

array([[3, 0],
       [2, 1]])

In [None]:
# Matrix inverse of A; the result is a floating-point array.
np.linalg.inv(A)

array([[ 0.33333333, -0.66666667],
       [ 0.        ,  1.        ]])

# **Module 8 NDarray Statistics**

In [None]:
import numpy as np
# Draw 10 samples from a normal distribution with mean 5 and standard deviation 0.5.
# NOTE(review): no random seed is set, so the printed values differ on every run.
normal_array = np.random.normal(5, 0.5, 10)
print(normal_array)

[4.67511302 5.8306368  5.23662839 4.5302857  4.77728861 5.0535762
 5.20500984 4.57895788 5.03124508 5.32208671]


In [None]:
# Summary statistics of normal_array, printed one per line in this order:
# minimum, maximum, mean, median, standard deviation.
for stat_fn in (np.min, np.max, np.mean, np.median, np.std):
    print(stat_fn(normal_array))

4.530285699255719
5.830636795381895
5.024082821957012
5.042410637801121
0.37984550888881646


# **Module 9 Numpy NDarray Concatenate**

In [None]:
# The integers 0 through 8 as a one-dimensional array.
array = np.arange(0, 9)
array

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [None]:
# Arrange the nine elements as a 3x3 matrix, filled row by row.
array2D_1 = np.reshape(array, (3, 3))
array2D_1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [None]:
# A second 3x3 matrix holding the integers 10 through 18.
array2D_2 = np.reshape(np.arange(10, 19), (3, 3))
array2D_2

array([[10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [None]:
# With no axis argument the second matrix is appended below the first (row-wise).
np.concatenate((array2D_1, array2D_2))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [None]:
# axis=1 places the matrices side by side (column-wise).
np.concatenate((array2D_1,array2D_2),axis=1)

array([[ 0,  1,  2, 10, 11, 12],
       [ 3,  4,  5, 13, 14, 15],
       [ 6,  7,  8, 16, 17, 18]])

In [None]:
# More than two arrays can be joined in a single call.
np.concatenate((array2D_1, array2D_2, array2D_1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8]])

In [None]:
# vstack stacks the matrices vertically; the result matches the row-wise concatenate above.
np.vstack((array2D_1, array2D_2))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [None]:
# hstack stacks the matrices horizontally; the result matches concatenate with axis=1 above.
np.hstack((array2D_1, array2D_2))

array([[ 0,  1,  2, 10, 11, 12],
       [ 3,  4,  5, 13, 14, 15],
       [ 6,  7,  8, 16, 17, 18]])

# **Module 10 Pandas Missing Data**

In [None]:
import numpy as np
# Three score lists of equal length; np.nan marks a missing score.
# NOTE(review): the name `dict` shadows Python's built-in dict type. It is left
# unchanged here because the next cell passes this variable to pd.DataFrame.
dict = {'First Score':[100, 90, np.nan, 95], 
        'Second Score': [30, 45, 56, np.nan], 
        'Third Score':[np.nan, 40, 80, 98]} 
dict

{'First Score': [100, 90, nan, 95],
 'Second Score': [30, 45, 56, nan],
 'Third Score': [nan, 40, 80, 98]}

In [None]:
import pandas as pd
# Build a DataFrame with one column per score list.
df = pd.DataFrame(dict) 
# isnull() returns a same-shaped frame of booleans: True where a value is missing.
df.isnull() 

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


# **Numpy Missing Data**

In [None]:
# A float array with one missing value.
# Use the lowercase np.nan spelling: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
np.array([1.0, 2.0, np.nan, 7.0])

array([ 1.,  2., nan,  7.])

In [None]:
# Example data containing one NaN and one infinity.
# Note: this rebinds the name x used earlier in the notebook.
x = np.array([1.0, 2.5, np.nan, 1.3, np.inf, 7.2])
print("input array with bad values:", x, sep="\n")

# Mask every invalid entry (the NaN and the inf) so it prints as "--".
xm = np.ma.masked_invalid(x)
print("masked version:", xm, sep="\n")

input array with bad values:
[1.  2.5 nan 1.3 inf 7.2]
masked version:
[1.0 2.5 -- 1.3 -- 7.2]


In [None]:
# x carries an explicit all-False mask; y is a masked array created without a mask.
x = np.ma.array([1, 2, 3], mask=[False] * 3)
y = np.ma.array([1, 0, 1])
# z is a plain ndarray containing a NaN.
z = np.array([1, np.nan, 2])

# For each right-hand operand, print the product and then the quotient.
for operand in (y, z):
    print(x * operand)
    print(x / operand)

[1 0 3]
[1.0 -- 3.0]
[1.0 nan 6.0]
[1.0 -- 1.5]


# **Module 10 Duplicated Data**

In [None]:
# importing pandas package 
import pandas as pd 

# Read the employee CSV and order the rows by first name in a single expression.
data = pd.read_csv("./sample_data/employees.csv").sort_values("First Name")

# Boolean Series: False for the first occurrence of each first name,
# True for every later repeat.
bool_series = data["First Name"].duplicated(keep="first")
bool_series



101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [None]:
# Plain-text preview of the first five rows of the sorted frame.
print(data.head(5))

# Rows flagged as duplicates, i.e. repeats of an already-seen first name.
data.loc[bool_series]


    First Name Gender Start Date  ... Bonus %  Senior Management             Team
101      Aaron   Male  2/17/2012  ...  11.849               True        Marketing
327      Aaron   Male  1/29/1994  ...   5.097               True        Marketing
440      Aaron   Male  7/22/1990  ...  11.343               True  Client Services
937      Aaron    NaN  1/22/1986  ...  18.424              False  Client Services
137       Adam   Male  5/21/2011  ...  15.120              False     Distribution

[5 rows x 8 columns]


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1/29/1994,6:48 PM,58755,5.097,True,Marketing
440,Aaron,Male,7/22/1990,2:53 PM,52119,11.343,True,Client Services
937,Aaron,,1/22/1986,7:39 PM,63126,18.424,False,Client Services
141,Adam,Male,12/24/1990,8:57 PM,110194,14.727,True,Product
302,Adam,Male,7/5/2007,11:59 AM,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,5/23/2001,7:52 PM,103877,6.322,,Distribution
925,,Female,8/23/2000,4:19 PM,95866,19.388,,Sales
946,,Female,9/15/1985,1:50 AM,133472,16.941,,Distribution
947,,Male,7/30/2012,3:07 PM,107351,5.329,,Marketing


In [None]:


# Negate the duplicate mask to keep only the first occurrence of each first name.
# This rebinds `data` to the filtered frame.
data = data[~bool_series]

# info() prints its report directly and returns None, so it is called bare:
# wrapping it in print() would emit an extra "None" line after the report.
data.info()
data


<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 101 to 7
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         200 non-null    object 
 1   Gender             178 non-null    object 
 2   Start Date         201 non-null    object 
 3   Last Login Time    201 non-null    object 
 4   Salary             201 non-null    int64  
 5   Bonus %            201 non-null    float64
 6   Senior Management  200 non-null    object 
 7   Team               197 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 14.1+ KB
None


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2/17/2012,10:20 AM,61602,11.849,True,Marketing
137,Adam,Male,5/21/2011,1:45 AM,95327,15.120,False,Distribution
300,Alan,Male,6/26/1988,3:54 AM,111786,3.592,True,Engineering
372,Albert,Male,2/1/1997,4:20 PM,67827,19.717,True,Engineering
988,Alice,Female,10/5/2004,9:34 AM,47638,11.209,False,Human Resources
...,...,...,...,...,...,...,...,...
433,Wanda,Female,7/20/2008,1:44 PM,65362,7.132,True,Legal
177,Wayne,Male,4/7/2012,8:00 AM,102652,14.085,True,Distribution
820,William,Male,11/18/1993,12:27 PM,54058,5.182,True,Human Resources
450,Willie,Male,8/22/2009,1:03 PM,55038,19.691,False,Legal
