# NumPy

NumPy is a library whose core is written in the C programming language. Since C is a statically typed, compiled language, NumPy's array operations are much faster than the same work done in a dynamically typed language like Python.

## Basics

In [None]:
# Arrays: faster way to handle data in Python
import numpy as np

# Build a one-dimensional array from a Python sequence
array01 = np.asarray((1, 5, 9, 3, 5, 7))
# NumPy arrays are homogeneous: every element shares a single dtype

# A sequence of equal-length sequences becomes a multi-dimensional array
array02 = np.array([list(range(start, start + 3)) for start in (2, 4, 6)])
print(array02, end="\n\n")

# Ten zeros; dtype selects the element type (integers here)
array03 = np.zeros(shape=10, dtype=int)
print(array03, end="\n\n")

# A 3x5 array of ones stored as floating-point numbers
array04 = np.ones(shape=(3, 5), dtype=np.float64)
print(array04)

# Each result below is bound to a name and printed. A bare expression
# statement would build the array and then silently discard it when this
# code runs as a plain script.

# Create a 3x5 array filled with 3.14
array05 = np.full((3, 5), 3.14)
print(array05)
print()

# Create an array filled with a linear sequence
# Starting at 0, ending before 20, stepping by 2
# (this is similar to the built-in range() function)
array06 = np.arange(0, 20, 2)
print(array06)
print()

# Create an array of five values evenly spaced between 0 and 1 (both included)
array07 = np.linspace(0, 1, 5)
print(array07)
print()

# Create a 3x3 array of uniformly distributed
# random values in the interval [0, 1)
array08 = np.random.random((3, 3))
print(array08)
print()

# Create a 3x3 array of normally distributed random values
# with mean 0 and standard deviation 1
array09 = np.random.normal(0, 1, (3, 3))
print(array09)
print()

# Create a 3x3 array of random integers in the interval [0, 10)
array10 = np.random.randint(0, 10, (3, 3))
print(array10)

## Array Attributes

In [None]:
import numpy as np

# Array Attributes

# Draw random integers from [0, 10) into a three-dimensional array;
# size gives the length of each of the three axes
x1 = np.random.randint(0, 10, size=(3, 4, 5))
print(x1)
print("--------------------------------------------------------")
# ndim is the number of axes, shape is the length of every axis,
# and size is the total number of elements
print("x1 ndim: ", np.ndim(x1))
print("x1 shape:", np.shape(x1))
print("x1 size: ", np.size(x1))  # 3*4*5

# dtype reports the element type shared by the whole array
print("x1 dtype:", x1.dtype)
print("--------------------------------------------------------")
# Two more attributes, not printed here: itemsize (bytes per element)
# and nbytes (bytes used by the whole array)


# Array Indexing

# Any index expression can also be used as an assignment target
print(x1[0][0])
x1[(0, 0, 0)] = 5
print(x1[0][0])
print("--------------------------------------------------------")

# Array Slicing

# General form: x[start:stop:step]; an omitted part falls back to
# start=0, stop=size of dimension, step=1

x2 = np.arange(0, 10)
# Every second element starting at index 1. The slice is a view rather than
# a copy, so writes through x2_sub are visible in x2
x2_sub = x2[slice(1, None, 2)]
# An independent copy that no longer shares data with x2
x2_sub_copy = x2_sub.copy(order="C")
print(x2, x2_sub, sep="\n")
print("--------------------------------------------------------")

# Reshaping of Arrays

# Lay the integers 1..9 out as a 3x3 grid
grid = np.reshape(np.arange(1, 10), (3, 3))
print(grid)
print("--------------------------------------------------------")

# Array Concatenation and Splitting

x = np.array((1, 2, 3))
y = np.array((3, 2, 1))
# Join the two arrays end to end, then fold the six values into 2 rows
z = np.reshape(np.concatenate((x, y)), (2, 3))
print(z, end="\n\n")

# x is modified only after z has been built from it
x[0] = 9
# Stack x on top of the rows of z
print(np.vstack((x, z)), end="\n\n")

# A 2x1 column, appended to the right-hand side of z
y = np.full((2, 1), 99)
print(np.hstack((z, y)))
print("--------------------------------------------------------")

# Splitting of arrays
x = [1, 2, 3, 99, 99, 3, 2, 1]
# Cut before positions 3 and 5, which yields three pieces
x1, x2, x3 = np.split(x, indices_or_sections=(3, 5))
print(x1, x2, x3)
print("--------------------------------------------------------")


# Computation on NumPy

# np.absolute() or np.abs()
# np.sin(), cos, tan
# np.power(3, x)
# np.log(x)


# Aggregations on NumPy

# np.sum()
# np.max(), np.min()


# Broadcasting

# A row of shape (3,) and a column of shape (3, 1)
x = np.arange(3)
y = np.arange(3).reshape(3, 1)
print(x)
print(y)
# Adding them stretches both operands to a common 3x3 shape
print(np.add(x, y))
print("--------------------------------------------------------")


# Boolean Arrays

# np.all() and np.any() reduce an array to a boolean result

# Masking
x = np.arange(10)
# Index with a boolean array to keep only the values that satisfy the condition
masked = x[np.greater(x, 5)]
print(masked)

## Indices and Sorting

In [None]:
import numpy as np

x = np.arange(2, 7)
np.random.shuffle(x)  # reorders the values of x in place
print(x)
# Positions that would put x into ascending order
i = x.argsort()
print(i)
#%%
import numpy as np

# A generator seeded with 42, so every run prints the same numbers
rand = np.random.RandomState(seed=42)
# 4x4 integers drawn from the interval [5, 35)
z = rand.randint(low=5, high=35, size=(4, 4))
print(z, end="\n\n")

# Sort along axis 0: each column is ordered independently
print(np.sort(z, axis=0), end="\n\n")

# Sort along axis 1: each row is ordered independently
print(np.sort(z, axis=1))
#%%
import numpy as np

x = np.array((5, 8, 7, 6, 8, 2, 1, 3, 0))
# Partial sort around index 4: the value a full sort would place at index 4
# ends up there, with no larger value before it and no smaller value after it
print(np.partition(x, kth=4))
#%% Structured Data
import numpy as np

# Several kinds of information about a group of people can be kept
# together in one structured array instead of three separate lists
name = ['Alice', 'Bob', 'Cathy', 'Doug']
age = [25, 45, 37, 19]
weight = [55.0, 85.5, 68.0, 61.5]

# A compound dtype names every field and gives its format: a unicode string
# of at most 10 characters, a 4-byte integer and an 8-byte float
data = np.zeros(4, dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f8')])
print(data.dtype, end="\n\n")

# Fill the fields one by one from the lists above
data['name'], data['age'], data['weight'] = name, age, weight
print(data)
#%% How to get the minimum and maximum value of a given NumPy array along the second axis?
import numpy as np
data = [[10, 10, 90, 90],
        [40, 40, 60, 60],
        [55, 55, 65, 65],
        [10, 30, 60, 90]]
a1 = np.array(data)

# axis=0 is the first axis: reducing along it walks down the rows and gives
# one value per column. axis=1 is the second axis: reducing along it walks
# across the columns and gives one value per row.
# np.amax / np.amin are aliases of np.max / np.min, so the axis argument
# means exactly the same thing for both spellings.
amax = np.amax(a1, 1)  # maximum of every row (second axis), positional axis
amax2 = np.max(a1, axis=1)  # same reduction, axis passed by keyword
amin = np.amin(a1, 1)  # minimum of every row (second axis)
amin2 = np.min(a1, axis=1)

print(amax)
print(amax2)
print(amin)
print(amin2)
#%%
# Pandas

# Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame.
# DataFrames are essentially multidimensional arrays with attached row and column labels.
# Series and DataFrame objects build on the NumPy array structure and provide efficient access to these sorts of "data munging" tasks.

# The Pandas Series Object
# A Pandas Series is a one-dimensional array of indexed data (similar to dictionary). It can be created from a list or array.

import numpy as np
import pandas as pd

data = pd.Series((0.25, 0.50, 0.75, 1.00))

print(data.to_numpy())  # the values of the series
print(data.index)  # the index of the series, same as print(data.keys())
print(list(data.items()), end="\n\n")

# Slicing works by position
print(data.iloc[1:3], end="\n\n")

# Masking: keep the entries strictly between 0.3 and 0.8
print(data.loc[data.gt(0.3) & data.lt(0.8)])
#%%
import numpy as np
import pandas as pd

# The index does not have to be integers: here the labels are letters
data = pd.Series([0.25, 0.50, 0.75, 1.00], index=list("abcd"), name="data")
# Label slices include the end label, so this selects "b" and "c"
print(data.loc["b":"c"], end="\n\n")

# A dictionary also works: its keys become the index
data = pd.Series(dict(a=1, b=2))
print(data)
#%% The Pandas Dataframe Object

# If a Series is an analog of a one-dimensional array with flexible indices, a DataFrame is an analog of a two-dimensional array
# with both flexible row indices and flexible column names.

import numpy as np
import pandas as pd

population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)

area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)

# Two Series that share an index become the columns of one DataFrame
states = pd.DataFrame(dict(population=population, area=area))
print(states)
print(states.index)  # the row labels of the dataframe
print(states.columns)  # the column labels of the dataframe
print(states["area"])  # attribute access, states.area, is equivalent
print(states.population)  # same as states["population"]

# A new column can be derived from existing ones
states["density"] = states["population"].div(states["area"])

print(states)

# Even if some keys in the dictionary are missing, Pandas will fill them in with NaN
# https://www.analyticsvidhya.com/blog/2021/11/a-simple-guide-to-pandas-dataframe-operations/
import numpy as np
import pandas as pd

# Sample data used to demonstrate the ways of accessing a dataframe
points_table = {
    'Team': ['MI', 'CSK', 'Devils', 'MI', 'CSK', 'RCB',
             'CSK', 'CSK', 'KKR', 'KKR', 'KKR', 'RCB'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015,
             2016, 2017, 2016, 2014, 2015, 2017],
    'Point': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
}
df = pd.DataFrame(points_table)

# df.head(n) gives the first n rows of the data, default n=5
# df.tail(n) gives the last n rows of the data, default n=5
# df.sample(n) gives n random rows of the data, default n=1

# All column labels, returned as an Index object
print(df.columns)
print()

# A plain slice selects rows by position
print(df.iloc[0:2])
print()

# Pick certain rows and columns in one step: with loc/iloc the row selector
# comes first and the column selector second
teams = df.loc[[2, 4, 6], ["Team", "Rank"]]
print(teams)
print()

# Masking narrows the dataframe down to the rows that satisfy a condition;
# conditions can be combined with | (or) and & (and)
best = df[df["Rank"].eq(3)]
print(best)
print()

# A column can also be edited as a plain list and written back
teams = list(df["Team"])
teams[0] = "RMA"
df["Team"] = teams
print(df.head())
print()

# to read a csv file : dataframe = pd.read_csv("data.csv")

# dataframe.isna().sum()

# We use the drop(), dropna(), ffill() and fillna() functions to delete or fill specific columns; drop() can also delete multiple columns at the same time.

# DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
# labels: single label or list (index or name of the columns to drop).
# axis: {0 or 'index', 1 or 'columns'}, its default value is 0.
# columns: an alternative to passing labels together with axis=1 for naming the columns to drop (columns=labels).
# level: If there are multiple indexes present in the DataFrame then we will pass the level.
# inplace: If false then return a copy. Otherwise do operation inplace and return none.
import numpy as np
import pandas as pd

df = pd.read_csv("data.csv")

# Work on an independent copy of the first 20 rows so df itself stays untouched
newDF = df.head(20).copy()

# Replace missing Calories values with the column mean
M = newDF["Calories"].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on
# newDF["Calories"]: that form modifies a temporary Series, raises a warning
# in pandas 2.x and leaves newDF unchanged under Copy-on-Write
newDF["Calories"] = newDF["Calories"].fillna(M)

# Missing Pulse values become 0
newDF["Pulse"] = newDF["Pulse"].fillna(0)

# dropna() returns a new frame without the rows that still hold missing
# values (default inplace=False), so the result is bound to newDF again
newDF = newDF.dropna()

print(newDF)
print()
# Keep only the rows whose Maxpulse is above 135
last = newDF.loc[newDF["Maxpulse"] > 135]
print(last)
#%% A Pandas DataFrame operates much like a structured array, and can be created directly from one
import numpy as np
import pandas as pd

# A structured array with an 8-byte integer field "A" and an 8-byte float field "B"
A = np.zeros(3, dtype={'names': ('A', 'B'), 'formats': ('i8', 'f8')})

# Every field of the structured array becomes a column of the dataframe
data = pd.DataFrame(data=A)
print(data)
#%% The Pandas Index Object: Immutable array, duplication allowed
import numpy as np
import pandas as pd

indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
# intersection() keeps the labels found in both indexes;
# union() would combine the labels of both instead
indC = pd.Index.intersection(indA, indB)
print(indC)
#%% Indexers: loc, iloc, and ix (ix was removed in pandas 1.0)
import numpy as np
import pandas as pd

data = pd.Series(list("abc"), index=pd.Index([1, 3, 5]))

# loc always refers to the explicit index, i.e. the labels 1, 3, 5 given above;
# a label slice includes its end label, so 1:3 selects labels 1 and 3
print(data.loc[1:3], end="\n\n")

# iloc always refers to the implicit Python-style position (starting from 0);
# the end position is excluded, so 1:3 selects the second and third entries
print(data.iloc[1:3])