Pre-process:

    1) Cleaning
    
        a) Fill missing:
        
            • Ignore row
            
            • Fill with constants, mean, median, mode
            
            • Predict the missing
            
        b) Smoothing noisy data:
        
            • Remove outliers
            
            • Discretization
            
            • Predict the noisy
            
    2) Reducing or adding
    
        a) Column reduction
        
            • Use correlation to remove redundant
            
            • Compression: PCA
            
        b) Row reduction
        
            • Sampling
            
            • Cluster
            
            • Aggregation
            
        c) Feature construction
        
            • Polynomial
            
            • New features
            
    3) Transforming
    
        a) Normalization or min-max
        
        b) Ordinal encoding or one-hot encoding

In [22]:
from pandas import DataFrame, cut, qcut


# Toy table: one row per country label, described column by column.
df = DataFrame(
    {
        'year': [2020, 2020, 2019],
        'pop': [1.5, 1.7, 1.9],
    },
    index=['iran', 'usa', 'japan'],
)

In [23]:
# Select a single row by its index label.
print(df.loc['iran'])

year    2020.0
pop        1.5
Name: iran, dtype: float64


In [24]:
# Select a single row by integer position (0 is the first row).
print(df.iloc[0]) 

year    2020.0
pop        1.5
Name: iran, dtype: float64


In [25]:
# Column labels, followed by the data itself as a 2-D array.
print(df.columns)
print(df.values)

Index(['year', 'pop'], dtype='object')
[[2.020e+03 1.500e+00]
 [2.020e+03 1.700e+00]
 [2.019e+03 1.900e+00]]


In [26]:
# Count how many rows share each distinct year.
df.year.value_counts()

2020    2
2019    1
Name: year, dtype: int64

In [27]:
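# Dropping examples (left commented out so df stays intact for the cells below).
# Rows are dropped by index label; this frame's labels are 'iran', 'usa', 'japan'.
# df.drop('iran', inplace=True)         # drop a row
# df.drop('pop', axis=1, inplace=True)  # drop a column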

In [28]:
# Slice rows by position: rows 0, 1 and 2 (the end of the slice is exclusive).
df[0:3]

Unnamed: 0,year,pop
iran,2020,1.5
usa,2020,1.7
japan,2019,1.9


In [30]:
# Boolean mask: keep only the rows whose year is greater than 2019.
df[df.year > 2019]

Unnamed: 0,year,pop
iran,2020,1.5
usa,2020,1.7


In [35]:
# Combine two conditions with &; each comparison is wrapped in its own parentheses
# because & binds more tightly than > and ==.
df[(df.year > 2019) & (df['pop'] == 1.5)]

Unnamed: 0,year,pop
iran,2020,1.5


In [None]:
# Select several columns, in the listed order, by passing a list of names.
df[['pop','year']]

In [38]:
# .loc with a row mask and a column list: filter rows and pick columns in one step.
df.loc[df['year'] > 2019, ['pop','year']]

Unnamed: 0,pop,year
iran,1.5,2020
usa,1.7,2020


In [41]:
# Keep the rows whose year is one of the listed values.
# NOTE(review): 202 matches no row here -- possibly a typo for 2020; the recorded
# output below reflects 202 (only the 2019 row is returned). Confirm the intent.
df[df['year'].isin([2019, 202])]

Unnamed: 0,year,pop
japan,2019,1.9


In [43]:
# Ordinal-encode the year (2019 -> 19, 2020 -> 20).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df.year`: an in-place call on a column obtained
# this way is a chained assignment, which pandas warns about and which is not
# guaranteed to write through to df (it does nothing under copy-on-write).
df['year'] = df['year'].replace({2019: 19, 2020: 20})

In [45]:
# Set a single cell by integer position: row 0, column 1 (the 'pop' of the first row).
df.iat[0, 1] = 1
df

Unnamed: 0,year,pop
iran,20,1.0
usa,20,1.7
japan,19,1.9


In [46]:
# Mean of each column.
df.mean()

year    19.666667
pop      1.533333
dtype: float64

In [51]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,year,pop
iran,20,1.0
usa,20,1.7
japan,19,1.9


In [53]:
# Show only the last row.
df.tail(1)

Unnamed: 0,year,pop
japan,19,1.9


In [55]:
from pandas import concat
# Two small frames with the same default labels (rows 0-1, columns 0-1),
# used by the concat examples.
df1 = DataFrame([
    ('a', 1),
    ('b', 2),
])
df2 = DataFrame([
    ('c', 1),
    ('d', 2),
])

In [56]:
# Stack the two frames vertically; each keeps its own row labels, so 0 and 1 repeat.
concat([df1,df2])

Unnamed: 0,0,1
0,a,1
1,b,2
0,c,1
1,d,2


In [57]:
# axis=1 places df3's column next to df1's columns instead of stacking rows.
df3 = DataFrame([[True], [False]])
concat([df1,df3], axis=1)

Unnamed: 0,0,1,0.1
0,a,1,True
1,b,2,False


In [None]:
# Load a frame from a CSV file. read_csv is imported here because no earlier
# cell brings it into scope, so the bare call would raise NameError.
from pandas import read_csv

df = read_csv('file.csv')

In [58]:
from pandas import DataFrame, get_dummies

# Small frame with one categorical column ('city') to encode.
df = DataFrame([['Teh', 2],
                ['Tab', 4],
                ['Teh', 6]],
               columns=['city', 'some_value'])

# One-hot encode 'city': one indicator column per distinct value.
# dtype=int keeps the indicators as 0/1 integers (as in the output below) on
# pandas versions whose default indicator dtype is boolean.
one_hot = get_dummies(df['city'], dtype=int)
# Replace the categorical column with its indicator columns.
df = df.drop('city', axis=1)
df = df.join(one_hot)
df

Unnamed: 0,some_value,Tab,Teh
0,2,0,1
1,4,1,0
2,6,0,1


In [59]:
# Draw 2 rows at random; no random_state is passed, so the rows picked may
# differ from the recorded output on another run.
df.sample(n=2)

Unnamed: 0,some_value,Tab,Teh
0,2,0,1
2,6,0,1


In [61]:
# Convert the frame to a plain array of values (row and column labels are not included).
df.to_numpy()

array([[2, 0, 1],
       [4, 1, 0],
       [6, 0, 1]])

In [60]:
# Summary statistics per column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,some_value,Tab,Teh
count,3.0,3.0,3.0
mean,4.0,0.333333,0.666667
std,2.0,0.57735,0.57735
min,2.0,0.0,0.0
25%,3.0,0.0,0.5
50%,4.0,0.0,1.0
75%,5.0,0.5,1.0
max,6.0,1.0,1.0


In [64]:
# Walk the frame one row at a time; iterrows() yields (index label, row) pairs.
for index, row in df.iterrows():
    print(row)

In [72]:
# Frame with one missing entry (None) in each column, for the missing-data examples.
df = DataFrame([[None, 2],
                ['Tab', 4],
                ['Teh', None]], 
               columns=['city', 'some_value'])

In [73]:
# Number of missing values in each column.
print(df.isna().sum())

city          1
some_value    1
dtype: int64


In [74]:
# Drop every row that has at least one missing value, modifying df itself.
df.dropna(how='any', inplace=True)
df

Unnamed: 0,city,some_value
1,Tab,4.0


In [None]:
# Replace missing entries of 'some_value' with the constant 0.
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on `df['some_value']`: an in-place call on a selected column is a chained
# assignment, which pandas warns about and which is not guaranteed to update df
# (it does nothing under copy-on-write).
df['some_value'] = df['some_value'].fillna(0)

In [75]:
# Rebuild the city/value frame, then average some_value within each city.
df = DataFrame([['Teh', 2],
                ['Tab', 4],
                ['Teh', 6]], 
               columns=['city', 'some_value'])
df.groupby('city').mean()

Unnamed: 0_level_0,some_value
city,Unnamed: 1_level_1
Tab,4
Teh,4


In [86]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, scale, MinMaxScaler, KBinsDiscretizer, \
                                  PolynomialFeatures, minmax_scale
from math import nan
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [83]:
# Standardize each column: (x - mean) / std, so every column ends up with mean 0.
X_train = [[1, 2], [3, 4]]
scale(X_train)

array([[-1., -1.],
       [ 1.,  1.]])

In [87]:
# Rescale each column so its smallest value maps to 0 and its largest to 1.
minmax_scale(X_train)

array([[0., 0.],
       [1., 1.]])

In [88]:
# Fit the imputer on rows that themselves contain gaps (nan); in the complete
# training rows the second column is twice the first.
imp = IterativeImputer(max_iter=10, random_state=0, missing_values=nan)
imp.fit([[1, 2], [3, 6], [4, 8], [nan, 3], [7, nan]])
# Fill the nan entries of new rows; the values that are present come back unchanged.
X_test = [[nan, 2], [6, nan], [nan, 6]]
print(imp.transform(X_test).tolist())

[[1.000072974911173, 2.0], [6.0, 12.000027535212745], [2.9999614476811196, 6.0]]
