In [1]:
import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.

    A column with no non-missing values offers nothing to learn a fill
    value from: its fill value is NaN, so ``transform`` leaves it unchanged.
    """

    def __init__(self):
        """No parameters; the fill values are learned in ``fit``."""

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame the fill values are computed from.
        y : ignored
            Present only so the method can be called as ``fit(X, y)``.

        Returns
        -------
        self
            With ``self.fill`` set to a Series of fill values indexed by
            ``X.columns``.
        """
        fill_values = []
        for name in X:
            column = X[name]
            if column.dtype == np.dtype('O'):
                # value_counts() drops NaN by default and sorts by frequency,
                # so its first index entry is the most frequent value. It is
                # empty when the whole column is missing; fall back to NaN
                # instead of raising IndexError on index[0].
                counts = column.value_counts()
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(column.mean())

        self.fill = pd.Series(fill_values, index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced.

        Each column is filled with the value learned for the column of the
        same label in ``fit``. ``y`` is ignored.
        """
        return X.fillna(self.fill)

# Demo input: one string column followed by two number columns; the last
# row is missing in every column.
data = [
    ['a', 1, 2],
    ['b', 1, 1],
    ['b', 2, 2],
    [np.nan] * 3,
]

X = pd.DataFrame(data)
xt = DataFrameImputer().fit_transform(X)

# Print the frame before and after imputation, each under its own label.
print('before...', X, 'after...', xt, sep='\n')


before...
     0    1    2
0    a  1.0  2.0
1    b  1.0  1.0
2    b  2.0  2.0
3  NaN  NaN  NaN
after...
   0         1         2
0  a  1.000000  2.000000
1  b  1.000000  1.000000
2  b  2.000000  2.000000
3  b  1.333333  1.666667


In [23]:
# Toy two-column numeric frame: the 20 in the first column sits far from the
# other values, and the last row is missing in both columns.
data = [[1, 2], [1, 1], [20, 2], [2, 3], [4, 4], [np.nan] * 2]

df = pd.DataFrame(data)

In [26]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Folder that holds abalone_data.csv. The default is the folder this notebook
# was originally run from; set the ABALONE_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
ABALONE_DATA_DIR = Path(
    os.environ.get('ABALONE_DATA_DIR', 'C:/Users/nnair101/Documents/Python')
)

df = pd.read_csv(ABALONE_DATA_DIR / 'abalone_data.csv')

In [28]:
# Numeric features: the numeric-only sub-frame, plus the list of its column names
df_num = df.select_dtypes(include='number')
col_num = df_num.columns.tolist()
col_num

['Length',
 'Diameter',
 'Height',
 'Whole weight',
 'Shucked weight',
 'Viscera weight',
 'Shell weight',
 'Rings']

In [29]:
# Number of distinct non-missing values in each numeric column.
# DataFrame.nunique() counts per column directly, so there is no need to
# transpose the frame (which forces every column to one common dtype) and
# run a Python lambda on each row.
df_num.nunique()

Length             134
Diameter           111
Height              51
Whole weight      2429
Shucked weight    1515
Viscera weight     880
Shell weight       925
Rings               28
dtype: int64

In [None]:
# Removing outliers for continuous variables

In [20]:
# Lower quartile (25th percentile) of the column labelled 1.
# NOTE(review): this needs df to have a column labelled 1 - confirm which
# frame df refers to when this cell is run.
Q1 = df.loc[:, 1].quantile(0.25)
Q1

2.0

In [17]:
# Per-column lower and upper quartiles of df, and the spread between them
# (the interquartile range).
Q1, Q3 = (df.quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1)

0    1.0
1    2.0
Name: 0.25, dtype: float64


In [21]:
# For continuous variables: cap outliers column by column to the range
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# Only numeric columns are processed; a quantile cannot be computed for text
# columns, so those are left untouched instead of raising.
for col in df.select_dtypes(include=[np.number]).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # clip() builds the capped column in one step and it is assigned back to
    # df directly. The chained form df[col][mask] = value writes through an
    # intermediate object: it triggers SettingWithCopyWarning and leaves df
    # unchanged when pandas Copy-on-Write is enabled.
    # Missing values stay missing, and NaN bounds (all-missing column) clip
    # nothing.
    df[col] = df[col].clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR)

In [24]:
# for categorical variables


[0, 1]

In [22]:
# Display the current contents of df
df

Unnamed: 0,0,1
0,1.0,2.0
1,1.0,1.0
2,8.5,2.0
3,2.0,3.0
4,4.0,4.0
5,,


In [3]:
# Show the dtype of each column of X
X.dtypes

0     object
1    float64
2    float64
dtype: object