In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
sns.set_theme(style="whitegrid")

In [3]:
data = [
    {'age':4, 'height':96.0},
    {'age':1, 'height':73.9},
    {'age':3, 'height':88.9},
    {'age':2, 'height':81.6}
]

In [4]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
data_transformed = dv.fit_transform(data)
print(data_transformed)

[[ 4.  96. ]
 [ 1.  73.9]
 [ 3.  88.9]
 [ 2.  81.6]]


In [5]:
data_transformed.shape

(4, 2)

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
heart_data = pd.read_csv("heart_disease_uci.csv")
heart_data.head(2)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2


In [8]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [9]:
heart_data.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [10]:
heart_data['thalch'].isnull().sum()

np.int64(55)

In [11]:
print(f"Unique values in ca : {heart_data.ca.unique()}")
print(f"Unique values in thal : {heart_data.thal.unique()}")

Unique values in ca : [ 0.  3.  2.  1. nan]
Unique values in thal : ['fixed defect' 'normal' 'reversable defect' nan]


In [12]:
heart_data['ca'].value_counts()

ca
0.0    181
1.0     67
2.0     41
3.0     20
Name: count, dtype: int64

In [13]:
heart_data.replace('?',np.nan,inplace=True)

In [14]:
heart_data.loc[heart_data.ca == "?", ["ca","thal"]].count()

ca      0
thal    0
dtype: int64

In [15]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
heart_data_imputed = imputer.fit_transform(heart_data)
heart_data_imputed.info()

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Male'

In [22]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean',add_indicator=True)
heart_data_imputed_with_indicator = imputer.fit_transform(heart_data)
heart_data_imputed_with_indicator.shape


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Male'

In [None]:
abalone_data = pd.read_csv("abalone.csv")
abalone_data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
abalone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [22]:
abalone_data['Sex'].unique()

array(['M', 'F', 'I'], dtype=object)

In [None]:
abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})
abalone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB


  abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})


In [None]:
y = abalone_data.pop('Rings')
print("The dataframe object after deleting the column")
abalone_data.info()

The dataframe object after deleting the column
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 261.2 KB


In [None]:
abalone_data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [26]:
y.head()

0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64

In [None]:
abalone_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sex,4177.0,1.95547,0.827815,1.0,1.0,2.0,3.0,3.0
Length,4177.0,0.523992,0.120093,0.075,0.45,0.545,0.615,0.815
Diameter,4177.0,0.407881,0.09924,0.055,0.35,0.425,0.48,0.65
Height,4177.0,0.139516,0.041827,0.0,0.115,0.14,0.165,1.13
Whole weight,4177.0,0.828742,0.490389,0.002,0.4415,0.7995,1.153,2.8255
Shucked weight,4177.0,0.359367,0.221963,0.001,0.186,0.336,0.502,1.488
Viscera weight,4177.0,0.180594,0.109614,0.0005,0.0935,0.171,0.253,0.76
Shell weight,4177.0,0.238831,0.139203,0.0015,0.13,0.234,0.329,1.005


In [28]:
x = np.array([4, 2, 5, -2, -100]).reshape(-1,1)
x
#.reshape(-1, 1):
# Reshapes the array into a column vector with one value per row.
# -1 tells NumPy to infer the number of rows automatically based on the size of the array.

array([[   4],
       [   2],
       [   5],
       [  -2],
       [-100]])

In [29]:
from sklearn.preprocessing import MaxAbsScaler
mas = MaxAbsScaler()
x_new = mas.fit_transform(x)
x_new
#scales each value in 'x' by dividing it by the maxm abs value in the dataset. 
# the formula used by 'MaxAbsScaler' is : xnew = x/max(|x|), where max(|x|) is the largest abs value in the dataset.

array([[ 0.04],
       [ 0.02],
       [ 0.05],
       [-0.02],
       [-1.  ]])

In [30]:
# MinmaxScaler = Xnew = (Xold - Xmin)/(Xmax - Xmin)
# MinMaxScaler ek scaler hai jo data ko normalize karta hai, yani values ko ek fixed range (default: [1]) mein scale karta hai.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X = abalone_data
X_normalized = mms.fit_transform(X)
X_normalized[:5]

array([[0.        , 0.51351351, 0.5210084 , 0.0840708 , 0.18133522,
        0.15030262, 0.1323239 , 0.14798206],
       [0.        , 0.37162162, 0.35294118, 0.07964602, 0.07915707,
        0.06624075, 0.06319947, 0.06826109],
       [0.5       , 0.61486486, 0.61344538, 0.11946903, 0.23906499,
        0.17182246, 0.18564845, 0.2077728 ],
       [0.        , 0.49324324, 0.5210084 , 0.11061947, 0.18204356,
        0.14425017, 0.14944042, 0.15296462],
       [1.        , 0.34459459, 0.33613445, 0.07079646, 0.07189658,
        0.0595158 , 0.05134957, 0.0533134 ]])

In [32]:
# Standardscaler() = Xnew = (Xold - mean of X (feature) values)/std. deviation of X (feature) values
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_standardized = ss.fit_transform(X)
x_standardized

array([[-1.15434629, -0.57455813, -0.43214879, ..., -0.60768536,
        -0.72621157, -0.63821689],
       [-1.15434629, -1.44898585, -1.439929  , ..., -1.17090984,
        -1.20522124, -1.21298732],
       [ 0.05379815,  0.05003309,  0.12213032, ..., -0.4634999 ,
        -0.35668983, -0.20713907],
       ...,
       [-1.15434629,  0.6329849 ,  0.67640943, ...,  0.74855917,
         0.97541324,  0.49695471],
       [ 0.05379815,  0.84118198,  0.77718745, ...,  0.77334105,
         0.73362741,  0.41073914],
       [-1.15434629,  1.54905203,  1.48263359, ...,  2.64099341,
         1.78744868,  1.84048058]], shape=(4177, 8))

In [33]:
wine_data = pd.read_csv("winequality-red.csv")

In [None]:
wine_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [36]:
from sklearn.preprocessing import FunctionTransformer  
transformer = FunctionTransformer(np.log1p, validate=True)
# FunctionTransformer ek utility hai jo ek custom transformation function apply karta hai. Yahan, hum np.log1p ka use kar rahe hain.
# np.log1p ka matlab hai log transformation with a shift of 1: log1p(x) = log(x+1)
# Ye transformation useful hota hai jab data mein zero ya negative values ho, kyunki log(0) undefined hota hai.
# validate=True ensures ki input ek valid array ya DataFrame ho.
wine_data_transformed = transformer.transform(np.array(wine_data))
# np.array(wine_data) ensures ki data ek NumPy array ke form mein ho.
# Har value ko log transformation ke formula se scale kiya jata hai: 
# transformed value = log(original value + 1)
pd.DataFrame(wine_data_transformed, columns=wine_data.columns).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,2.215842,0.1781,1.722767,2.091864,2.186051,2.322388,2.827314
volatile acidity,1599.0,0.417173,0.114926,0.113329,0.329304,0.41871,0.494696,0.947789
citric acid,1599.0,0.228147,0.152423,0.0,0.086178,0.231112,0.350657,0.693147
residual sugar,1599.0,1.218131,0.269969,0.641854,1.064711,1.163151,1.280934,2.80336
chlorides,1599.0,0.083038,0.038991,0.011929,0.067659,0.076035,0.086178,0.476855
free sulfur dioxide,1599.0,2.639013,0.62379,0.693147,2.079442,2.70805,3.091042,4.290459
total sulfur dioxide,1599.0,3.63475,0.682575,1.94591,3.135494,3.663562,4.143135,5.669881
density,1599.0,0.691519,0.000945,0.68817,0.690945,0.691521,0.692064,0.69499
pH,1599.0,1.460557,0.03576,1.319086,1.437463,1.460938,1.481605,1.611436
sulphates,1599.0,0.501073,0.093731,0.285179,0.438255,0.482426,0.548121,1.098612


In [37]:
from sklearn.preprocessing import PolynomialFeatures
wine_data_copy = wine_data.copy()
wine_data = wine_data.drop(['quality'], axis=1)
print("Number of features before transformations =",wine_data.shape)

# Let me fit a polynomial of degree 2 to wine_data
poly = PolynomialFeatures(degree=2)
poly_wine_data = poly.fit_transform(wine_data)
print("Number of features after transformations =",poly_wine_data.shape)

Number of features before transformations = (1599, 11)
Number of features after transformations = (1599, 78)


In [None]:
from sklearn.preprocessing import KBinsDiscretizer
#KBinsDiscretizer ko import kiya gaya hai, jo ek preprocessing tool hai. Iska kaam continuous numerical data ko discrete bins (categories) mein convert karna hota hai.
wine_data = wine_data_copy.copy()

#transform the data with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
# n_bins=10: Data ko 10 bins (categories) mein divide karega.
# encode='onehot': Binning ke baad result ko one-hot encoding format mein represent karega. One-hot encoding ka matlab hai ki har bin ko binary vector ke form mein represent kiya jata hai.
X = np.array(wine_data['chlorides']).reshape(-1,1)
X_binned = enc.fit_transform(X) 
X_binned.toarray()[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [None]:
from sklearn.datasets import load_iris
iris_data = load_iris()
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

#Add the target column 
iris_df['species'] = iris_data.target 

#Map target number to the species name
iris_df['species'] = iris_df['species'].map({0:'setosa', 1:'versicolor', 2:'virginica'})

iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [57]:
iris_df['species'].nunique()

3

In [58]:
iris_df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [68]:
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder(categories='auto')
y = iris_df['species']
y = ohc.fit_transform(y.values.reshape(-1,1))
print(y)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 150 stored elements and shape (150, 3)>
  Coords	Values
  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (125, 2)	1.0
  (126, 2)	1.0
  (127, 2)	1.0
  (128, 2)	1.0
  (129, 2)	1.0
  (130, 2)	1.0
  (131, 2)	1.0
  (132, 2)	1.0
  (133, 2)	1.0
  (134, 2)	1.0
  (135, 2)	1.0
  (136, 2)	1.0
  (137, 2)	1.0
  (138, 2)	1.0
  (139, 2)	1.0
  (140, 2)	1.0
  (141, 2)	1.0
  (142, 2)	1.0
  (143, 2)	1.0
  (144, 2)	1.0
  (145, 2)	1.0
  (146, 2)	1.0
  (147, 2)	1.0
  (148, 2)	1.0
  (149, 2)	1.0


In [69]:
from sklearn.preprocessing import LabelEncoder
y = np.array(iris_df['species'])
lenc = LabelEncoder()
y_integer = lenc.fit_transform(y)
y_integer

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [70]:
# MultipleLabelBinarizer 
movie_genres = [
    {'action', 'comedy'},
    {'comedy'},
    {'action','thriller'},
    {'science-fiction','action','thriller'}
]

In [71]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit_transform(movie_genres)

array([[1, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 1],
       [1, 0, 1, 1]])

In [72]:
x = [
    [20.0, 'Male',],
    [11.2, 'Female',],
    [15.6, 'Female',],
    [13.0, 'Male',],
    [18.6, 'Male',],
    [16.4,'Female',]
]
x = np.array(x)

In [73]:
from sklearn.compose import ColumnTransformer
#ColumnTransformer: Allows you to apply different preprocessing techniques to different columns of your dataset.
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder
#OneHotEncoder: Converts categorical data into one-hot encoded vectors (binary representation for each category).
ct = ColumnTransformer(
    [
        ('scaler', MaxAbsScaler(),[0]),
        ('pass', 'passthrough',[0]),
        ('encoder', OneHotEncoder(),[1])
    ]
)
# Apply MaxAbsScaler to column 0
# Pass column 0 without modification
# Apply OneHotEncoder to column 1
ct.fit_transform(x)

array([['1.0', '20.0', '0.0', '1.0'],
       ['0.5599999999999999', '11.2', '1.0', '0.0'],
       ['0.78', '15.6', '1.0', '0.0'],
       ['0.65', '13.0', '0.0', '1.0'],
       ['0.93', '18.6', '0.0', '1.0'],
       ['0.82', '16.4', '1.0', '0.0']], dtype='<U32')

In [16]:
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 

num_pipeline = Pipeline([
    ('selector', ColumnTransformer([('select_first_4','passthrough',slice(0,4))])),
    ('imputer', SimpleImputer(strategy='median')), 
    ('std_scaler', StandardScaler()),
     
])
cat_pipeline = ColumnTransformer([('label_binarizer', LabelBinarizer(),[4]),
                                  ])
full_pipeline = FeatureUnion(transformer_list=
                             [
                                 ("num_pipeline", num_pipeline),
                                 ("cat_pipeline", cat_pipeline)
                             ])

In [17]:
from sklearn import set_config
set_config(display='diagram')
full_pipeline

In [None]:
``