In [None]:
import pandas as pd
from io import StringIO as SI
import numpy as np


# Small in-memory CSV used to demonstrate missing-value handling: the second
# data record has an empty field for column C and the third one for column D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0, 2.0, 3.0, 4.0',
    '5.0, 6.0,, 8.0',
    '10.0, 11.0, 12.0,',
])

csv_data

'A,B,C,D\n1.0, 2.0, 3.0, 4.0\n5.0, 6.0,, 8.0\n10.0, 11.0, 12.0,'

In [None]:
# Wrap the CSV text in an in-memory text buffer so read_csv can consume it like
# a file; the empty fields end up as NaN in the resulting frame.
df = pd.read_csv(filepath_or_buffer=SI(csv_data))

df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [None]:
# Count the missing entries in each column (isna is an alias of isnull).
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [None]:
# Expose the frame's data as a plain NumPy array.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [None]:
# Drop every row that holds at least one NaN (axis=0 targets rows, axis=1 would
# target columns). A new frame is returned; df itself is left unchanged.
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [None]:
# Display df again: the rows containing NaN are still present, confirming that
# the preceding dropna() call did not modify the frame in place.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [None]:
# Toy data set with one row per item: a color, a size label, a price and a
# class label. Columns keep their default positional labels at this point.
dfCateg = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
)

dfCateg

Unnamed: 0,0,1,2,3
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [None]:
# Replace the default positional column labels with descriptive names.
dfCateg.columns = [
    'color',
    'size',
    'price',
    'classlabel',
]

dfCateg

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [None]:
# Integer codes for the size labels, ordered so that M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)
size_mapping

{'L': 2, 'M': 1, 'XL': 3}

In [None]:
# Encode the size labels as integers using size_mapping.
#
# Series.map() turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would replace all sizes
# with NaN. Only apply the mapping while the column still contains at least one
# raw label; once encoded, re-running the cell is a no-op.
if dfCateg['size'].isin(list(size_mapping)).any():
    dfCateg['size'] = dfCateg['size'].map(size_mapping)

dfCateg

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [None]:
# Give each distinct class label an integer code. np.unique returns the labels
# in sorted order, so the codes follow that order.
class_mapping = {
    class_name: code
    for code, class_name in enumerate(np.unique(dfCateg['classlabel']))
}

# Swap the string labels in the frame for their integer codes.
dfCateg['classlabel'] = dfCateg['classlabel'].map(class_mapping)

dfCateg

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [None]:
# Load the UCI Wine data set (it is partitioned into training and test sets
# further below).

import pandas as pd

# header=None: the first record of the file is data, not column names, so the
# columns are labelled by position for now.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)
df_wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [None]:
# Attach descriptive names to the 14 positional columns: the class label first,
# followed by the 13 measured features.

df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

df_wine

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [None]:
# Feature matrix: every column except the first one.
x = df_wine.iloc[:, 1:].values
# Target vector: the first column, which holds the class label.
y = df_wine.iloc[:, 0].values
x, y

(array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set. random_state pins the shuffle so
# the split is reproducible, and stratify=y asks for the class proportions of y
# to be preserved in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

x_train, x_test, y_train, y_test


(array([[1.362e+01, 4.950e+00, 2.350e+00, ..., 9.100e-01, 2.050e+00,
         5.500e+02],
        [1.376e+01, 1.530e+00, 2.700e+00, ..., 1.250e+00, 3.000e+00,
         1.235e+03],
        [1.373e+01, 1.500e+00, 2.700e+00, ..., 1.190e+00, 2.710e+00,
         1.285e+03],
        ...,
        [1.388e+01, 5.040e+00, 2.230e+00, ..., 5.800e-01, 1.330e+00,
         4.150e+02],
        [1.196e+01, 1.090e+00, 2.300e+00, ..., 9.900e-01, 3.130e+00,
         8.860e+02],
        [1.270e+01, 3.870e+00, 2.400e+00, ..., 1.190e+00, 3.130e+00,
         4.630e+02]]),
 array([[1.377e+01, 1.900e+00, 2.680e+00, 1.710e+01, 1.150e+02, 3.000e+00,
         2.790e+00, 3.900e-01, 1.680e+00, 6.300e+00, 1.130e+00, 2.930e+00,
         1.375e+03],
        [1.217e+01, 1.450e+00, 2.530e+00, 1.900e+01, 1.040e+02, 1.890e+00,
         1.750e+00, 4.500e-01, 1.030e+00, 2.950e+00, 1.450e+00, 2.230e+00,
         3.550e+02],
        [1.439e+01, 1.870e+00, 2.450e+00, 1.460e+01, 9.600e+01, 2.500e+00,
         2.520e+00, 3.000e-0

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling. The scaler is fitted on the training data only, and the test
# data is transformed with the parameters learned from the training data.
mms = MinMaxScaler().fit(x_train)

x_train_norm = mms.transform(x_train)
x_test_norm = mms.transform(x_test)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardization. As with the min-max scaler, the statistics are learned from
# the training data only and reused unchanged for the test data.
stdsc = StandardScaler().fit(x_train)
x_train_std = stdsc.transform(x_train)
x_test_std = stdsc.transform(x_test)

# Summary statistics of the standardized training features; each column should
# show a mean of approximately 0 and a standard deviation of approximately 1.
pd.DataFrame(data=x_train_std).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,6.890098e-15,1.683241e-16,3.79966e-15,2.856138e-16,-3.178461e-16,1.662649e-15,1.598184e-16,-1.198862e-15,1.888274e-15,-1.615195e-15,-5.783904e-16,8.631089e-16,-1.334058e-16
std,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057
min,-1.971837,-1.380243,-2.554934,-2.525252,-2.069521,-2.187238,-1.691226,-1.964528,-2.068907,-1.419256,-2.147813,-1.947267,-1.465378
25%,-0.8150037,-0.6275963,-0.5499141,-0.6917901,-0.7889079,-0.8535947,-0.7985839,-0.8092365,-0.6496852,-0.8006399,-0.7436984,-0.8974047,-0.782356
50%,0.0381987,-0.4351583,-0.05565323,-0.09065504,-0.113029,0.1385058,0.09896271,-0.2315907,-0.07775997,-0.1438903,0.04894675,0.2823374,-0.2499217
75%,0.8033481,0.698088,0.7370293,0.5705935,0.5094911,0.8297234,0.8322044,0.6142478,0.561949,0.4704883,0.7510038,0.7946413,0.8610452
max,2.181832,2.947475,3.152379,3.065304,3.693237,2.529306,2.95836,2.161513,3.286008,3.351712,2.109824,1.992422,2.843268
