In [108]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

In [109]:
# 4x3 sample matrix used by the preprocessing demos that follow.
arr = np.array([
    [5.1, -2.9, 3.3],
    [-1.2, 7.8, -6.1],
    [3.9, 0.4, 2.1],
    [7.3, -9.9, -4.5],
])

In [110]:
# Mean over all entries of arr (no axis given, so the array is treated as flat).
np.mean(arr)

0.4416666666666666

In [111]:
# Standard deviation over all entries of arr (no axis given).
np.std(arr)

5.274065214698135

In [112]:
# Echo the raw sample array so the transformed versions below can be compared against it.
arr

array([[ 5.1, -2.9,  3.3],
       [-1.2,  7.8, -6.1],
       [ 3.9,  0.4,  2.1],
       [ 7.3, -9.9, -4.5]])

In [113]:
# Binarizer with threshold 2.0; judging by the output below, entries above 2.0
# become 1 and all others become 0.
binarized_scaler = sklearn.preprocessing.Binarizer(threshold=2.0)


In [114]:
# fit_transform keeps this call consistent with every other scaler in the notebook;
# Binarizer learns nothing from the data, so the resulting 0/1 matrix is unchanged.
binarized_arr = binarized_scaler.fit_transform(arr)

In [115]:
# Show the thresholded 0/1 matrix.
binarized_arr

array([[1., 0., 1.],
       [0., 1., 0.],
       [1., 0., 1.],
       [1., 0., 0.]])

In [116]:
# StandardScaler with both centering (with_mean) and scaling (with_std) switched on.
standard_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)

In [117]:
# Fit the scaler on arr and return the standardised copy in one step.
scaled_arr = standard_scaler.fit_transform(arr)

In [118]:
# Show the standardised matrix.
scaled_arr

array([[ 0.42462551, -0.2748757 ,  1.13244172],
       [-1.59434861,  1.40579288, -1.18167831],
       [ 0.04005901,  0.24346134,  0.83702214],
       [ 1.12966409, -1.37437851, -0.78778554]])

In [119]:
# MinMaxScaler configured to rescale values into the [0, 1] range.
minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))

In [120]:
# Fit the min-max scaler on arr and rescale it into the configured range.
minmaxscaled_arr = minmax_scaler.fit_transform(arr)

In [121]:
# One Normalizer per supported norm: L1, L2 and max.
normalizer_scaler_l1, normalizer_scaler_l2, normalizer_scaler_max = (
    sklearn.preprocessing.Normalizer(norm=norm_name)
    for norm_name in ('l1', 'l2', 'max')
)

In [122]:
# Apply each normalizer to arr, in the same L1 / L2 / max order.
normalized_arr_l1, normalized_arr_l2, normalized_arr_max = (
    scaler.fit_transform(arr)
    for scaler in (normalizer_scaler_l1, normalizer_scaler_l2, normalizer_scaler_max)
)

In [123]:
# Print every transformed array under its heading. The join separator and the
# leading space reproduce the original single f-string layout exactly.
sections = [
    ("Binarized array", binarized_arr),
    ("Scaled array", scaled_arr),
    ("MinmaxScaled array", minmaxscaled_arr),
    ("L1-normalized array", normalized_arr_l1),
    ("L2-normalized array", normalized_arr_l2),
    ("MAX-normalized array", normalized_arr_max),
]
print(" " + " \n ".join(f"{title} \n {values}" for title, values in sections))

 Binarized array 
 [[1. 0. 1.]
 [0. 1. 0.]
 [1. 0. 1.]
 [1. 0. 0.]] 
 Scaled array 
 [[ 0.42462551 -0.2748757   1.13244172]
 [-1.59434861  1.40579288 -1.18167831]
 [ 0.04005901  0.24346134  0.83702214]
 [ 1.12966409 -1.37437851 -0.78778554]] 
 MinmaxScaled array 
 [[0.74117647 0.39548023 1.        ]
 [0.         1.         0.        ]
 [0.6        0.5819209  0.87234043]
 [1.         0.         0.17021277]] 
 L1-normalized array 
 [[ 0.45132743 -0.25663717  0.2920354 ]
 [-0.0794702   0.51655629 -0.40397351]
 [ 0.609375    0.0625      0.328125  ]
 [ 0.33640553 -0.4562212  -0.20737327]] 
 L2-normalized array 
 [[ 0.75765788 -0.43082507  0.49024922]
 [-0.12030718  0.78199664 -0.61156148]
 [ 0.87690281  0.08993875  0.47217844]
 [ 0.55734935 -0.75585734 -0.34357152]] 
 MAX-normalized array 
 [[ 1.         -0.56862745  0.64705882]
 [-0.15384615  1.         -0.78205128]
 [ 1.          0.1025641   0.53846154]
 [ 0.73737374 -1.         -0.45454545]]


In [124]:
# Sample colour labels (with repeats) for the label-encoding demo.
labels_list = [
    'red',
    'black',
    'red',
    'green',
    'black',
    'yellow',
    'white',
]

In [125]:
# Encoder that maps string labels to integer codes and back.
label_encoder = sklearn.preprocessing.LabelEncoder()

In [126]:
# Learn the label vocabulary from labels_list and encode the list in one step.
encoded_labels = label_encoder.fit_transform(labels_list)

In [127]:
# Show the integer codes produced for labels_list.
encoded_labels

array([2, 0, 2, 1, 0, 4, 3], dtype=int64)

In [128]:
# Show each original label next to its encoded integer. Pairing the two
# sequences with zip avoids manual index bookkeeping.
for label, label_code in zip(labels_list, encoded_labels):
    print(f"{label}: {label_code}")

red: 2
black: 0
red: 2
green: 1
black: 0
yellow: 4
white: 3


In [129]:
# Map the integer codes back to their original string labels.
decoded_labels = label_encoder.inverse_transform(encoded_labels)

In [130]:
# Print the round-tripped labels; per the output they match labels_list.
print(f"Decoded labels: {decoded_labels}")

Decoded labels: ['red' 'black' 'red' 'green' 'black' 'yellow' 'white']


In [131]:
# Encode a few labels with the already-fitted encoder (transform only, no refit).
to_encode = ['green', 'red', 'black']
encoded_elements = label_encoder.transform(to_encode)
print(f"Encoded elements {to_encode}: {encoded_elements}")

Encoded elements ['green', 'red', 'black']: [1 2 0]


In [132]:
# Decode a few integer codes back to labels with the fitted encoder.
to_decode = [3, 0, 4, 1]
decoded_elements = label_encoder.inverse_transform(to_decode)
print(f"Decoded elements {to_decode}: {decoded_elements}")

Decoded elements [3, 0, 4, 1]: ['white' 'black' 'yellow' 'green']


In [133]:
# Directory holding the lab's data files. Set the LABMRZL_DATA_DIR environment
# variable to run the notebook on another machine; the original absolute path
# remains the default so existing behaviour is unchanged.
data_dir = os.environ.get(
    'LABMRZL_DATA_DIR',
    'C:/Users/Arseny/Documents/GitHub/python_solution/labmrzl',
)
# Comma-separated numeric file; the shape check below shows 400 rows x 3 columns.
data = np.loadtxt(f'{data_dir}/data_multivar_nb.txt', delimiter=',')

In [134]:
# Dimensions of the loaded array as (rows, columns).
np.shape(data)

(400, 3)

In [135]:
# Every column except the last is a feature; the last column is the target.
X = data[:, :-1]
y = data[:, -1]

In [136]:
# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# split, and every result derived from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [137]:
# Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()

In [138]:
# Fit the classifier on the training split.
gnb.fit(X_train,y_train)

In [139]:
# Predict classes for the held-out rows.
y_pred = gnb.predict(X_test)

In [140]:
# Show the predicted classes for the held-out rows.
y_pred

array([2., 1., 0., 2., 0., 3., 3., 0., 3., 3., 2., 3., 3., 1., 3., 1., 2.,
       0., 1., 3., 0., 2., 0., 3., 2., 0., 2., 3., 2., 1., 3., 0., 1., 1.,
       3., 1., 0., 1., 2., 0., 3., 2., 1., 0., 3., 2., 0., 3., 2., 1., 3.,
       0., 2., 0., 2., 1., 3., 2., 1., 0., 1., 3., 3., 1., 2., 2., 2., 1.,
       3., 3., 2., 0., 2., 0., 1., 2., 2., 0., 3., 0.])

In [141]:
# Show the true classes for visual comparison with y_pred.
y_test

array([2., 1., 0., 2., 0., 3., 3., 0., 3., 3., 2., 3., 3., 1., 2., 1., 2.,
       0., 1., 3., 0., 2., 0., 3., 2., 0., 2., 3., 2., 1., 3., 0., 1., 1.,
       3., 1., 0., 1., 2., 0., 3., 2., 1., 0., 3., 2., 0., 3., 2., 1., 3.,
       0., 2., 0., 2., 1., 3., 2., 1., 0., 1., 3., 3., 1., 2., 2., 2., 1.,
       3., 3., 2., 0., 2., 0., 1., 2., 2., 0., 3., 0.])

In [142]:
# Score all four metrics in a single 3-fold cross-validation pass. The original
# ran cross_val_score once per metric, refitting the model on every fold each time.
# The '*_weighted' scorer names are the built-in equivalents of
# make_scorer(<metric>, average='weighted').
cv_scores = cross_validate(
    gnb,
    X,
    y,
    cv=3,
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
    },
)
# cross_validate returns per-fold scores under 'test_<name>'; average across folds.
accuracy = cv_scores['test_accuracy'].mean()
precision = cv_scores['test_precision'].mean()
recall = cv_scores['test_recall'].mean()
f1 = cv_scores['test_f1'].mean()

In [143]:
# Report the cross-validated metrics, two decimals each.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_label}: {metric_value:.2f}")

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [144]:
# Path to the income dataset. The directory can be overridden through the
# LABMRZL_DATA_DIR environment variable; the original absolute location stays
# the default so existing behaviour is unchanged.
input_file2 = os.environ.get(
    'LABMRZL_DATA_DIR',
    'C:/Users/Arseny/Documents/GitHub/python_solution/labmrzl',
) + '/income_data.txt'

In [145]:
# NOTE(review): this loop does not collect anything -- lines containing '?' hit
# `continue`, and every other line falls through with no work done. Its only
# lasting effect is leaving `line` bound to the last line of the file, which the
# following cell reads.
with open(input_file2, 'r') as f:
    for line in f.readlines():
        if '?' in line:
            continue

In [146]:
# Split the last line read by the previous cell into its ', '-separated fields.
# strip() (as used by the main parsing loop) removes the line ending without
# cutting off a real character when the file has no trailing newline, which
# line[:-1] would do.
# NOTE: this rebinds `data`, which earlier held the numeric array from np.loadtxt.
data = line.strip().split(', ')

In [147]:
# Empty accumulators for the income dataset: feature rows and 0/1 targets.
X_1, Y_1 = [], []


In [148]:
# Parse the income file into feature rows (X_1) and binary targets (Y_1).
# The accumulators are (re)created here so that re-running this cell neither
# appends the whole file a second time nor fails once X_1/Y_1 have been
# converted to NumPy arrays.
X_1 = []
Y_1 = []
income_classes = {'<=50K': 0, '>50K': 1}  # raw label text -> target code

with open(input_file2, 'r') as f:
    for line in f:  # stream the file instead of loading every line up front
        # Rows containing '?' are dropped (presumably the dataset's
        # missing-value marker -- confirm against the data description).
        if '?' in line:
            continue
        data = line.strip().split(', ')

        target = income_classes.get(data[-1])
        if target is None:
            # Blank line or unrecognised label: skipped, exactly as before.
            continue
        X_1.append(data[:-1])
        Y_1.append(target)

In [149]:
# Convert the accumulated Python lists into NumPy arrays for column-wise processing.
X_1, Y_1 = np.array(X_1), np.array(Y_1)

In [150]:
# Inspect the parsed feature rows; per the output dtype, every value is still a string here.
X_1

array([['39', 'State-gov', '77516', ..., '0', '40', 'United-States'],
       ['50', 'Self-emp-not-inc', '83311', ..., '0', '13',
        'United-States'],
       ['38', 'Private', '215646', ..., '0', '40', 'United-States'],
       ...,
       ['58', 'Private', '151910', ..., '0', '40', 'United-States'],
       ['22', 'Private', '201490', ..., '0', '20', 'United-States'],
       ['52', 'Self-emp-inc', '287927', ..., '0', '40', 'United-States']],
      dtype='<U26')

In [151]:
# Inspect the 0/1 targets.
Y_1

array([0, 0, 0, ..., 0, 0, 1])

In [152]:
# Convert the string feature matrix X_1 into a float matrix, column by column.
#   - Columns whose values all parse as numbers are cast to float directly.
#   - All other columns are label-encoded to integer codes.
# Numeric detection uses the float conversion itself rather than str.isdigit(),
# so negative or decimal values are not mistaken for categories.
# NOTE(review): label encoding gives categorical features an arbitrary numeric
# order; consider one-hot encoding if the downstream model is sensitive to that.
label_encoders = []        # fitted encoders, in column order (categorical columns only)
encoders_by_column = {}    # column index -> fitted encoder, for inverse transforms
X_encoded = np.empty(X_1.shape)
for col_idx in range(X_1.shape[1]):
    column = X_1[:, col_idx]
    try:
        X_encoded[:, col_idx] = column.astype(float)
    except ValueError:
        # At least one value is not numeric: treat the column as categorical.
        encoder = sklearn.preprocessing.LabelEncoder()
        X_encoded[:, col_idx] = encoder.fit_transform(column)
        label_encoders.append(encoder)
        encoders_by_column[col_idx] = encoder

In [153]:
# Names used for the income experiment: encoded features and 0/1 targets.
X_2, y_2 = X_encoded, Y_1

In [154]:
# First encoded row: per the output, all 14 columns are now numeric.
X_2[0]

array([3.9000e+01, 5.0000e+00, 7.7516e+04, 9.0000e+00, 1.3000e+01,
       4.0000e+00, 0.0000e+00, 1.0000e+00, 4.0000e+00, 1.0000e+00,
       2.1740e+03, 0.0000e+00, 4.0000e+01, 3.8000e+01])

In [155]:
# Hold out 20% of the income rows for testing. random_state fixes the shuffle so
# the split is reproducible across runs.
# NOTE: this rebinds X_train/X_test/y_train/y_test from the Naive Bayes experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y_2, test_size=0.2, shuffle=True, random_state=42
)

In [156]:
# Inspect the training features of the income split.
X_train

array([[3.60000e+01, 2.00000e+00, 3.08945e+05, ..., 0.00000e+00,
        4.00000e+01, 3.80000e+01],
       [3.00000e+01, 1.00000e+00, 1.72822e+05, ..., 0.00000e+00,
        4.00000e+01, 3.80000e+01],
       [7.20000e+01, 1.00000e+00, 2.59762e+05, ..., 0.00000e+00,
        1.00000e+01, 3.80000e+01],
       ...,
       [2.20000e+01, 2.00000e+00, 2.34641e+05, ..., 0.00000e+00,
        4.00000e+01, 3.80000e+01],
       [2.60000e+01, 2.00000e+00, 1.64737e+05, ..., 0.00000e+00,
        4.00000e+01, 3.80000e+01],
       [4.60000e+01, 2.00000e+00, 1.69042e+05, ..., 0.00000e+00,
        2.50000e+01, 6.00000e+00]])

In [157]:
# One-vs-one wrapper around a linear SVM. random_state pins LinearSVC's internal
# randomness so repeated runs produce the same fitted model and scores.
# NOTE(review): the features are fed in unscaled; LinearSVC generally converges
# better on standardised inputs -- consider adding a StandardScaler step.
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

In [158]:
# Fit the SVM classifier on the income training split.
classifier.fit(X_train,y_train)

In [159]:
# Predict income classes for the held-out rows (rebinds y_pred from the earlier experiment).
y_pred = classifier.predict(X_test)

In [160]:
# Score all four metrics in a single 3-fold cross-validation pass. The original
# ran cross_val_score once per metric, refitting the SVM on every fold each time,
# so the four numbers could come from different fitted models.
# The '*_weighted' scorer names are the built-in equivalents of
# make_scorer(<metric>, average='weighted').
cv_scores = cross_validate(
    classifier,
    X_2,
    y_2,
    cv=3,
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
    },
)
# cross_validate returns per-fold scores under 'test_<name>'; average across folds.
accuracy = cv_scores['test_accuracy'].mean()
precision = cv_scores['test_precision'].mean()
recall = cv_scores['test_recall'].mean()
f1 = cv_scores['test_f1'].mean()

In [161]:
# Report the cross-validated metrics for the income classifier, two decimals each.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_label}: {metric_value:.2f}")

Accuracy: 0.80
Precision: 0.79
Recall: 0.80
F1-score: 0.76
