In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchmetrics import F1Score, Accuracy
import time
from torch.utils.tensorboard import SummaryWriter

## Assignment 2 in ML
### Ostapovich Oleg

#### Section 1: Data Reading

In [2]:
data_train = pd.read_csv('data/stream_quality_train.csv')
data_test = pd.read_csv('data/stream_quality_test.csv')

In [3]:
print(data_train.shape, data_test.shape)

(760552, 12) (129978, 12)


#### Section 2: Exploration and preprocessing

In [4]:
data_train.head()

Unnamed: 0,fps_mean,fps_std,rtt_mean,rtt_std,dropped_frames_mean,dropped_frames_std,dropped_frames_max,bitrate_mean,bitrate_std,packet_loss_rate,packet_loss_std,y
0,0.744824,0.025512,0.786908,0.013918,0.00323,0.0,0.00323,0.066147,0.01039,0.00025,0.0,1.0
1,0.744824,0.025512,0.810122,0.055803,0.00323,0.0,0.00323,0.077022,0.041797,0.031492,0.153055,1.0
2,0.734408,0.076537,0.79167,0.031781,0.00323,0.0,0.00323,0.069172,0.01707,0.00025,0.0,1.0
3,0.750031,0.0,0.826193,0.015573,0.00323,0.0,0.00323,0.061703,0.021221,0.003121,0.014067,1.0
4,0.703162,0.159856,0.816669,0.005438,0.00323,0.0,0.00323,0.040957,0.062898,0.003121,0.014067,1.0


According Pandas Profiling, datasets are free from outliers, data is normalized, all categorical features are encoded. That's why the only thing left is separate Y column from other data.

In [5]:
y_train_df = data_train['y']
x_train_df = data_train.drop(['y'], axis=1)

y_test = data_test['y']
x_test = data_test.drop(['y'], axis=1)

Also for future tasks we need to find correlation

In [6]:
abs(data_train.corr().y)

fps_mean               0.182148
fps_std                0.051231
rtt_mean               0.237470
rtt_std                0.022025
dropped_frames_mean    0.011221
dropped_frames_std     0.009314
dropped_frames_max     0.005493
bitrate_mean           0.630298
bitrate_std            0.196570
packet_loss_rate       0.037425
packet_loss_std        0.052503
y                      1.000000
Name: y, dtype: float64

Here we can see that 'bitrate_mean', 'bitrate_std', 'rtt_mean', 'fps_mean' columns have the most correlation coefficient to Y

In [8]:
# removing all unnecessary features
selected_features = ['bitrate_mean', 'bitrate_std', 'rtt_mean', 'fps_mean']
x_train_df, y_train_df, x_test, y_test = x_train_df[selected_features], y_train_df, x_test[selected_features], y_test

In [9]:
# data splitting
print("before split", x_train_df.shape, y_train_df.shape, x_test.shape, y_test.shape)
x_train, x_val, y_train, y_val = train_test_split(x_train_df, y_train_df, train_size=0.8)
print("after split", x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape)

before split (760552, 4) (760552,) (129978, 4) (129978,)
after split (608441, 4) (608441,) (152111, 4) (152111,) (129978, 4) (129978,)


In [10]:
class Dset(Dataset):
    def __init__(self, values, labels):
        self.x = torch.tensor(values.values, dtype= torch.float32)
        # self.y = torch.from_numpy(labels.values).type(torch.LongTensor)
        self.y = list(map(int, labels.values))
    def __len__(self):
        return (len(self.y))

    def __getitem__(self, i):
        return self.x[i], self.y[i]

In [11]:
batch_size = 100
# Adding data to loader
trainloader = torch.utils.data.DataLoader(Dset(x_train,y_train), batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=0
                                          )
valloader = torch.utils.data.DataLoader(Dset(x_val,y_val), batch_size=batch_size,
                                          shuffle=True,
                                        num_workers=0
                                        )
testloader = torch.utils.data.DataLoader(Dset(x_test,y_test), batch_size=batch_size,
                                          shuffle=True,
                                         num_workers=0
                                         )

In [29]:
def train_evaluate(net, optimizer, writer, epochs):
    starttime = time.time()
    criterion = nn.CrossEntropyLoss()
    # criterion = nn.NLLLoss()
    accuracy_func = Accuracy(num_classes=3, average='weighted').to(device)
    f1_score_func = F1Score(num_classes=3, average='weighted').to(device)
    for epoch in range(epochs):

        running_loss = 0.0
        f1 = 0
        accuracy = 0
        for data in trainloader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            _, pred = torch.max(outputs.data, 1)
            accuracy += accuracy_func(pred, labels) * 100
            f1 += f1_score_func(pred, labels) * 100

        running_loss /= len(trainloader)
        accuracy /= len(trainloader)
        f1 /= len(trainloader)
        writer.add_scalar('Training_Loss', running_loss, epoch)
        writer.add_scalar('Training_Accuracy', accuracy, epoch)
        writer.add_scalar('Training_F1', f1, epoch)

        print('Epoch {} - train loss:{}, accuracy:{}, f1_score:{}, time passed {}s'.format(epoch+1, running_loss, accuracy, f1, int(time.time()-starttime)))

        val_loss = 0.0
        val_accuracy = 0
        val_f1_score = 0
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                _, pred = torch.max(outputs.data, 1)
                val_loss += loss.item()
                val_accuracy += accuracy_func(pred, labels) * 100
                val_f1_score += f1_score_func(pred, labels) * 100
            val_loss /= len(valloader)
            val_accuracy /= len(valloader)
            val_f1_score /= len(valloader)
            writer.add_scalar('Val_Loss', val_loss, epoch)
            writer.add_scalar('Val_Accuracy', val_accuracy, epoch)
            writer.add_scalar('Val_F1', val_f1_score, epoch)

        print('Epoch {} - val loss:{}, accuracy:{}, f1_score:{}, time passed {}s'.format(epoch+1, val_loss, val_accuracy, val_f1_score, int(time.time()-starttime)))

In [68]:
def test_evaluate(net, optimizer):
    val_loss = 0.0
    val_accuracy = 0
    val_f1_score = 0
    criterion = nn.CrossEntropyLoss()
    accuracy_func = Accuracy(num_classes=3, average='weighted').to(device)
    f1_score_func = F1Score(num_classes=3, average='weighted').to(device)
    with torch.no_grad():
        for data in testloader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            _, pred = torch.max(outputs.data, 1)
            val_loss += loss.item()
            val_accuracy += accuracy_func(pred, labels) * 100
            val_f1_score += f1_score_func(pred, labels) * 100
        val_loss /= len(testloader)
        val_accuracy /= len(testloader)
        val_f1_score /= len(testloader)
    print('Test evaluation - test loss:{}, accuracy:{}, f1_score:{}'.format(val_loss, val_accuracy, val_f1_score))

### Task 1

#### Section 3.1: Machine learning or Deep learning model defining, training and hyper-parameters turning

In this section two base models and one ensemble model were created

In [21]:
from torchsummary import summary
import torch
import torch.nn as nn
device = torch.device("cuda:0")

In [31]:
class StartModel(nn.Module):
    def __init__(self):
        super(StartModel, self).__init__()
        self.network=nn.Sequential(
            nn.Linear(in_features=4, out_features=3, bias=True))
            # nn.LogSoftmax(dim=1))

    def forward(self, x):
        return self.network(x)

startmodel = StartModel().to(device)

In [55]:
class ImprovedModel(nn.Module):
    def __init__(self):
        super(ImprovedModel, self).__init__()
        self.network=nn.Sequential(
            nn.Linear(in_features=4, out_features=200, bias=True),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=200, out_features=100, bias=True),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=100, out_features=50, bias=True),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=50, out_features=3, bias=True),
            nn.LogSoftmax(dim=1))

    def forward(self, x):
        return self.network(x)

improvedmodel = ImprovedModel().to(device)

In [58]:
class Ensembling(nn.Module):
    def __init__(self, modelA, modelB):
        super(Ensembling, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.classifier = nn.Linear(6, 3)

    def forward(self, x):
        x1 = self.modelA(x)
        x2 = self.modelB(x)
        x = torch.cat((x1, x2), dim=1)
        x = self.classifier(F.relu(x))
        return x
ensembling = Ensembling(startmodel, improvedmodel).to(device)

#### Section 4.1: Model performance evaluation

In [33]:
optimizer = torch.optim.Adadelta(startmodel.parameters())
writer = SummaryWriter('runs1/BaseModelbase2')
train_evaluate(startmodel, optimizer, writer, 5)

Epoch 1 - train loss:0.5271992166363615, accuracy:81.0798568725586, f1_score:80.17546081542969, time passed 79s
Epoch 1 - val loss:0.44192712858769645, accuracy:84.21276092529297, f1_score:84.09778594970703, time passed 93s
Epoch 2 - train loss:0.4349664609968026, accuracy:84.39073181152344, f1_score:84.29925537109375, time passed 164s
Epoch 2 - val loss:0.4330424742379107, accuracy:84.37444305419922, f1_score:84.27202606201172, time passed 186s
Epoch 3 - train loss:0.4308922117253063, accuracy:84.55455780029297, f1_score:84.47653198242188, time passed 248s
Epoch 3 - val loss:0.43159701849447757, accuracy:84.45854187011719, f1_score:84.34502410888672, time passed 262s
Epoch 4 - train loss:0.4299958995648261, accuracy:84.61475372314453, f1_score:84.53621673583984, time passed 322s
Epoch 4 - val loss:0.43106506816778484, accuracy:84.52024841308594, f1_score:84.45096588134766, time passed 335s
Epoch 5 - train loss:0.42980250601114167, accuracy:84.61400604248047, f1_score:84.54107666015625

In [69]:
# testing first model
test_evaluate(startmodel, optimizer)

Test evaluation - test loss:0.5310734307307463, accuracy:78.1305923461914, f1_score:76.17174530029297


In [57]:
optimizer = torch.optim.Adadelta(improvedmodel.parameters())
writer = SummaryWriter('runs1/ImprovedModel')
train_evaluate(improvedmodel, optimizer, writer, 5)

Epoch 1 - train loss:0.4250807216080271, accuracy:84.07757568359375, f1_score:84.06288146972656, time passed 70s
Epoch 1 - val loss:0.4297611152804634, accuracy:83.77128601074219, f1_score:83.61913299560547, time passed 83s
Epoch 2 - train loss:0.42404856910789807, accuracy:84.2015609741211, f1_score:84.18572998046875, time passed 150s
Epoch 2 - val loss:0.42351916195450257, accuracy:84.26537322998047, f1_score:84.23945617675781, time passed 162s
Epoch 3 - train loss:0.42249655151034143, accuracy:84.23336791992188, f1_score:84.21257781982422, time passed 229s
Epoch 3 - val loss:0.42559631884528837, accuracy:84.10763549804688, f1_score:84.15430450439453, time passed 242s
Epoch 4 - train loss:0.4222891642578136, accuracy:84.2564697265625, f1_score:84.2392807006836, time passed 309s
Epoch 4 - val loss:0.42187589666415765, accuracy:84.21466827392578, f1_score:84.18081665039062, time passed 321s
Epoch 5 - train loss:0.422060643994818, accuracy:84.24461364746094, f1_score:84.22643280029297, 

In [70]:
# testing second model
test_evaluate(improvedmodel, optimizer)

Test evaluation - test loss:0.42100789078153095, accuracy:84.31564331054688, f1_score:84.35420989990234


In [60]:
optimizer = torch.optim.Adadelta(ensembling.parameters())
writer = SummaryWriter('runs1/Ensembling1')
train_evaluate(ensembling, optimizer, writer, 5)

Epoch 1 - train loss:0.4117550442452889, accuracy:84.95167541503906, f1_score:84.87709045410156, time passed 73s
Epoch 1 - val loss:0.41250681546216883, accuracy:85.0295639038086, f1_score:84.96405792236328, time passed 86s
Epoch 2 - train loss:0.4116206688673259, accuracy:84.96923828125, f1_score:84.89723205566406, time passed 167s
Epoch 2 - val loss:0.41269609624821, accuracy:85.01170349121094, f1_score:84.95594787597656, time passed 185s
Epoch 3 - train loss:0.41153104872172647, accuracy:84.96698760986328, f1_score:84.89115142822266, time passed 256s
Epoch 3 - val loss:0.41389455632569755, accuracy:84.98285675048828, f1_score:84.95818328857422, time passed 269s
Epoch 4 - train loss:0.41142231127524237, accuracy:84.98345947265625, f1_score:84.90711212158203, time passed 340s
Epoch 4 - val loss:0.41243641238057505, accuracy:84.91064453125, f1_score:84.79624938964844, time passed 353s
Epoch 5 - train loss:0.41136383243094926, accuracy:84.9690170288086, f1_score:84.89311981201172, time 

In [72]:
# testing ensembling
test_evaluate(ensembling, optimizer)

Test evaluation - test loss:0.406222931341483, accuracy:84.96727752685547, f1_score:84.85663604736328


### Task 2

#### Section 3.2: Machine learning or Deep learning model defining, training and hyper-parameters turning

In [16]:
from sklearn.ensemble import RandomForestClassifier
randomforestfunc = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=9)
randomforest = randomforestfunc.fit(x_train,y_train)

#### Section 4.2: Model performance evaluation

In [17]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = randomforest.predict(x_test) # model testing
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test,y_pred,average='weighted'))

Accuracy: 0.8517749157549739
F1-score: 0.8513006286481531


In [26]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig = plt.figure(figsize=(20, 15))
plot_tree(randomforest.estimators_[0], # creating image of first tree
          feature_names=x_train[selected_features].columns,
          filled=True, impurity=True,
          rounded=True)
writer = SummaryWriter('runs1/RandomForest') # write tree image to tensorboard
writer.add_figure('First tree',fig)
plt.show()

#### Section 5: Conclusion and possible improvements

### Task 1.1
The goal is to implement deep learning model and improve it's performance with ensembling learning.

To do this, two models was created. First one is very simple with one FC layer. Second is much more complicated. This two models was connected together in ensemble model.

As a result, accuracy and f1_score was close to 85%. Multiple approaches was used to achieve 90%. These attempts consist of creating different DNN structures with combination of FC, ReLU, BatchNormalization and Dropout layers as well as using different optimization and loss functions. Tuning of hyperparameters, such as learning rate, batch size and number of epochs also have no effect on results. Some of the best results are presented in tensorboard.

According to tests, all used methods have no effect on metrics so may be it is possible to say that the highest model efficiency on this data was achieved.

### Task 2.2
The goal is to select, train and evaluate an appropriate ML model that will provide an ability to understand why a specific prediction was achieved.

Decision tree was used to provide this goal. To improve model performance was used 100 of trees in ensemble. RandomForest parameters criterion='gini' and max_depth=9 was chosen due to a variety of tests, according to the results of which, the model got 85% accuracy and f1_score.

Example of decision tree from RandomForest presented in tensorboard.


In [69]:
%load_ext tensorboard
%tensorboard --logdir runs1

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Launching TensorBoard...