# 1. Fully connected neural network

In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from keras.utils import np_utils


from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score, classification_report, confusion_matrix


In [162]:
# Load the drug-classification table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='drug200.csv')

In [163]:
# Mark the label-valued columns (three features plus the 'Drug' target) as pandas
# categoricals so later cells can select them by dtype and read their codes.
for categorical_name in ('Sex', 'BP', 'Cholesterol', 'Drug'):
    df[categorical_name] = pd.Categorical(df[categorical_name])
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [164]:
# One-hot encode every categorical feature. 'Drug' is the prediction target and
# stays a single categorical column so its integer codes can be read later.
feature_columns = [
    column
    for column in df.select_dtypes(include='category').columns
    if column != 'Drug'
]
# The guard keeps a re-run of this cell a no-op once the features are encoded.
if feature_columns:
    # A single get_dummies call replaces the concat/drop loop and yields the same
    # column order: untouched columns first, then the '<column>_<level>' indicators.
    # dtype is pinned because pandas >= 2.0 defaults to bool indicators, which turn
    # the mixed int/float/bool feature frame into an object array when it is
    # converted for Keras; uint8 is what pandas 1.x produced by default.
    df = pd.get_dummies(df, columns=feature_columns, dtype='uint8')

In [165]:
# Integer class id for each row, taken from the categorical 'Drug' target.
Y = df['Drug'].cat.codes
# Fit the label encoder and apply it in a single step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
# Expand the integer class ids into one-hot rows.
dummy_y = np_utils.to_categorical(encoded_Y)

In [166]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Drug'),
    dummy_y,
    test_size=0.2,
    random_state=42,
)

In [167]:
# Standardize the int64/float64 feature columns. The scaler is fitted on the
# training split only and then applied to both splits, so no test-set statistics
# leak into preprocessing.
#
# Explicit copies are taken first: the frames returned by train_test_split are
# derived from `df`, and assigning into them can raise SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()
numeric_columns = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
if numeric_columns:
    # StandardScaler keeps a separate mean/std per feature, so one scaler over all
    # numeric columns is equivalent to fitting one scaler per column.
    scale = StandardScaler().fit(X_train[numeric_columns])
    X_train[numeric_columns] = scale.transform(X_train[numeric_columns])
    X_test[numeric_columns] = scale.transform(X_test[numeric_columns])

In [168]:
# Small fully connected classifier: two ReLU hidden layers and a softmax output.
# The input width and the number of output units are read from the training
# arrays instead of being hard-coded, so the network keeps matching the data if
# the encoded feature set or the number of classes changes.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]  # y_train holds one-hot rows at this point
model = Sequential([
    Dense(32, input_dim=n_features, activation='relu'),
    Dense(8, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

In [169]:
# Multi-class setup: cross-entropy against the one-hot targets, Adam optimizer,
# accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [170]:
# Train for 25 epochs on the training split; batch size is left at the Keras default.
model.fit(x=X_train, y=y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f231c546e50>

In [171]:
# Softmax outputs for the held-out rows: one row of class probabilities per sample.
pred = model.predict(x=X_test)
pred

array([[0.27387172, 0.1684021 , 0.0681873 , 0.12414631, 0.3653926 ],
       [0.48998672, 0.08895952, 0.02025304, 0.10800317, 0.29279754],
       [0.07679363, 0.11225224, 0.02348699, 0.0434214 , 0.74404573],
       [0.33614996, 0.12480108, 0.08417764, 0.1681133 , 0.286758  ],
       [0.9115796 , 0.01152067, 0.00186362, 0.04490192, 0.03013421],
       [0.5658961 , 0.08734766, 0.0624288 , 0.14480363, 0.13952376],
       [0.8307367 , 0.02693978, 0.00369932, 0.07280026, 0.06582397],
       [0.21920843, 0.12462015, 0.02433743, 0.07620132, 0.55563265],
       [0.2844744 , 0.17773117, 0.10910533, 0.15153931, 0.27714974],
       [0.19008805, 0.15016937, 0.07452508, 0.11055429, 0.4746632 ],
       [0.4862642 , 0.1181571 , 0.04509943, 0.13536642, 0.21511279],
       [0.28424355, 0.11612594, 0.03005511, 0.10402436, 0.46555105],
       [0.686741  , 0.0634446 , 0.01958595, 0.10497835, 0.12525007],
       [0.5096785 , 0.08728599, 0.02038228, 0.11688644, 0.26576677],
       [0.21940872, 0.15504624, 0.

In [172]:
# Predicted class id = position of the largest probability in each row.
y_pred = np.argmax(pred, axis=1)
y_pred

array([4, 0, 4, 0, 0, 0, 0, 4, 0, 4, 0, 4, 0, 0, 2, 0, 2, 4, 4, 0, 0, 0,
       4, 0, 0, 0, 4, 4, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0])

In [173]:
# Collapse the one-hot test targets to integer class ids for the sklearn metrics.
# The cell rebinds `y_test`, so the conversion is guarded on dimensionality:
# without the check, running the cell a second time would take argmax of scalars
# and silently turn every label into 0.
y_test = np.asarray(y_test)
if y_test.ndim > 1:
    y_test = y_test.argmax(axis=1)
y_test

array([4, 0, 4, 3, 0, 0, 0, 4, 1, 4, 1, 4, 0, 1, 2, 0, 2, 4, 3, 0, 2, 4,
       4, 0, 0, 0, 3, 4, 0, 4, 0, 3, 3, 0, 1, 0, 4, 1, 0, 1])

In [177]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

0.44547189819724287

In [178]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.58      1.00      0.73        15
           1       0.00      0.00      0.00         6
           2       1.00      0.67      0.80         3
           3       0.00      0.00      0.00         5
           4       0.67      0.73      0.70        11

    accuracy                           0.62        40
   macro avg       0.45      0.48      0.45        40
weighted avg       0.47      0.62      0.53        40



In [179]:
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[15  0  0  0  0]
 [ 5  0  0  0  1]
 [ 1  0  2  0  0]
 [ 2  0  0  0  3]
 [ 3  0  0  0  8]]


# 2. CNN - [lab4_2.ipynb](lab4_2.ipynb) (done in Google Colab)

To run it, download https://www.kaggle.com/alxmamaev/flowers-recognition and unpack it into the Google Drive folder that contains the .ipynb file.

Then run all cells and wait for your entire life - images are loading. 

You can set DEBUG = True to make sure you are not wasting your time and the images are actually loading. However, your browser may be unable to render that much output, so clear the output of the loading cell from time to time.

# 3. RNN

In [92]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalMaxPooling1D, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [93]:
# Tab-separated file read with explicit column names: the first field is the
# class label, the second is the message text.
data = pd.read_csv(
    'SMSSpamCollection.csv',
    sep='\t',
    names=['target', 'headline'],
)

In [94]:
# Replace the class labels with their integer category codes.
data['target'] = pd.Categorical(data['target']).codes

In [95]:
# Message strings and their numeric labels as NumPy arrays.
text, labels = data['headline'].values, data['target'].values
# 80/20 train/test split with a fixed seed.
text_train, text_test, y_train, y_test = train_test_split(
    text, labels, test_size=0.2, random_state=42
)
print(text_train.shape, text_test.shape)

(1600,) (400,)


In [96]:
# Length, in characters, of the longest message.
data['headline'].str.len().max()

910

In [97]:
vocab_size = 5000      # passed to the tokenizer as num_words and to Embedding as its input size
maxlen = 135           # every sequence is padded / truncated to this many tokens
embedding_size = 32    # width of each word vector in the Embedding layer

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text_train)

# Texts -> integer id sequences -> fixed-length arrays.
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train), maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(text_test), maxlen=maxlen)

In [98]:
# Embedding -> SimpleRNN (all time steps kept) -> max over time -> dropout -> sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=maxlen),
    SimpleRNN(32, return_sequences=True),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 135, 32)           160000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 135, 32)           2080      
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 32)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 1)                 33        
Total params: 162,113
Trainable params: 162,113
Non-trainable params: 0
_________________________________________________________________


In [99]:
# Binary classification: the sigmoid output is paired with binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The test split is also passed as validation data, so it is evaluated after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=512,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [105]:
# Threshold the sigmoid outputs at 0.5 to obtain boolean class predictions.
prediction = model.predict(x_test) > 0.5
accuracy_percent = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(accuracy_percent))

accuracy: 85.0%
