In [116]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import load_breast_cancer

# Fetch the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Reserve 200 rows for evaluation; everything else is used for training.
# NOTE(review): no random_state is given, so each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=200)


In [117]:
# Baseline: logistic regression with C=10e8. C is scikit-learn's inverse
# regularisation strength, so a value this large leaves the penalty negligible.
model = LogisticRegression(C=10e8)
model.fit(X_train, y_train)

# Cell result: (accuracy on the training split, accuracy on the test split).
(
    accuracy_score(y_train, model.predict(X_train)),
    accuracy_score(y_test, model.predict(X_test)),
)



(0.975609756097561, 0.955)

In [128]:
from sklearn.neural_network import MLPClassifier
?MLPClassifier

In [127]:
# Standardise the features, then train an MLP with two hidden layers of 100
# units each; alpha=0 turns off the L2 penalty.
model = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100, 100), alpha=0, max_iter=1000),
)
model.fit(X_train, y_train)

# Cell result: (accuracy on the training split, accuracy on the test split).
(
    accuracy_score(y_train, model.predict(X_train)),
    accuracy_score(y_test, model.predict(X_test)),
)

(1.0, 0.975)

In [115]:
# Sanity check: dimensions of the held-out test matrix.
X_test.shape

(200, 30)

In [120]:
# Display the training features rounded to whole numbers to get a feel for
# how different the column scales are.
X_train.round()

array([[ 20.,  22., 130., ...,   0.,   0.,   0.],
       [ 17.,  25., 116., ...,   0.,   0.,   0.],
       [ 12.,  18.,  75., ...,   0.,   0.,   0.],
       ...,
       [ 13.,  17.,  85., ...,   0.,   0.,   0.],
       [ 13.,  20.,  87., ...,   0.,   0.,   0.],
       [ 12.,  22.,  77., ...,   0.,   0.,   0.]])

In [133]:
from sklearn.datasets import load_digits

# Switch to the digits dataset; this rebinds data, X, y and the split variables.
data = load_digits()
X, y = data.data, data.target

# Reserve 400 samples for evaluation.
# NOTE(review): no random_state is given, so each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)


Przetestować różne warianty sieci - różne wielkości, różne funkcje aktywacji + ew. inne czynniki

In [136]:
# Inspect the weight matrices (coefs_) of the estimator in the pipeline's last step.
# NOTE(review): this relies on whichever pipeline `model` was bound to when the
# cell was executed; the result depends on cell execution order.
model.steps[-1][1].coefs_

[array([[ 0.02291611, -0.16298693, -0.00896089, ...,  0.04233328,
          0.02535449, -0.08959011],
        [-0.18619987,  0.08777913,  0.18635325, ..., -0.06692502,
         -0.10192085, -0.19063453],
        [ 0.11453958, -0.10001044,  0.18795714, ..., -0.16203433,
         -0.03111186,  0.10699505],
        ...,
        [-0.0678079 , -0.03121434,  0.044178  , ..., -0.15903743,
          0.03860881,  0.1046871 ],
        [ 0.04053296, -0.05922321,  0.03091512, ..., -0.12438205,
         -0.16988246,  0.01194878],
        [-0.3065712 ,  0.00187153,  0.1121093 , ..., -0.04029514,
         -0.17433568,  0.09362454]]),
 array([[ 0.04063633,  0.13391882,  0.09765101, ..., -0.08259058,
         -0.04781418, -0.0094138 ],
        [ 0.0980422 , -0.17992812, -0.02875861, ..., -0.16216962,
          0.07395734,  0.18581597],
        [ 0.16473524,  0.0130511 , -0.10467707, ...,  0.01559749,
         -0.17677697,  0.12438757],
        ...,
        [ 0.00189682, -0.27786268, -0.15457847, ...,  

In [143]:
# Sweep the width of a single hidden layer (ReLU activation, no L2 penalty,
# early stopping on) and print: width, train accuracy, test accuracy.
for h_dim in (1, 3, 5, 10, 25, 50, 100, 250, 500, 1000):
    model = make_pipeline(
        StandardScaler(),
        MLPClassifier(
            hidden_layer_sizes=h_dim,
            activation="relu",
            alpha=0,
            early_stopping=True,
        ),
    )
    model.fit(X_train, y_train)
    print(
        h_dim,
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
    )

1 0.1882605583392985 0.19
3 0.11023622047244094 0.0725
5 0.7566213314244811 0.72
10 0.9377236936292055 0.91
25 0.9384395132426628 0.89
50 0.9692197566213314 0.9425
100 0.9928418038654259 0.96
250 0.9957050823192556 0.9675
500 0.9971367215461704 0.9725
1000 0.9949892627057981 0.9725


In [141]:
# Same sweep for very wide single hidden layers, this time with the logistic
# activation and up to 1000 iterations instead of early stopping.
for h_dim in (2000, 3000):
    model = make_pipeline(
        StandardScaler(),
        MLPClassifier(
            hidden_layer_sizes=h_dim,
            activation="logistic",
            alpha=0,
            max_iter=1000,
        ),
    )
    model.fit(X_train, y_train)
    print(
        h_dim,
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
    )

2000 1.0 0.96
3000 1.0 0.9575


# Keras

In [154]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [155]:
# Reload the digits data and hold out 400 samples for testing.
data = load_digits()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

# Standardise the features: statistics are learned on the training split only
# and then re-used for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Turn the integer labels into one-hot rows for the categorical cross-entropy loss.
# NOTE(review): this cell overwrites its own inputs, so re-running only the
# lines below the split would scale/encode already processed data.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [158]:
# Show the first three one-hot encoded label rows.
y_train[:3]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [159]:
# Fully connected classifier: three ReLU hidden layers (100, 100 and 50 units)
# followed by a softmax layer with one unit per one-hot label column.
model = Sequential([
    Dense(100, activation="relu", input_shape=(X_train.shape[1],)),
    Dense(100, activation="relu"),
    Dense(50, activation="relu"),
    Dense(y_train.shape[1], activation="softmax"),
])

model.summary()

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["categorical_accuracy"],
)

# Train for 10 passes over the training data, then score on the test split.
model.fit(X_train, y_train, epochs=10)

model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               6500      
_________________________________________________________________
dense_14 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_15 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_16 (Dense)             (None, 10)                510       
Total params: 22,160
Trainable params: 22,160
Non-trainable params: 0
_________________________________________________________________


In [160]:
# Configure training: categorical cross-entropy loss, Adam optimiser, and
# categorical accuracy reported as the metric.
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["categorical_accuracy"])

In [161]:
# Train for 10 epochs on the training split.
# NOTE(review): if `model` was already fitted by an earlier cell, this
# presumably continues from its current weights rather than starting fresh.
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff67d0db898>

In [165]:
# Score the model on the held-out split; the displayed result lists the loss
# followed by the compiled metric.
model.evaluate(X_test, y_test)



[0.13840521097183228, 0.9725]

In [169]:
# Cross-check Keras' reported accuracy with scikit-learn. Both the one-hot
# labels and the predicted class probabilities are reduced to class indices
# with argmax. `Sequential.predict_classes` was deprecated and later removed
# from Keras, so the argmax is taken over `predict` directly.
accuracy_score(y_test.argmax(1), model.predict(X_test).argmax(1))

0.9725

In [170]:
?model.fit

In [181]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
import os
from keras.regularizers import l2


# Stop training once the validation loss has not improved for 3 epochs.
early_stopping = EarlyStopping(patience=3, monitor="val_loss")
# Keep only the best checkpoint seen during training in a temporary file.
# NOTE(review): newer Keras releases are stricter about checkpoint file
# extensions -- confirm "wagi.h5py" is accepted by the installed version.
save_best = ModelCheckpoint("wagi.h5py",save_best_only=True)
# L2 weight penalty with factor 0.01; the same regulariser object is passed
# to every Dense layer below.
weights_regularization = l2(0.01)

model = Sequential()

# Two ReLU hidden layers (100 and 50 units), each with the L2 kernel penalty.
model.add(Dense(100,
                activation="relu", 
                kernel_regularizer=weights_regularization,
                input_shape=(X_train.shape[1],)))
model.add(Dense(50,
                activation="relu", 
                kernel_regularizer=weights_regularization))

# Softmax output layer with one unit per one-hot label column.
model.add(Dense(y_train.shape[1],
                activation="softmax", 
                kernel_regularizer=weights_regularization))

model.summary()

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["categorical_accuracy"])

# 15% of the training data is set aside for validation, which feeds both
# callbacks; epochs=100 is only an upper bound because of early stopping.
model.fit(X_train, y_train, 
          validation_split=0.15, 
          callbacks=[early_stopping, save_best],
          epochs=100)

# Roll back to the best checkpoint, then delete the temporary file.
model.load_weights("wagi.h5py")
os.remove("wagi.h5py")

model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 100)               6500      
_________________________________________________________________
dense_27 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_28 (Dense)             (None, 10)                510       
Total params: 12,060
Trainable params: 12,060
Non-trainable params: 0
_________________________________________________________________
Train on 1187 samples, validate on 210 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


[0.6089582085609436, 0.9625]

In [183]:
from keras.layers import Dropout

# Stop training once the validation loss has not improved for 3 epochs.
early_stopping = EarlyStopping(patience=3, monitor="val_loss")

# Same three-hidden-layer architecture as before, now with dropout (rate 0.5)
# after every hidden layer as the regulariser.
model = Sequential([
    Dense(100, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(100, activation="relu"),
    Dropout(0.5),
    Dense(50, activation="relu"),
    Dropout(0.5),
    Dense(y_train.shape[1], activation="softmax"),
])

model.summary()

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["categorical_accuracy"],
)

# 15% of the training data is set aside for validation; epochs=100 is only an
# upper bound because of early stopping.
model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    callbacks=[early_stopping],
    epochs=100,
)

model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 100)               6500      
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 10)                510       
Total para

[0.1790871960297227, 0.9675]

In [198]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import string
import re
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Load the training portion of 20 Newsgroups, restricted to the four science groups.
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
)

# Raw posts and their labels.
X, y = newsgroups.data, newsgroups.target

# Number of loaded posts.
len(X)

2373

In [199]:
def extract_content(mail):
    """Return the body of a raw mail: everything from the first blank line on.

    The returned text starts at the first "\\n\\n" separator (the separator
    itself is included). If the mail contains no blank line at all, the whole
    text is returned unchanged; slicing with the -1 that ``str.find`` reports
    in that case would return only the last character.
    """
    separator_at = mail.find("\n\n")
    if separator_at == -1:
        return mail
    return mail[separator_at:]

def extract_subject(mail):
    """Return the text following the first "Subject:" marker in a raw mail.

    The result is the remainder of that line exactly as written, including
    any leading space. If the mail has no "Subject:" followed by at least one
    character, an empty string is returned instead of raising IndexError.
    """
    match = re.search(r'Subject:(.+)', mail)
    return match.group(1) if match else ""


def stem_helper(word,stemmer):
    """Stem ``word`` with ``stemmer``, falling back to the word itself on error.

    word    - token to stem
    stemmer - object exposing a ``stem(word)`` method

    Only ordinary exceptions are swallowed. KeyboardInterrupt and SystemExit
    propagate, so a long cleaning run can still be interrupted.
    """
    try:
        return stemmer.stem(word)
    except Exception:
        # Best effort: keep the unstemmed token rather than abort the run.
        return word



def clean_text(x, stemmer = nltk.PorterStemmer()):   
    """Normalise one mail into a space-separated string of stemmed tokens.

    x       - one mail (or any piece of text) as a string
    stemmer - object with a ``stem(word)`` method; the default PorterStemmer
              instance is created once, when the function is defined, and
              shared by all calls

    Steps: tokenise, strip punctuation characters, lower-case, drop English
    stopwords (plus "nt") and tokens left empty by the punctuation removal,
    stem, and join with single spaces.
    """
    # tokenisation - split the text into a list of tokens
    tokens = nltk.word_tokenize(x)
    # translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # a set gives constant-time membership tests inside the loop
    stopwords = frozenset(nltk.corpus.stopwords.words("english") + ["nt"])

    kept = []
    for token in tokens:
        token = token.translate(strip_punctuation).lower()
        # pure-punctuation tokens are empty by now; skip them with the
        # stopwords so they do not end up as extra spaces in the output
        if not token or token in stopwords:
            continue
        kept.append(stem_helper(token, stemmer))

    # glue the surviving tokens back into one string
    return ' '.join(kept)


# Clean the body and the subject line of every post. The raw posts are read
# from `newsgroups.data` rather than from `X`, because `X` is rebound to the
# DataFrame below: re-running this cell would otherwise iterate over the
# DataFrame's column names instead of the posts.
X_c = [clean_text(extract_content(x)) for x in newsgroups.data]
X_s = [clean_text(extract_subject(x)) for x in newsgroups.data]

# One row per post, with the cleaned subject and the cleaned body as columns.
X = pd.DataFrame({"subject":X_s, "content":X_c})

# Separate bag-of-words vocabularies per column: at most 3000 terms for the
# body and at most 1000 terms for the subject.
vectorizer = ColumnTransformer([
    ("content_vectorization", CountVectorizer(max_features=3000),"content"),
    ("subject_vectorization", CountVectorizer(max_features=1000),"subject")
])

In [200]:
# Display the cleaned subject/content frame.
X

Unnamed: 0,content,subject
0,amolitor nmsuedu andrew molitor write ye ...,tap code good
1,titl say contact via email would help mike h...,want protel easytrax mac
2,articl 3hgf3b3w165w shakalacom dant shakal...,moonbas race
3,want abl take bunch homemad song dat suitabl ...,make reallyshortrun cd
4,anyon know size cold ga roll control thruster...,cold ga tank sound rocket
5,articl 1qmugcinnpu9 gapcaltechedu hal ccoc...,text white hous announc q clipper chip encrypt
6,l levin bbncom joel b levin write l jo...,select placebo
7,articl c5qwv2bz0 zootorontoedu henri zooto...,motorola xc68882rc33 rc50
8,flight test gener care coreograph go push ...,push envelop
9,al escomcom al donaldson write amolitor ...,tap code good


In [201]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [214]:
# Bag-of-words features per column: at most 3000 terms from "content" and at
# most 1000 terms from "subject".
# NOTE(review): this repeats the `vectorizer` definition from the text-cleaning
# cell above; one of the two is redundant.
vectorizer = ColumnTransformer([
    ("content_vectorization", CountVectorizer(max_features=3000),"content"),
    ("subject_vectorization", CountVectorizer(max_features=1000),"subject")
])

In [215]:
# Hold out 500 posts for testing.
# NOTE(review): no random_state is given, so each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=500)

# One-hot encode the labels. The number of classes is passed explicitly so the
# train and test matrices always have the same width; otherwise to_categorical
# infers it from the largest label present in each split separately.
y_train = to_categorical(y_train, num_classes=len(newsgroups.target_names))
y_test = to_categorical(y_test, num_classes=len(newsgroups.target_names))

# Learn the vocabularies on the training posts only, then apply them to the
# test posts.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)