# Neural Network Classifiers
## Marlene Aviles 

1. Neural Network Classifier with Scikit

2. Neural Network Classifier with Keras

3. Classifying Images


In [158]:
import json
import pandas as pd
import numpy as np
import re 
from numpy import loadtxt # part 2
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential # part 2
from keras.layers import Dense # part 2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

### 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [233]:
import pandas as pd 
# Read the JSON Lines corpus (one record per line) and preview the first five rows.
comments_path = "categorized-comments.jsonl"
mlp_df = pd.read_json(comments_path, lines=True)
mlp_df.head(n=5)

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [234]:
# N = 2347476 rows in total
# 4 different categories
# Summary statistics, transposed so each column of the frame becomes a row.
mlp_df.describe().T

Unnamed: 0,count,unique,top,freq
cat,2347476,4,video_games,1005720
txt,2347476,2097835,[deleted],93979


In [235]:
# Looking deeper into the categories: number of rows per category.
# The index of the result already lists the unique labels, so the separate
# .unique() call (whose result was computed and then discarded) is dropped.
mlp_df.groupby(["cat"]).size()

cat
news                       408311
science_and_technology     158246
sports                     775199
video_games               1005720
dtype: int64

In [236]:
# Create function to clean text 

def clean_data(text):
    """
    Scrub a comment so that only upper-case letters and spaces remain.

    Three `re.sub` passes run in order: spans delimited by the `&lt;` and
    `&gt;` entities are swapped for a placeholder; every digit, every run
    of non-word characters and every underscore becomes one space; any
    character that is still not an ASCII letter becomes a space. The
    result is then upper-cased. Spaces are neither collapsed nor trimmed.

    NOTE(review): the placeholder inserted by the first pass is itself
    scrubbed by the later passes, leaving the letters LT and GT in the
    text — confirm that is intended.
    """
    substitutions = (
        ('&lt;/?.*?&gt;', ' &lt;&gt'),
        ('\\d|\\W+|_', ' '),
        ('[^a-zA-Z]', " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.upper()

# Draw an equally sized random sample from every category, then clean its text.

size = 50000 # rows drawn per category; must not exceed the smallest category (158246 rows)
replace = False  # without replacement: duplicated rows could land on both sides of the train/test split
SAMPLE_SEED = 40  # fixed seed so the same rows are drawn on every run
fn = lambda obj: obj.sample(n=size, replace=replace, random_state=SAMPLE_SEED) # random rows of one group

mlp_clean = mlp_df.groupby('cat', as_index=False).apply(fn)

# The full corpus is no longer needed once the sample exists; release it.
# (Re-running this cell therefore requires re-running the load cell first.)
del mlp_df

# Apply clean_data to the txt column, then discard the index left behind by
# groupby().apply() in favour of a plain 0..n-1 range.
mlp_clean['txt'] = mlp_clean['txt'].apply(clean_data)
mlp_clean.reset_index(drop=True, inplace=True)

mlp_clean.tail() # verify function was applied and worked

Unnamed: 0,cat,txt
199995,video_games,THAT S THE JOKE
199996,video_games,YEAH THE PISTOL WAS DEFINITELY OP IN ME
199997,video_games,CHECKS AMAZON PRE ORDER DAMN
199998,video_games,LIKE THE OTHER USER SAID THEY CAN BE EFFECTIVE...
199999,video_games,LAPTOPS MAYBE YOU WANT TO DOWNLOAD A GAME BEFO...


In [239]:
# NLTK stop words for common english words.
# The list gets its own name: rebinding `stopwords` to the list shadowed the
# corpus reader imported at the top of the notebook, so running the cell a
# second time failed on `stopwords.words`.
english_stopwords = stopwords.words('english')

# Bag-of-words counts with the stop words removed. CountVectorizer lower-cases
# its input by default, so the upper-cased comments still match the lower-case
# stop-word list.
cv = CountVectorizer(stop_words=english_stopwords)

# split into input (X) and output (y) variables
X = cv.fit_transform(mlp_clean['txt'])
Y = mlp_clean['cat']

# Test/Train split, 30/70 ratio, random state 40 for seeding
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=40)

# Looking at dimensions of test and training
print(X_train.shape)
print(X_test.shape)


(140000, 86397)
(60000, 86397)


In [246]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of five units each.
# random_state fixes the weight initialisation and batch shuffling so the
# reported metrics can be reproduced on a re-run.
mlp = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='relu', solver='adam',
                    max_iter=500, random_state=40)
mlp.fit(X_train,y_train)

# Predicted categories for both splits, used by the evaluation cells.
predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)



In [266]:
# Training-split evaluation: confusion matrix followed by the per-class
# precision / recall / F1 report.
from sklearn.metrics import classification_report,confusion_matrix

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

train_matrix = confusion_matrix(y_train, predict_train)
print("MATRIX: ", train_matrix)

train_report = classification_report(y_train, predict_train)
print(train_report)

MATRIX:  [[24304  3863  2091  4767]
 [10286 15918  5245  3483]
 [ 1894  5313 20985  6919]
 [ 3260  3508  9362 18802]]
                        precision    recall  f1-score   support

                  news       0.61      0.69      0.65     35025
science_and_technology       0.56      0.46      0.50     34932
                sports       0.56      0.60      0.58     35111
           video_games       0.55      0.54      0.55     34932

              accuracy                           0.57    140000
             macro avg       0.57      0.57      0.57    140000
          weighted avg       0.57      0.57      0.57    140000



In [267]:
# Test-split evaluation: the same two reports, computed on the held-out rows.
test_matrix = confusion_matrix(y_test, predict_test)
test_report = classification_report(y_test, predict_test)

print("MATRIX: ", test_matrix)
print(test_report)

MATRIX:  [[10083  1798   925  2169]
 [ 4564  6514  2299  1691]
 [  923  2547  8300  3119]
 [ 1494  1770  4228  7576]]
                        precision    recall  f1-score   support

                  news       0.59      0.67      0.63     14975
science_and_technology       0.52      0.43      0.47     15068
                sports       0.53      0.56      0.54     14889
           video_games       0.52      0.50      0.51     15068

              accuracy                           0.54     60000
             macro avg       0.54      0.54      0.54     60000
          weighted avg       0.54      0.54      0.54     60000



### 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [None]:
# One-time environment setup; run from a terminal, not inside this cell.
# As a bare line in a multi-line Python cell it is a syntax error and
# prevents the rest of the cell from running:
#     conda install -c anaconda nltk

import pandas as pd

def read_file(corpus):
    """
    Load a JSON Lines file into a pandas DataFrame.

    Parameters
    ----------
    corpus : str or path-like
        Path of the JSONL file; every non-blank line holds one JSON value.

    Returns
    -------
    pandas.DataFrame
        Built from the list of parsed lines, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Open the path that was passed in. Previously the argument was ignored:
    # the file name was hard-coded and the parameter was rebound to the handle.
    with open(corpus, 'r', encoding='utf-8') as handle:
        # Blank lines are skipped rather than handed to json.loads.
        data = [json.loads(line) for line in handle if line.strip()]

    return pd.DataFrame(data)
        
# Build the frame from the JSONL corpus and preview its first ten rows.
corpus_df = read_file(corpus="categorized-comments.jsonl")
corpus_df.head(n=10)

In [215]:
# Second option for loading the file: pandas parses JSON Lines directly
# when lines=True is passed.
import pandas as pd 
corpus_path = "categorized-comments.jsonl"
corpus_df = pd.read_json(corpus_path, lines=True)
corpus_df.tail(n=10)

Unnamed: 0,cat,txt
2347466,video_games,Banned with the account for 5 years. I had my ...
2347467,video_games,"Rare Replay, would love the opportunity to go ..."
2347468,video_games,You've got to put yourself into the mindset th...
2347469,video_games,It helped.. That was what i was looking for. T...
2347470,video_games,They already sell and distribute win32 apps. T...
2347471,video_games,Same here I have over 100 hours of gameplay on...
2347472,video_games,1 might as well take a shot.
2347473,video_games,My comment. Rare replay.
2347474,video_games,Already posted
2347475,video_games,Playstation is a bigger brand and has always h...


In [216]:
# Create function to clean text 

def clean_data(text):
    """
    Reduce a comment to upper-case letters separated by spaces.

    Applies three regular-expression substitutions with the `re` module
    (entity-delimited tag spans, then digits / non-word runs / underscores,
    then anything that is not an ASCII letter) and upper-cases the result.

    NOTE(review): an identical `clean_data` is defined earlier in this
    notebook; this definition rebinds the same name.
    """
    without_tags = re.sub('&lt;/?.*?&gt;', ' &lt;&gt', text)
    without_symbols = re.sub('\\d|\\W+|_', ' ', without_tags)
    letters_only = re.sub('[^a-zA-Z]', " ", without_symbols)

    return letters_only.upper()

In [217]:
# Draw an equally sized random sample from every category, then clean its text.
# NOTE(review): the split shapes recorded further down total 632,000 rows,
# which does not equal 4 x 55,000 — the saved outputs appear to come from a
# run with a different `size`; confirm before quoting the numbers.

size = 55000 # rows drawn per category; must not exceed the smallest category (158246 rows)
replace = False  # without replacement: duplicated rows could land on both sides of the train/test split
SAMPLE_SEED = 1  # fixed seed so the same rows are drawn on every run
fn = lambda obj: obj.sample(n=size, replace=replace, random_state=SAMPLE_SEED) # random rows of one group

category = corpus_df.groupby('cat', as_index=False).apply(fn)

# The full corpus is no longer needed once the sample exists; release it.
# (Re-running this cell therefore requires re-running the load cell first.)
del corpus_df

# Apply clean_data to the txt column, then discard the index left behind by
# groupby().apply() in favour of a plain 0..n-1 range.
category['txt'] = category['txt'].apply(clean_data)
category.reset_index(drop=True, inplace=True)

category.head() # verify function was applied and worked

Unnamed: 0,cat,txt
0,news,WELL THAT S A BIT OF A GENERALIZATION I M IN T...
1,news,KEEP UPVOTING PROPAGANDA REDDIT
2,news,NOTHING WAS ALIVE WHEN WE GOT THERE NOTHING MU...
3,news,NO LYING LIKE CNN AND THE NYT MAKES A PERSON O...
4,news,I THINK THE BY PENETRATION THING IS THE PROBLE...


In [209]:
# Encoding: the category names are replaced by integer class ids
# (LabelEncoder chosen over one-hot encoding).

labelencoder = LabelEncoder()

cat = category["cat"]
encoded_cat = labelencoder.fit_transform(cat)
category["cat"] = encoded_cat  # overwrite the cat column with the ids

# Row count per encoded class.
category.groupby("cat").count()

Unnamed: 0_level_0,txt
cat,Unnamed: 1_level_1
0,55000
1,55000
2,55000
3,55000


In [194]:
# Test/Train Split + Loading Stopwords from NLTK

import nltk

from nltk.corpus import stopwords

# English stop-word list handed to the vectorizer below.
stop_words = stopwords.words('english')

N_FEATURES = 5000  # vocabulary size kept by the vectorizer; also the network's input width
N_CLASSES = 4      # was 1, but the corpus has four categories
N_UNITS = 2500     # NOTE(review): not referenced by the cells visible here

# Bag-of-words counts limited to N_FEATURES terms; terms present in more than
# half of the comments or in fewer than three comments are dropped.
countvectorizer = CountVectorizer(analyzer='word',
                     stop_words=stop_words, 
                     max_features = N_FEATURES,
                     max_df = 0.5,
                     min_df = 3)

# split into input (X) and output (y) variables
X = countvectorizer.fit_transform(category['txt'])

y = category['cat']


# Train/Test split, 70/30, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [195]:
# Dimensions of the training matrix and of both label vectors.
split_shapes = [X_train.shape, y_train.shape, y_test.shape]
print(*split_shapes)

(442400, 5000) (442400,) (189600,)


In [196]:
# Feed-forward classifier: one hidden ReLU layer of 500 units over the
# N_FEATURES-wide input, followed by a 4-unit softmax output layer.
classifier = Sequential([
    Dense(units=500, activation="relu", input_shape=(N_FEATURES,)),
    Dense(units=4, activation="softmax"),
])


In [197]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy
# loss (the labels are integer class ids), accuracy as the tracked metric.
classifier.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [198]:
# Train on the training split: five epochs, mini-batches of 128 rows,
# progress shown per epoch.
classifier.fit(x=X_train, y=y_train, epochs=5, batch_size=128, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1d25ec6850>

In [200]:
# Model architecture, followed by loss and accuracy on the held-out split.

classifier.summary() 

# evaluate() is given X_test / y_test, so the figure is test accuracy; the
# label previously claimed it was the training set.
loss, accuracy = classifier.evaluate(X_test, y_test, verbose=1)
print("ACCURACY (TEST SET): {:.4f}".format(accuracy))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_95 (Dense)             (None, 500)               2500500   
_________________________________________________________________
dense_96 (Dense)             (None, 4)                 2004      
Total params: 2,502,504
Trainable params: 2,502,504
Non-trainable params: 0
_________________________________________________________________
ACCURACY (TRAINING SET): 0.6650


In [201]:
# Predicted class id for every test row, used for accuracy, recall, precision etc.
# predict() returns one probability per class; argmax picks the most likely
# class. This replaces Sequential.predict_classes, which newer Keras /
# TensorFlow releases no longer provide, and gives the same ids on older ones.
y_pred = np.argmax(classifier.predict(X_test), axis=-1)

In [202]:
# Fraction of test rows whose predicted class equals the true class.
test_accuracy = accuracy_score(y_test, y_pred)
print("ACCURACY: ", test_accuracy)

ACCURACY:  0.6650105485232067


In [203]:
# Per-class precision, recall, F1-score and support on the test split.
keras_report = classification_report(y_test, y_pred)
print("REPORT: ", keras_report)

REPORT:                precision    recall  f1-score   support

           0       0.64      0.67      0.65     47201
           1       0.73      0.63      0.68     47448
           2       0.60      0.78      0.68     47629
           3       0.73      0.59      0.65     47322

    accuracy                           0.67    189600
   macro avg       0.68      0.66      0.66    189600
weighted avg       0.68      0.67      0.66    189600



In [204]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
keras_matrix = confusion_matrix(y_test, y_pred)
print("CONFUSION MATRIX: ", keras_matrix)

CONFUSION MATRIX:  [[31489  6245  7259  2208]
 [ 8504 29914  5224  3806]
 [ 5018  1579 36930  4102]
 [ 4027  3334 12208 27753]]


### 3. Classifying Images

In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 to classify MNIST images using a convolutional neural network. Report the accuracy of your results.

In [255]:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

# Tell Keras that image tensors put the colour channel before height and width.
K.set_image_data_format("channels_first")

# Seed NumPy's global random generator.
np.random.seed(0)

# Image geometry: one colour channel, 28 x 28 pixels.
channels, height, width = 1, 28, 28
image_shape = (channels, height, width)

# MNIST images and digit labels, already divided into train and test.
(data_train, target_train), (data_test, target_test) = mnist.load_data()

# Give every image in both splits an explicit channel axis.
data_train = data_train.reshape((data_train.shape[0],) + image_shape)
data_test = data_test.reshape((data_test.shape[0],) + image_shape)

# Divide by 255 so pixel intensities fall between 0 and 1.
features_train = data_train / 255
features_test = data_test / 255

# One-hot encode the digit labels; the width of the encoding is the class count.
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]


In [256]:
# Initiating the convolutional network

network = Sequential()

# Convolutional layer: 64 filters, 5x5 window, ReLU activation function.
# input_shape follows the (channels, height, width) order used when the
# images were reshaped; it was previously written (channels, width, height),
# which only worked because the images are square.
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, height, width),
                   activation='relu'))

# Max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

# Dropout layer with rate 0.5
network.add(Dropout(0.5))

# Flatten the feature maps into one vector per image
network.add(Flatten())

# Fully connected layer of 128 units with ReLU activation
network.add(Dense(128, activation="relu"))

# Second dropout layer with rate 0.5
network.add(Dropout(0.5))

# Output layer: one unit per class, softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

# Compile neural network: cross-entropy loss for the one-hot targets,
# RMSprop optimizer, accuracy as the tracked metric
network.compile(loss="categorical_crossentropy",
                optimizer="rmsprop", 
                metrics=["accuracy"]) 

In [257]:
# Train the network: two epochs, 1000 images per batch, no progress output;
# the test split is passed as validation data for evaluation after each epoch.
network.fit(
    x=features_train,
    y=target_train,
    batch_size=1000,
    epochs=2,
    verbose=0,
    validation_data=(features_test, target_test),
)

<keras.callbacks.History at 0x1c026def50>

In [258]:
# Model architecture, followed by loss and accuracy on the held-out images.
network.summary() 

# evaluate() is given features_test / target_test, so the figure is test
# accuracy; the label previously claimed it was the training set.
loss, accuracy = network.evaluate(features_test, target_test, verbose=1)
print("ACCURACY (TEST SET): {:.4f}".format(accuracy))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 64, 24, 24)        1664      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 64, 12, 12)        0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 64, 12, 12)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_97 (Dense)             (None, 128)               1179776   
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_98 (Dense)             (None, 10)                1290      
Total para