#   File: DSC550 Curry Week9/10 MidTerm.py
#   Name: Adam Curry
#   Date: 05/13/2020
#   Course: DSC540 - Data Mining
#   Desc: This program is my first neural network 
#   Usage: This program should be used when reviewing week 9/10

In [2]:
import nltk
import pandas as pd

# load the categorized comments; lines=True reads the file as JSON Lines (one record per line)
path = r"C:\Users\adamp\OneDrive\Desktop\a_data_mining\Week9\categorized-comments.jsonl"
df = pd.read_json(path, lines=True,  encoding="utf8")

### Apply text cleanup and normalization

#### apply stemming, remove punctuation, lower case

In [3]:
import string
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer instance, created once at module level
stemmer = SnowballStemmer(language='english')

def remove_punctuations(text):
    """
    Strip ASCII punctuation from text and lower-case the result.

    Every character in string.punctuation is deleted (not replaced by a
    space), so words joined only by punctuation end up fused together.

    No stemming is performed. The previous implementation called the
    stemmer on the whole string once per punctuation character and
    discarded the result, so the returned text was never stemmed; that
    no-op call has been dropped. The returned value is unchanged.

    :param text: the raw text to clean (str)
    :return: the text without punctuation, in lower case
    """
    # a single translate pass replaces the 32 chained replace()/lower() calls
    return text.translate(str.maketrans('', '', string.punctuation)).lower()

# re-wrap the frame, then derive the cleaned 'txt_no_punk' column from the raw 'txt' column
df = pd.DataFrame(df)
df = df.assign(txt_no_punk=df['txt'].apply(remove_punctuations))

#### apply lemmatization

In [4]:
from nltk.corpus import wordnet

# shared WordNet lemmatizer instance (read as a global by lemmatize_sentence)
lemmatizer = nltk.stem.WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """
    translate an nltk part-of-speech tag into the matching WordNet
    part-of-speech constant, based on the prefix of the tag:
    J -> ADJ, V -> VERB, N -> NOUN, R -> ADV.
    returns None when the tag starts with none of those letters
    """
    prefix_to_pos_name = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, pos_name in prefix_to_pos_name:
        if nltk_tag.startswith(prefix):
            # the constant is looked up only for the matching prefix
            return getattr(wordnet, pos_name)
    return None

def lemmatize_sentence(sentence):
    """
    Lemmatize every token of a sentence using its part-of-speech tag.

    The sentence is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag; each tag is converted by nltk_tag_to_wordnet_tag.
    Tokens whose tag has no WordNet equivalent are kept unchanged, all
    others are passed to the module-level lemmatizer together with
    their converted tag.

    The intermediate (token, tag) pairs are kept local; the function no
    longer publishes them through a module-level global.

    :param sentence: the text to lemmatize (str)
    :return: the processed tokens joined by single spaces
    """
    # tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmatized_sentence = []
    for word, nltk_tag in nltk_tagged:
        tag = nltk_tag_to_wordnet_tag(nltk_tag)
        # no usable tag: keep the token as is; otherwise lemmatize with the tag
        lemmatized_sentence.append(
            word if tag is None else lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

# lemmatize the cleaned text into a new 'Lemmatized' column
df['Lemmatized'] = df['txt_no_punk'].apply(lemmatize_sentence)
#df = df[df['Lemmatized'].apply(lambda x: len(x) > 10)]
print(df.head())

      cat                                                txt  \
0  sports  Barely better than Gabbert? He was significant...   
1  sports  Fuck the ducks and the Angels! But welcome to ...   
2  sports  Should have drafted more WRs.\n\n- Matt Millen...   
3  sports            [Done](https://i.imgur.com/2YZ90pm.jpg)   
4  sports                                      No!! NOO!!!!!   

                                         txt_no_punk  \
0  barely better than gabbert he was significantl...   
1  fuck the ducks and the angels but welcome to a...   
2  should have drafted more wrs\n\n matt millen p...   
3                       donehttpsiimgurcom2yz90pmjpg   
4                                             no noo   

                                          Lemmatized  
0  barely good than gabbert he be significantly w...  
1  fuck the duck and the angel but welcome to all...  
2    should have draft more wrs matt millen probably  
3                       donehttpsiimgurcom2yz90pmjpg  
4  

#### apply vectorization TFIDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# build TF-IDF features from the lemmatized text: English stop words removed,
# at most 5000 features kept
tfidf = TfidfVectorizer(stop_words='english',max_features=5000) 
dtm = tfidf.fit_transform(df['Lemmatized'])
# reuse the matrix computed above instead of fitting and transforming the whole
# corpus a second time; the second pass produced the same values
# NOTE(review): dtm is a whole sparse matrix rather than per-row data, and this
# column is not read again in the visible code - confirm the column is needed
df['tfidf'] = dtm

print(dtm)

  (0, 238)	0.23067614456683855
  (0, 3722)	0.18828809309456476
  (0, 384)	0.18254620603252011
  (0, 867)	0.1558602661088407
  (0, 4838)	0.20209144726532718
  (0, 1546)	0.14621609845316993
  (0, 3737)	0.22108733722010077
  (0, 3335)	0.19923774811481318
  (0, 2375)	0.2275824328298333
  (0, 2561)	0.15686305162093123
  (0, 2408)	0.07801989787589841
  (0, 803)	0.19212057462730373
  (0, 1375)	0.16151263064204519
  (0, 2503)	0.1570278428419804
  (0, 813)	0.1918818643682858
  (0, 3028)	0.1954816178393946
  (0, 4341)	0.21337169172815756
  (0, 515)	0.11606863336046308
  (0, 4391)	0.21927493939510231
  (0, 3740)	0.22355167978193927
  (0, 2546)	0.08113318867721633
  (0, 1998)	0.12524034164209039
  (0, 300)	0.18273186363270963
  (0, 3538)	0.18134205056805425
  (0, 3155)	0.23223089618308873
  :	:
  (606472, 1847)	0.10044998295732523
  (606472, 960)	0.14200574304339786
  (606472, 3562)	0.12678625065868954
  (606472, 4455)	0.1368752365056512
  (606472, 4456)	0.2305557926120909
  (606472, 2408)	0.09912

#### split the data

In [6]:
from sklearn.model_selection import train_test_split

# features are the TF-IDF matrix, the target is the comment category
X = dtm
y = df['cat']

# hold out 30% of the rows for testing; fixed random_state for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=40)
print(X_train.shape)
print(X_test.shape)

(424533, 5000)
(181943, 5000)


#### 1. Neural Network Classifier with Scikit - Fit multi classifier and predict

In [7]:
from sklearn.neural_network import MLPClassifier

# three hidden layers with 5, 10 and 20 units respectively
# ReLU as the activation function
# adam - refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
mlp = MLPClassifier(
    hidden_layer_sizes=(5, 10, 20),
    activation='relu',
    solver='adam',
    max_iter=500,
)
mlp.fit(X_train, y_train)

# predict on both partitions so train and test performance can be compared
predict_train, predict_test = [mlp.predict(part) for part in (X_train, X_test)]

#### show the accuracy metrics

In [8]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score

# print the confusion matrix and classification report: training split first, then test split
for actual, predicted in ((y_train, predict_train), (y_test, predict_test)):
    print(confusion_matrix(actual, predicted))
    print(classification_report(actual, predicted))

[[ 10525    249   6905]
 [    86  63347  38506]
 [  1294   7687 295934]]
                        precision    recall  f1-score   support

science_and_technology       0.88      0.60      0.71     17679
                sports       0.89      0.62      0.73    101939
           video_games       0.87      0.97      0.92    304915

             micro avg       0.87      0.87      0.87    424533
             macro avg       0.88      0.73      0.79    424533
          weighted avg       0.87      0.87      0.86    424533

[[  3735    238   3459]
 [   138  24196  19550]
 [  1111   6137 123379]]
                        precision    recall  f1-score   support

science_and_technology       0.75      0.50      0.60      7432
                sports       0.79      0.55      0.65     43884
           video_games       0.84      0.94      0.89    130627

             micro avg       0.83      0.83      0.83    181943
             macro avg       0.79      0.67      0.71    181943
          weighte

#### 2. Neural Network Classifier with Keras

In [1]:
# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 

Using TensorFlow backend.


#### convert categorical data to numeric

In [7]:
from sklearn.preprocessing import LabelEncoder

# learn the category labels, then replace them with their integer codes
lb_make = LabelEncoder()
lb_make.fit(y)
y = lb_make.transform(y)
y

array([1, 1, 1, ..., 2, 2, 2])

#### Split the data

In [8]:
from sklearn.model_selection import train_test_split

# split again now that y holds the encoded labels (same test_size and random_state as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=40)
print(X_train.shape)
print(X_test.shape)

(424533, 5000)
(181943, 5000)


#### Load the model

In [9]:
# one-hot encode the integer targets
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

# fully connected stack: 5000 inputs -> 500 -> 100 -> 50 -> 3-unit softmax output
model = Sequential([
    Dense(500, activation='relu', input_dim=5000),
    Dense(100, activation='relu'),
    Dense(50, activation='relu'),
    Dense(3, activation='softmax'),
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

#### fit the model

In [11]:
# train the model for a single epoch; kmodel keeps the value returned by fit()
kmodel = model.fit(X_train, y_train, epochs=1)

Epoch 1/1


#### evaluate the model

In [12]:
# score the training split; scores[1] is the 'accuracy' metric requested in compile()
pred_train= model.predict(X_train)
scores = model.evaluate(X_train, y_train, verbose=0)
# accuracy is a 0-1 fraction: format it as a percentage instead of appending '%' to the raw value
print('Accuracy on training data: {:.2%} \n Error on training data: {}'.format(scores[1], 1 - scores[1]))

# same report for the held-out test split
pred_test= model.predict(X_test)
scores2 = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test data: {:.2%} \n Error on test data: {}'.format(scores2[1], 1 - scores2[1]))

Accuracy on training data: 0.8602888584136963% 
 Error on training data: 0.1397111415863037
Accuracy on test data: 0.8437477946281433% 
 Error on test data: 0.1562522053718567


#### 3. Classifying Images

In [14]:
import pandas as pd, numpy as np, json, re, pickle
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

In [16]:
# use the 'channels_first' image data format, i.e. the color channel axis comes first
K.set_image_data_format('channels_first')

In [17]:
# seed NumPy's random number generator for consistency between runs
np.random.seed(40)

In [18]:
# image parameters: one color channel, 28 x 28 images
channels, height, width = 1, 28, 28

In [20]:
# load the data and target from the MNIST dataset (train and test portions)
(data_train, target_train),(data_test,target_test) = mnist.load_data()

In [21]:
# give the training images an explicit channel axis: (samples, channels, height, width)
data_train = data_train.reshape((data_train.shape[0], channels, height, width))

In [22]:
# give the test images an explicit channel axis: (samples, channels, height, width)
data_test = data_test.reshape((data_test.shape[0], channels, height, width))

In [23]:
# divide by 255 so the pixel intensities fall between 0 and 1
features_train, features_test = data_train / 255, data_test / 255

In [24]:
# one-hot encode the train and test targets
target_train, target_test = (
    np_utils.to_categorical(target_train),
    np_utils.to_categorical(target_test),
)

In [26]:
# the number of classes is the width of the one-hot encoded target (its second dimension)
number_of_classes = target_test.shape[1]
print(number_of_classes)

10


In [30]:
# thanks to Sam from the discussion for this. This fixes an error in the following step
import keras.backend.tensorflow_backend as tfback
import tensorflow as tf
# Versions of tensorflow and keras seem incompatible with each other
def _get_available_gpus():
    """
    return the names of the devices cached on the Keras TensorFlow
    backend whose name contains 'device:gpu' (case-insensitive).
    when the cache (tfback._LOCAL_DEVICES) is still None it is first
    filled with the names from tf.config.list_logical_devices()
    """
    if tfback._LOCAL_DEVICES is None:
        tfback._LOCAL_DEVICES = [
            device.name for device in tf.config.list_logical_devices()
        ]

    gpu_names = []
    for device_name in tfback._LOCAL_DEVICES:
        if 'device:gpu' in device_name.lower():
            gpu_names.append(device_name)
    return gpu_names

 
# replace the backend module's _get_available_gpus with the function defined in this cell
tfback._get_available_gpus = _get_available_gpus

In [31]:
# begin neural network
network = Sequential()

# Add convolutional layer with 64 filters, a 5x5 window and ReLU activation function.
# The input shape is given as (channels, height, width): the same axis order the
# image arrays were reshaped to, as required by the 'channels_first' data format.
# (height and width were swapped before, which only worked because both are 28)
network.add(Conv2D(filters=64,
                   kernel_size=(5,5),
                   input_shape=(channels,height,width),
                   activation='relu'))

In [32]:
# add max pooling layer with a 2x2 window
# (the variable was misspelled 'netowrk' here, which raises NameError)
network.add(MaxPooling2D(pool_size=(2,2)))

# add dropout layer with a rate of 0.5
network.add(Dropout(0.5))

# add layer to flatten input
network.add(Flatten())

In [33]:
# add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128,activation='relu'))

In [34]:
# add another dropout layer with a rate of 0.5
network.add(Dropout(0.5))

In [35]:
# add fully connected output layer (one unit per class) with a softmax activation function
network.add(Dense(number_of_classes,activation='softmax'))

In [38]:
# compile the network: categorical cross-entropy loss,
# RMSprop (root mean square propagation) optimizer, accuracy as the performance metric
network.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# train the model: 2 epochs, 1000 observations per batch,
# with the test features/targets supplied as validation data
network.fit(
    x=features_train,
    y=target_train,
    batch_size=1000,
    epochs=2,
    validation_data=(features_test, target_test),
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1a4fe779e80>

In [42]:
# evaluate on the test images; scores[1] is the 'accuracy' metric requested in compile()
scores = network.evaluate(features_test, target_test, verbose=0)
# the data scored here is the test set, so the message says "test" (it used to say
# "training"); the 0-1 accuracy fraction is formatted as a percentage
print('Accuracy on test data: {:.2%} \n Error on test data: {}'.format(scores[1], 1 - scores[1]))

Accuracy on training data: 0.9761999845504761% 
 Error on training data: 0.023800015449523926
