## Assignment 9.3
### Zach Hill
### 15FEB2020
### DSC-550-T302

In [26]:
import numpy as np
import pandas as pd
import os
import nltk
import string
import re

from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus.reader.api import CorpusReader
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from keras import layers, models, optimizers
from keras.preprocessing import text, sequence
from keras.layers import Dense
from keras.models import Sequential

In [None]:
nltk.download()

In [2]:
data = pd.read_json('./categorized-comments.jsonl', lines = True, orient = 'records')

In [3]:
data.describe

<bound method NDFrame.describe of                 cat                                                txt
0            sports  Barely better than Gabbert? He was significant...
1            sports  Fuck the ducks and the Angels! But welcome to ...
2            sports  Should have drafted more WRs.\n\n- Matt Millen...
3            sports            [Done](https://i.imgur.com/2YZ90pm.jpg)
4            sports                                      No!! NOO!!!!!
5            sports  Ding dong the Kaepers gone!!!!!! Yes!!!! Frida...
6            sports  yup\n\nThat would be best case scenario. Still...
7            sports  I think Larry Kruger made a good point on KNBR...
8            sports  This is great to have two well-regarded RB coa...
9            sports                         7-9 next season confirmed.
10           sports  Familiarity with the system is why I have thos...
11           sports                   So basically an Alex Smith deal?
12           sports                        

In [4]:
data.columns

Index(['cat', 'txt'], dtype='object')

In [5]:
data.cat.unique()

array(['sports', 'science_and_technology', 'video_games'], dtype=object)

In [6]:
def clean_txt(text):
    no_url = re.sub(r'http\S+', '', text)
    no_punct = "".join([t for t in no_url if t not in string.punctuation])
    clean = re.sub(r'\n', '', no_punct)
    return clean  

In [7]:
data['clean'] = data['txt'].apply(lambda x: clean_txt(x))

In [8]:
data.head(5)

Unnamed: 0,cat,txt,clean
0,sports,Barely better than Gabbert? He was significant...,Barely better than Gabbert He was significantl...
1,sports,Fuck the ducks and the Angels! But welcome to ...,Fuck the ducks and the Angels But welcome to a...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,Should have drafted more WRs Matt Millen probably
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),Done
4,sports,No!! NOO!!!!!,No NOO


In [9]:
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
data['tokenized'] = data['clean'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [12]:
data.drop(columns = 'clean').head(5)

Unnamed: 0,cat,txt,tokenized
0,sports,Barely better than Gabbert? He was significant...,"[barely, better, than, gabbert, he, was, signi..."
1,sports,Fuck the ducks and the Angels! But welcome to ...,"[fuck, the, ducks, and, the, angels, but, welc..."
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,"[should, have, drafted, more, wrs, matt, mille..."
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),[done]
4,sports,No!! NOO!!!!!,"[no, noo]"


In [13]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [14]:
data['unstopped'] = data['tokenized'].apply(lambda x: remove_stopwords(x))

In [16]:
data.drop(columns = ['clean','tokenized']).head(5)

Unnamed: 0,cat,txt,unstopped
0,sports,Barely better than Gabbert? He was significant...,"[barely, better, gabbert, significantly, bette..."
1,sports,Fuck the ducks and the Angels! But welcome to ...,"[fuck, ducks, angels, welcome, new, niners, fans]"
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,"[drafted, wrs, matt, millen, probably]"
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),[done]
4,sports,No!! NOO!!!!!,[noo]


In [18]:
data.to_csv('data_clean.csv')

In [3]:
df = pd.read_csv('./data_clean.csv')

In [4]:
df.head(5)

Unnamed: 0,cat,unstopped
0,sports,"['barely', 'better', 'gabbert', 'significantly..."
1,sports,"['fuck', 'ducks', 'angels', 'welcome', 'new', ..."
2,sports,"['drafted', 'wrs', 'matt', 'millen', 'probably']"
3,sports,['done']
4,sports,['noo']


In [5]:
df.columns = ['label', 'text']

In [6]:
df.head(5)

Unnamed: 0,label,text
0,sports,"['barely', 'better', 'gabbert', 'significantly..."
1,sports,"['fuck', 'ducks', 'angels', 'welcome', 'new', ..."
2,sports,"['drafted', 'wrs', 'matt', 'millen', 'probably']"
3,sports,['done']
4,sports,['noo']


In [7]:
vectorizer = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}')

In [8]:
X = vectorizer.fit_transform(df['text'])
y = df['label']

In [9]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(X, y, test_size = .25)

In [10]:
print(str(train_x.shape))
print(str(test_x.shape))
print(str(train_y.shape))
print(str(test_y.shape))

(454857, 276206)
(151619, 276206)
(454857,)
(151619,)


In [11]:
train_y.head(5)

425840    video_games
30519          sports
181997    video_games
279591    video_games
560886    video_games
Name: label, dtype: object

In [12]:
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [13]:
len(train_y), len(test_y)

(454857, 151619)

## scikit-learn

In [34]:
mlp = MLPClassifier(hidden_layer_sizes = (10, 10, 10), max_iter = 1)
mlp.fit(train_x, train_y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [35]:
prediction = mlp.predict(test_x)

In [36]:
print("Accuracy: \n", accuracy_score(test_y, prediction))

Accuracy: 
 0.8625831854846688


In [37]:
print("Classification Report: \n", classification_report(test_y,prediction))

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.54      0.67      6240
           1       0.85      0.63      0.72     36701
           2       0.87      0.96      0.91    108678

    accuracy                           0.86    151619
   macro avg       0.86      0.71      0.77    151619
weighted avg       0.86      0.86      0.85    151619



In [38]:
print("Confusion Matrix: \n", confusion_matrix(test_y, prediction))

Confusion Matrix: 
 [[  3382    119   2739]
 [    25  23158  13518]
 [   515   3919 104244]]


## Keras

In [15]:
classifier = Sequential()

classifier.add(Dense(units = 100,activation = "relu", input_shape = (train_x.shape[1],)))
classifier.add(Dense(units = 50, activation = "relu"))
classifier.add(Dense(units = 10, activation = "softmax"))

classifier.compile(optimizer = "rmsprop", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [19]:
classifier.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 500)               138103500 
_________________________________________________________________
dense_5 (Dense)              (None, 50)                25050     
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 204       
Total params: 138,128,754
Trainable params: 138,128,754
Non-trainable params: 0
_________________________________________________________________


In [17]:
classifier.fit(train_x, train_y, batch_size = 2000, epochs = 1, verbose = 1)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x1dcdac4bda0>

In [20]:
loss, accuracy = classifier.evaluate(test_x, test_y)



[0.34976132414870764, 0.8619434237480164]

In [None]:
print("Loss: ", loss)
print("\nAccuracy: ", accuracy)

In [21]:
prediction = classifier.predict_classes(test_x)

In [31]:
print("Accuracy: \n", accuracy_score(test_y, prediction))

Accuracy: 
 0.861943423977206


In [29]:
print("Classification Report: \n", classification_report(test_y,prediction))

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.52      0.66      6240
           1       0.89      0.59      0.71     36701
           2       0.86      0.97      0.91    108678

    accuracy                           0.86    151619
   macro avg       0.88      0.70      0.76    151619
weighted avg       0.87      0.86      0.85    151619



In [30]:
print("Confusion Matrix: \n", confusion_matrix(test_y, prediction))

Confusion Matrix: 
 [[  3274     85   2881]
 [    31  21772  14898]
 [   399   2638 105641]]


## Image Classification

In [50]:
import numpy as np
import tensorflow as tf

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

In [61]:
# Set that the color channel value will be first
K.set_image_data_format("channels_first")

# Set seed
np.random.seed(0)

# Set image information
channels = 1
height = 28
width = 28

# Load data and target from MNIST data
(data_train, target_train), (data_test, target_test) = mnist.load_data()

# Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], channels, height, width)

# Reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

# Rescale pixel intensity to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255

# One-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)

number_of_classes = target_test.shape[1]

# Start neural network
network = Sequential()

### Image Classification could not proceed with the books method. There is likely an issue with version in tensorflow, as indicated below, but I could not find a resolution after more than 72 hours of troubleshooting. Due to the extremely long processing time of model fitting, I do not have any more time to trouble shoot.

In [69]:
# Add convolutional layer with 64 filters, a 5x5 window, and ReLU activation function
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, width, height),
                   activation='relu'))

# Add max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

# Add dropout layer
network.add(Dropout(0.5))

# Add layer to flatten input
network.add(Flatten())

# # Add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128, activation="relu"))

# Add dropout layer
network.add(Dropout(0.5))

# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

AttributeError: module 'tensorflow_core._api.v2.config' has no attribute 'experimental_list_devices'

In [59]:
# Train neural network
network.fit(features_train, # Features
    target_train, # Target
    epochs=2, # Number of epochs
    verbose=0, # Don't print description after each epoch
    batch_size=1000, # Number of observations per batch
    validation_data=(features_test, target_test)) # Data for evaluation

RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.

In [55]:
tf.config.experimental.list_physical_devices(device_type=None)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]