In [3]:
import pandas as pd
import numpy as np
import unicodedata
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# 1. Neural Network Classifier with Scikit

In [4]:
#Importing json file into a dataframe
link = r'C:\Users\nickm\Documents\categorized-comments.jsonl'
data = pd.read_json(link,lines=True)

In [5]:
data.head(20)

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!
5,sports,Ding dong the Kaepers gone!!!!!! Yes!!!! Frida...
6,sports,yup\n\nThat would be best case scenario. Still...
7,sports,I think Larry Kruger made a good point on KNBR...
8,sports,This is great to have two well-regarded RB coa...
9,sports,7-9 next season confirmed.


In [6]:
data.cat.value_counts()

video_games               435542
sports                    145823
science_and_technology     25111
Name: cat, dtype: int64

In [7]:
sampledata1= data.sample(frac=0.01)

In [8]:
sampledata1.cat.value_counts()

video_games               4313
sports                    1493
science_and_technology     259
Name: cat, dtype: int64

In [9]:
#convert all text into lowercase letters
sampledata1['txt'] = sampledata1['txt'].str.lower()

In [10]:
sampledata1.head()

Unnamed: 0,cat,txt
470375,video_games,"they'll never re-release the original though, ..."
487745,video_games,it doesn't do shit unless you get like 50
308158,video_games,i remember when that game first dropped. yeah ...
334731,video_games,"totally understandable man, it just annoys me ..."
390021,video_games,i don't doubt it. but citing their $$$ as a re...


In [11]:
#creating punctuation dictionary
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P'))

In [12]:
#removing punctuation for each string in the data frame
sampledata1['txt']=[string.translate(punctuation) for string in sampledata1['txt']]

In [13]:
sampledata1.head(10)

Unnamed: 0,cat,txt
470375,video_games,theyll never rerelease the original though bec...
487745,video_games,it doesnt do shit unless you get like 50
308158,video_games,i remember when that game first dropped yeah d...
334731,video_games,totally understandable man it just annoys me w...
390021,video_games,i dont doubt it but citing their $$$ as a reas...
513493,video_games,yes it does because the default game mode gaur...
504568,video_games,no problem
104868,sports,oh my god that tyetie joke was bad even by the...
223494,sports,your submission has been automatically removed...
117967,video_games,you win this comment section


In [14]:
#importing libs
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf = TfidfVectorizer(max_features=1000)

In [16]:
#applying tfidf
feature_matrix = tfidf.fit_transform(sampledata1.txt)

In [17]:
#term frequency-inverse document frequency components 
tfidf.vocabulary_

{'theyll': 861,
 'never': 566,
 'the': 852,
 'original': 602,
 'though': 872,
 'because': 90,
 'of': 582,
 'it': 433,
 'doesnt': 229,
 'do': 227,
 'shit': 755,
 'unless': 907,
 'you': 990,
 'get': 333,
 'like': 468,
 '50': 15,
 'remember': 704,
 'when': 947,
 'that': 850,
 'game': 327,
 'first': 302,
 'dropped': 240,
 'yeah': 984,
 'didnt': 219,
 'hold': 397,
 'for': 308,
 'too': 887,
 'long': 480,
 'but': 123,
 'had': 357,
 'some': 781,
 'fun': 323,
 'with': 961,
 'totally': 890,
 'man': 505,
 'just': 442,
 'me': 521,
 'see': 738,
 'people': 625,
 'saying': 728,
 'they': 860,
 'cant': 132,
 'over': 610,
 'its': 435,
 'what': 944,
 'hell': 382,
 'are': 61,
 'doing': 230,
 'could': 189,
 'in': 421,
 'week': 937,
 'easily': 249,
 'be': 88,
 'about': 20,
 'dont': 232,
 'doubt': 235,
 'their': 853,
 'as': 67,
 'reason': 694,
 'why': 954,
 'all': 35,
 'must': 555,
 'good': 346,
 'is': 429,
 'theyve': 863,
 'even': 265,
 'those': 871,
 'can': 131,
 'make': 502,
 'big': 102,
 'yes': 988,
 'do

In [18]:
data_model_x = feature_matrix

In [19]:
data_model_x.shape

(6065, 1000)

In [20]:
data_model_y = sampledata1.cat

In [21]:
data_model_y.shape

(6065,)

In [22]:
from sklearn.model_selection import train_test_split

# split the data
X_train, X_val, y_train, y_val = train_test_split(data_model_x, data_model_y, test_size =0.3, random_state=123)

In [23]:
# number of samples in each set
print("No. of samples in training set: ", X_train.shape[0])
print("No. of samples in validation set:", X_val.shape[0])

No. of samples in training set:  4245
No. of samples in validation set: 1820


In [24]:
# Survived and not-survived
print('\n')
print('No. of Controversial and Not_Controversial in the training set:')
print(y_train.value_counts())

print('\n')
print('No. of Controversial and Not_Controversial in the validation set:')
print(y_val.value_counts())



No. of Controversial and Not_Controversial in the training set:
video_games               3032
sports                    1044
science_and_technology     169
Name: cat, dtype: int64


No. of Controversial and Not_Controversial in the validation set:
video_games               1281
sports                     449
science_and_technology      90
Name: cat, dtype: int64


In [25]:
from sklearn.neural_network import MLPClassifier

In [26]:
mlp = MLPClassifier(hidden_layer_sizes=[500,150], verbose=True)

In [27]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler(with_mean=False)
# Fit only to the training data
#scaler.fit(X_train)

In [28]:
#X_train = scaler.transform(X_train)
#X_val = scaler.transform(X_val)

In [29]:
 mlp.fit(X_train,y_train)

Iteration 1, loss = 0.82512548
Iteration 2, loss = 0.67314147
Iteration 3, loss = 0.59130945
Iteration 4, loss = 0.49340592
Iteration 5, loss = 0.41413721
Iteration 6, loss = 0.34246580
Iteration 7, loss = 0.27009692
Iteration 8, loss = 0.20802696
Iteration 9, loss = 0.15805502
Iteration 10, loss = 0.12709325
Iteration 11, loss = 0.10733868
Iteration 12, loss = 0.09795267
Iteration 13, loss = 0.09204115
Iteration 14, loss = 0.09227093
Iteration 15, loss = 0.08780815
Iteration 16, loss = 0.08420327
Iteration 17, loss = 0.08325661
Iteration 18, loss = 0.08255779
Iteration 19, loss = 0.08270707
Iteration 20, loss = 0.08552053
Iteration 21, loss = 0.08259642
Iteration 22, loss = 0.08144617
Iteration 23, loss = 0.08269969
Iteration 24, loss = 0.08094393
Iteration 25, loss = 0.08035609
Iteration 26, loss = 0.07856357
Iteration 27, loss = 0.07894608
Iteration 28, loss = 0.08289879
Iteration 29, loss = 0.08054147
Iteration 30, loss = 0.08001262
Iteration 31, loss = 0.08171348
Iteration 32, los

MLPClassifier(hidden_layer_sizes=[500, 150], verbose=True)

In [30]:
predictions = mlp.predict(X_val)

In [31]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_val,predictions))

[[  14    7   69]
 [   3  159  287]
 [  12  114 1155]]


In [32]:
print(classification_report(y_val,predictions))

                        precision    recall  f1-score   support

science_and_technology       0.48      0.16      0.24        90
                sports       0.57      0.35      0.44       449
           video_games       0.76      0.90      0.83      1281

              accuracy                           0.73      1820
             macro avg       0.61      0.47      0.50      1820
          weighted avg       0.70      0.73      0.70      1820



# 2. Neural Network Classifier with Keras

In [33]:
#Importing json file into a dataframe
link = r'C:\Users\nickm\Documents\categorized-comments.jsonl'
datak = pd.read_json(link, lines=True)

In [34]:
datak.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [35]:
sampledata2= datak.sample(frac=0.10)

In [36]:
sampledata2.head()

Unnamed: 0,cat,txt
56402,sports,[deleted]
107337,sports,It's not the mistake of the franchise owners b...
350003,video_games,To enter giveaways you need either 20 comment ...
59034,sports,we are actually about half and half the last 7...
100995,sports,Renshaw knows about Ashwin's 14th over so he j...


In [37]:
#convert all text into lowercase letters
sampledata2['txt'] = sampledata2['txt'].str.lower()

In [83]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [39]:
#creating punctuation dictionary
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P'))

In [40]:
#removing punctuation for each string in the data frame
sampledata2['txt']=[string.translate(punctuation) for string in sampledata2['txt']]

In [41]:
sampledata2.head(10)

Unnamed: 0,cat,txt
56402,sports,deleted
107337,sports,its not the mistake of the franchise owners bu...
350003,video_games,to enter giveaways you need either 20 comment ...
59034,sports,we are actually about half and half the last 7...
100995,sports,renshaw knows about ashwins 14th over so he ju...
308796,video_games,very few games have ever achieved that kind of...
349031,video_games,it has its learning curve for sure not saying ...
34317,sports,yeah hopefully thats why he has so few offers
528816,video_games,thats one at least thanks
174660,video_games,those talents are better than average there ar...


In [42]:
#importing stopwords list
from nltk.corpus import stopwords

In [43]:
#importing tokenization libs
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk

In [44]:
#tokenizing newtext data
sampledata2['txt'] = [word_tokenize(word) for word in sampledata2['txt']]

In [45]:
sampledata2.head()

Unnamed: 0,cat,txt
56402,sports,[deleted]
107337,sports,"[its, not, the, mistake, of, the, franchise, o..."
350003,video_games,"[to, enter, giveaways, you, need, either, 20, ..."
59034,sports,"[we, are, actually, about, half, and, half, th..."
100995,sports,"[renshaw, knows, about, ashwins, 14th, over, s..."


In [46]:
stop_words=stopwords.words('english')

In [47]:
#function to remove stopwords
def remove_stopwords(sent):
    return [word for word in sent if word not in stop_words]

In [48]:
#applying remove_stopwords function
sampledata2['txt'] = sampledata2.txt.apply(remove_stopwords)

In [49]:
sampledata2.head()

Unnamed: 0,cat,txt
56402,sports,[deleted]
107337,sports,"[mistake, franchise, owners, media, company, f..."
350003,video_games,"[enter, giveaways, need, either, 20, comment, ..."
59034,sports,"[actually, half, half, last, 7, years, overper..."
100995,sports,"[renshaw, knows, ashwins, 14th, shit, pants]"


In [50]:
sampledata2.cat.value_counts()

video_games               43479
sports                    14626
science_and_technology     2543
Name: cat, dtype: int64

In [51]:
sampledata2['txt'] = sampledata2['txt'].apply(lambda text: ' '.join(text))

In [52]:
sampledata2.head()

Unnamed: 0,cat,txt
56402,sports,deleted
107337,sports,mistake franchise owners media company falsely...
350003,video_games,enter giveaways need either 20 comment karma 2...
59034,sports,actually half half last 7 years overperforming...
100995,sports,renshaw knows ashwins 14th shit pants


In [53]:
#importing libs
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
tfidf = TfidfVectorizer(max_features=3000)

In [55]:
#seperating out newtext data and applying a string function
#tag_data = sampledata.txt.apply(str)

In [56]:
#applying tfidf
feature_matrix2 = tfidf.fit_transform(sampledata2.txt)

In [57]:
tfidf.vocabulary_

{'deleted': 716,
 'mistake': 1670,
 'franchise': 1046,
 'media': 1609,
 'company': 561,
 'reported': 2164,
 'name': 1720,
 'place': 1919,
 'guy': 1168,
 'read': 2099,
 'news': 1749,
 'reports': 2166,
 'assumed': 231,
 'one': 1807,
 'enter': 867,
 'giveaways': 1108,
 'need': 1734,
 'either': 840,
 '20': 21,
 'comment': 552,
 'karma': 1403,
 'total': 2717,
 'comments': 553,
 'past': 1868,
 'month': 1693,
 'please': 1938,
 'active': 100,
 'entry': 870,
 'removed': 2155,
 'actually': 103,
 'half': 1175,
 'last': 1446,
 'years': 2980,
 'knows': 1434,
 'shit': 2343,
 'games': 1078,
 'ever': 888,
 'kind': 1421,
 'mark': 1583,
 'couple': 638,
 'think': 2659,
 'top': 2713,
 'head': 1203,
 'star': 2491,
 'course': 639,
 'ridiculous': 2196,
 'dont': 784,
 'get': 1096,
 'hey': 1235,
 'guess': 1162,
 'beat': 300,
 'go': 1117,
 'bowling': 370,
 'stuff': 2545,
 'night': 1756,
 'learning': 1463,
 'sure': 2580,
 'saying': 2260,
 'madden': 1560,
 'doesnt': 780,
 'feel': 968,
 'playing': 1933,
 'casual':

In [58]:
feature_matrix2.shape

(60648, 3000)

In [59]:
data_model_x2 = feature_matrix2

In [60]:
data_model_x2.shape

(60648, 3000)

In [61]:
data_model_y2 = sampledata2.cat

In [62]:
data_model_y2

56402                     sports
107337                    sports
350003               video_games
59034                     sports
100995                    sports
                   ...          
109719                    sports
16643     science_and_technology
231217                    sports
345522               video_games
567758               video_games
Name: cat, Length: 60648, dtype: object

In [68]:
data_model_y2.shape

(60648,)

In [202]:
from sklearn.preprocessing import OneHotEncoder


In [71]:
from sklearn.model_selection import train_test_split


# split the data

X_train2, X_val2, y_train2, y_val2 = train_test_split(data_model_x2, data_model_y2, test_size =0.3, random_state=5)


In [73]:
#checking shape
y_train2_onehot.shape

(60648, 3)

In [70]:
#checking shape
data_model_y2.shape

(60648,)

In [67]:
#checking shape
data_model_x2.shape

(60648, 3000)

In [74]:
#one hot encoding label values
y_train2_onehot = pd.get_dummies(y_train2)

In [75]:
#one hot encoding label values
y_val2_onehot= pd.get_dummies(y_val2)

In [76]:
#Initializing Neural Network
classifier = Sequential()

In [77]:
# Adding the input layer and the first hidden layer
classifier.add(Dense( 500, activation = 'relu', input_dim = 3000))
# Adding the second hidden layer
classifier.add(Dense( 150, activation = 'relu'))
# Adding the output layer
classifier.add(Dense(3, activation = 'softmax'))

# Compiling Neural Network
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [80]:
#checking shape
X_train2.shape

(42453, 3000)

In [78]:
#checking shape
y_train2_onehot.shape

(42453, 3)

In [86]:
#using todense feature to handle 0 values
X_train2= X_train2.todense()

In [87]:
# Fitting our model 
classifier.fit(X_train2, y_train2_onehot, batch_size = 128, epochs = 200, shuffle =True)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x290ad667f40>

# 3. Classifying Images

In [122]:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

In [123]:
#Set that the color channel value will be first
K.set_image_data_format("channels_last")

In [124]:
#Set Seed
np.random.seed(0)

In [125]:
#Set Image information
channels = 1
height = 28
width = 28

In [126]:
#Load data and target from MNIST data
(data_train, target_train), (data_test, target_test)= mnist.load_data()

In [127]:
data_train.shape

(60000, 28, 28)

In [128]:
target_train.shape

(60000,)

In [129]:
data_test.shape

(10000, 28, 28)

In [130]:
target_test.shape

(10000,)

In [131]:
#Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], height, width, channels)

In [132]:
#reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], height, width, channels)

In [133]:
#Rescale pixel intensity to between 0 and 1
features_train = data_train/255
features_test = data_test/255

In [134]:
#one-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

In [138]:
target_test.shape

(10000, 10)

In [None]:
target_train.shape

In [135]:
#Start neural network
network = Sequential()

In [136]:
#Add convolutional layer with 64 filters, a 5x5 window, and ReLu activation function
network.add(Conv2D(filters=64,
                    kernel_size=(5, 5),
                    input_shape=( height, width, channels),
                      activation='relu',
                     data_format='channels_last' ))
                    

In [139]:
#Add max pooling layer with 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

In [140]:
#Add dropout layer
network.add(Dropout(0.5))

In [141]:
#Add layer to flatten input
network.add(Flatten())

In [142]:
#Add fully connected layer of 128 units witha ReLU activation function
network.add(Dense(128, activation="relu"))

In [143]:
#Add dropout layer
network.add(Dropout(0.5))

In [144]:
#Aff fully connected layer with a softmax activation function 
network.add(Dense(number_of_classes, activation='softmax'))

In [145]:
#compile neural network
network.compile(loss='categorical_crossentropy',
                   optimizer = 'rmsprop',
                   metrics=['accuracy'])

In [146]:
#Train neural network
network.fit(features_train,
            target_train,
            epochs=2,
            verbose=1,
            batch_size=1000,
            validation_data=(features_test, target_test))


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2909985be20>