In [24]:
import pandas as pd
from pandasql import sqldf
import boto3
import json
import os
# python -m spacy download en
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from keras import models, layers
# from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import LabelBinarizer
# import numpy as np
from keras.utils import to_categorical
# from keras.models import Sequential
# from keras.layers import Dense
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Read the AWS credentials from the local config file that sits one level above the notebook.
with open('../boto-config.json') as config_file:
    boto_config = json.load(config_file)

# S3 client authenticated with the key pair taken from the config.
s3 = boto3.client(
    's3',
    aws_secret_access_key=boto_config['aws_secret_access_key'],
    aws_access_key_id=boto_config['aws_access_key_id'],
)

In [8]:
# Local working directory; the relative paths built from it are also used as the S3 object keys.
data_dir = 'data'
# exist_ok=True replaces the separate existence check: the cell is safe to re-run
# and there is no window between the check and the directory creation.
os.makedirs(data_dir, exist_ok=True)

src_file = data_dir + '/News_Category_Dataset.json'
src_file_cleaned = data_dir + '/News_Clean.csv'

# Uploads of the local copies to the bucket (disabled):
# s3.upload_file(src_file, boto_config['buckets']['kaggle'], src_file)
# s3.upload_file(src_file_cleaned, boto_config['buckets']['kaggle'], src_file_cleaned)

# Download of the raw dataset (disabled); only the cleaned CSV is fetched.
# s3.download_file(boto_config['buckets']['kaggle'], src_file, src_file)
s3.download_file(boto_config['buckets']['kaggle'], src_file_cleaned, src_file_cleaned)

In [9]:
# data = pd.read_json(src_file, lines=True)
# Load the pre-cleaned articles and keep only the rows whose cleaned text is present.
df = pd.read_csv(src_file_cleaned)
df = df.loc[df['clean_text'].notnull()]

In [None]:
# Summary statistics of the raw frame.
# NOTE(review): in the visible notebook `data` is only assigned by the commented-out
# `pd.read_json` line in the loading cell, so that line must be enabled first.
data.describe()

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# True if at least one cell anywhere in the raw frame is missing.
data.isnull().values.any()

In [None]:
# List the column names of the raw frame.
data.columns

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Text clean-up, step 1: load the spaCy English pipeline and assemble the raw documents.
nlp = spacy.load('en')

# One document per row: the headline and the short description joined by a single space.
data['text'] = data['headline'] + ' ' + data['short_description']

# Plain Python list of documents, in row order, to stream through the pipeline.
docs = list(data['text'])

def token_filter(token):
    """Return True when a token should be kept for the cleaned text.

    A token is rejected when it is punctuation, whitespace, a stop word, or
    when its text is three characters or shorter.

    Parameters
    ----------
    token : object
        Any object exposing ``is_punct``, ``is_space``, ``is_stop`` and ``text``.

    Returns
    -------
    bool
        True if the token passes every check, False otherwise.
    """
    # The earlier form chained the flags with `|` and then compared with `<= 3`.
    # Because `|` binds tighter than `<=`, that evaluated `(flags | len(text)) <= 3`,
    # so the flags had no effect on tokens longer than three characters.
    # Boolean `or` with an explicit length comparison expresses the intended rule.
    return not (
        token.is_punct
        or token.is_space
        or token.is_stop
        or len(token.text) <= 3
    )

# Text clean-up, step 2: run every document through the pipeline and rebuild it from
# the lemmas of the tokens that pass `token_filter` and are not in STOP_WORDS.
filtered_tokens = [
    ' '.join(
        tok.lemma_
        for tok in parsed
        if token_filter(tok) and tok.text not in STOP_WORDS
    )
    for parsed in nlp.pipe(docs)
]

data['clean_text'] = filtered_tokens

# Persist the frame, including the new column, without the index.
data.to_csv(src_file_cleaned, index=False)

In [10]:
# TF-IDF features with scikit-learn (default vectoriser settings).
# NOTE(review): the vectoriser is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook.
tfidf = TfidfVectorizer()
bag = tfidf.fit_transform(df['clean_text'])

In [11]:
# Map each category string to an integer id, then one-hot encode the ids.
lbler = LabelEncoder()
lbler.fit(df['category'])
labels = to_categorical(lbler.transform(df['category']))

# Width of the one-hot matrix = number of output classes for the network.
num_labels = labels.shape[-1]

In [12]:
# `bag` is the output of `tfidf.fit_transform` on this same `clean_text` column,
# which is the matrix `tfidf.transform` would produce again; reuse it instead of
# vectorising the whole corpus a second time.
train = bag

In [13]:
# Train / validation / test partition with fixed seeds.
# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.2,
    random_state=1,
)
# ... then carve 10% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=1,
)

In [65]:
# Feed-forward classifier in Keras: three ReLU hidden layers (256 -> 128 -> 64)
# followed by a softmax over the label classes.
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_labels, activation='softmax'),
])

In [66]:
# Multi-class setup: categorical cross-entropy loss, RMSprop optimiser, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [67]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 256)               14561024  
_________________________________________________________________
dense_15 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 31)                2015      
Total params: 14,604,191
Trainable params: 14,604,191
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Train for 10 epochs in mini-batches of 512, evaluating on the validation split
# after each epoch; the returned object holds the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=10,
)

Train on 89978 samples, validate on 9998 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 8192/89978 [=>............................] - ETA: 27s - loss: 0.6421 - acc: 0.8258

KeyboardInterrupt: 

In [None]:
# Per-epoch metric values and the matching epoch indices from the training run.
results, epochs = history.history, history.epoch

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(9,6))

ax.plot(epochs, results['acc'], label='train')
ax.plot(epochs, results['val_acc'], label='val')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy per epoch')
ax.legend()

plt.show()

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(9,6))

ax.plot(epochs, results['loss'], label='train')
ax.plot(epochs, results['val_loss'], label='val')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Loss per epoch')
ax.legend()

plt.show()