In [1]:
# Import the libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Get the stopwords
stop = stopwords.words('english')

In [3]:
# Load in the data
data = pd.read_csv('../Datasets/nyt_comments/CommentsApril2017.csv')

# Have a look at the first 5 rows
data.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,...,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245186,This project makes me happy to be a 30+ year T...,22022598.0,22022598,<br/>,comment,1491237000.0,1,False,0.0,...,"Riverside, CA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
1,1491188619,Stunning photos and reportage. Infuriating tha...,22017350.0,22017350,,comment,1491180000.0,1,False,0.0,...,<br/>,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
2,1491188617,Brilliant work from conception to execution. I...,22017334.0,22017334,<br/>,comment,1491179000.0,1,False,0.0,...,Raleigh NC,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
3,1491167820,NYT reporters should provide a contributor's l...,22015913.0,22015913,<br/>,comment,1491150000.0,1,False,0.0,...,"Missouri, USA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
4,1491167815,Could only have been done in print. Stunning.,22015466.0,22015466,<br/>,comment,1491147000.0,1,False,0.0,...,"Tucson, Arizona",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News


In [4]:
# Have a look at the shape
data.shape

(243832, 34)

### Text pre-processing

In [5]:
# Make the text lowercase
data['commentBody_processed'] = data['commentBody'].str.lower()

In [6]:
# Remove punctuation
data['commentBody_processed'] = data['commentBody_processed'].str.replace('[^\w\s]', '')

In [7]:
# Have a look at the first comment
data['commentBody_processed'][0]

'this project makes me happy to be a 30 year times subscriber continue to innovate across all platforms please'

In [8]:
# Now we'll remove the stop words
data['commentBody_processed'] = data['commentBody_processed'].apply(lambda x: x.split(' '))
data['commentBody_processed'] = data['commentBody_processed'].apply(lambda x: [el for el in x if el not in stop])
data['commentBody_processed'] = data['commentBody_processed'].apply(lambda x: ' '.join(x))

# Have a look at the first comment
data['commentBody_processed'][0]

'project makes happy 30 year times subscriber continue innovate across platforms please'

In [9]:
# Get a better feel of the word count
data['word_count'] = data['commentBody_processed'].apply(lambda x: x.count(' '))
data['word_count'].describe()

count    243832.000000
mean         38.669666
std          34.508115
min           0.000000
25%          13.000000
50%          28.000000
75%          53.000000
max         624.000000
Name: word_count, dtype: float64

In [10]:
# Define the vocab size (number of features)
VOCAB_SIZE = 10000

In [11]:
# Tokenize the text
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(data['commentBody_processed']))

In [12]:
# Create the sequences
X = tokenizer.texts_to_sequences(list(data['commentBody_processed']))

In [13]:
# Have a look at the first sequence
X[0]

[1797, 181, 586, 915, 148, 66, 255, 656, 6477, 273]

In [14]:
# Define the maximum sequence length
MAX_LENGTH = 100

In [15]:
# Pad the sequences
X = pad_sequences(X, maxlen=MAX_LENGTH, padding='pre')

In [16]:
# Have a look at the first sequence
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 1797,  181,  586,  915,  148,   66,  255,  656, 6477,
        273])

In [18]:
# Compare it to the original data
data['commentBody'][0]

'This project makes me happy to be a 30+ year Times subscriber... continue to innovate across all platforms, please.'