Read the .jsonl file into a DataFrame

In [20]:
#Libraries
import pandas as pd
import json
import string
import numpy as np

In [21]:
#Using read_json function to read .jsonl file into dataframe
#lines=True allows for reading the json file per line
contr_df = pd.read_json('controversial-comments.jsonl',lines=True)

#Viewing the data
print(contr_df.head())

   con                                                txt
0    0  Well it's great that he did something about th...
1    0                       You are right Mr. President.
2    0  You have given no input apart from saying I am...
3    0  I get the frustration but the reason they want...
4    0  I am far from an expert on TPP and I would ten...


Convert all text to lowercase letters

In [22]:
#Function for converting all text to lowercase letters
def lowerCase(string: str) -> str:
    #String lower function
    return string.lower()

#Apply the function
#Setting equal to 'txt' column in dataframe since it's being update
contr_df['txt'] = [lowerCase(string) for string in contr_df.txt]

Remove all punctuation from the text

In [23]:
#Function which removes all punctuation from a string of text
#maketrans --> returns a mapping table for translation, used by translate method
#Translates punctuation to nothing
def remove_punctuation(text: str) -> str:
    return text.translate(str.maketrans('', '', string.punctuation))

#Removing punctuation from contr_dr.txt
contr_df['txt'] = [remove_punctuation(text) for text in contr_df.txt]

Remove stop words

In [24]:
#Load library
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
#Downloading set of stop words
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

In [25]:
#Load stop words
stop_words = stopwords.words('english')

#Getting tokenized words -- tokenized copy of the text
contr_df['txt'] = contr_df['txt'].apply(word_tokenize)

In [26]:
#Removing stop words
contr_df['txt'] = [word for word in contr_df.txt if word not in stop_words]

In [27]:
contr_df

Unnamed: 0,con,txt
0,0,"[well, its, great, that, he, did, something, a..."
1,0,"[you, are, right, mr, president]"
2,0,"[you, have, given, no, input, apart, from, say..."
3,0,"[i, get, the, frustration, but, the, reason, t..."
4,0,"[i, am, far, from, an, expert, on, tpp, and, i..."
...,...,...
949995,0,"[i, genuinely, cant, understand, how, anyone, ..."
949996,0,"[as, a, reminder, this, subreddit, is, for, ci..."
949997,0,"[k, dont, explain, why, or, anything]"
949998,0,[deleted]


Apply NLTK's PorterStemmer

In [28]:
from nltk.stem.porter import PorterStemmer
#Already have word tokens from above piece of code

#Create stemmer
porter = PorterStemmer()

#Applying the stemmer
#Stems are created by removing the suffixes or prefixes used with a word
contr_df['txt'] = [[porter.stem(word) for word in words] for words in contr_df['txt']]

In [29]:
contr_df['txt']

0         [well, it, great, that, he, did, someth, about...
1                             [you, are, right, mr, presid]
2         [you, have, given, no, input, apart, from, say...
3         [i, get, the, frustrat, but, the, reason, they...
4         [i, am, far, from, an, expert, on, tpp, and, i...
                                ...                        
949995    [i, genuin, cant, understand, how, anyon, can,...
949996    [as, a, remind, thi, subreddit, is, for, civil...
949997                   [k, dont, explain, whi, or, anyth]
949998                                              [delet]
949999    [ya, sociopath, are, known, for, celebr, their...
Name: txt, Length: 950000, dtype: object

#### Three Different Techniques

Convert each text entry into a word-count vector

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
#nltk.download('averaged_perceptron_tagger')

In [40]:
#Create text
#Joining together words in each row of the dataframe to have lines of text

#Selecting 50,000 rows randomly for processing purposes
contr_df_sample = contr_df.sample(n=25000)
text_data = contr_df_sample['txt'].str.join(" ")
#text_data = [item for sublist in contr_df_sample['txt'] for item in sublist]

text_data

749532    i think we should look into the speech that wa...
769468    when did i say i thought that your make shit u...
814707                                           pleas dont
771241    and sinc he do thi on hi vacat day it cost us ...
845809    there is also the invers of the silver line ca...
                                ...                        
381443    thi is all becaus of the incred danger idea th...
623374                                             well see
97050     elon musk wa also ad to trump advisori council...
356481    tldr what you wrote wa probabl dumb abort is w...
444808    isnt thi a way to keep econom anxieti high unt...
Name: txt, Length: 25000, dtype: object

In [41]:
#Creating the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

#View feature matrix
#Using sorted on keys to preserve the same order as count.get_feature_names()
#feature_names = sorted(count.vocabulary_.keys())
#pd.DataFrame(bag_of_words.toarray(),columns=feature_names)
list(zip(count.get_feature_names(), bag_of_words.sum(0).getA1()))

[('00', 3),
 ('000', 3),
 ('00000038', 1),
 ('000327', 1),
 ('0013157894736842', 1),
 ('005', 2),
 ('006', 1),
 ('00712', 1),
 ('00811', 1),
 ('009', 2),
 ('00946', 1),
 ('01', 5),
 ('010', 1),
 ('016', 1),
 ('01664', 1),
 ('01857', 1),
 ('02', 2),
 ('025c', 1),
 ('02683', 1),
 ('03', 5),
 ('0311', 1),
 ('03285', 1),
 ('037', 2),
 ('04', 1),
 ('04583', 1),
 ('04917', 1),
 ('05', 3),
 ('05078', 1),
 ('05092', 1),
 ('05112', 1),
 ('05143', 1),
 ('05153', 1),
 ('05162013', 1),
 ('05409', 1),
 ('06', 2),
 ('06048', 1),
 ('06249', 1),
 ('07', 1),
 ('07422', 1),
 ('07882', 1),
 ('08', 14),
 ('0810', 1),
 ('09', 3),
 ('090', 1),
 ('09302001', 1),
 ('09302009', 1),
 ('09302015', 1),
 ('0g', 1),
 ('0httpwwwpolitifactcompunditfactstatements2016nov18blogpostingno3millionundocumentedimmigrantsdidnotvot',
  1),
 ('10', 165),
 ('100', 167),
 ('1000', 23),
 ('10000', 12),
 ('100000', 12),
 ('1000000', 1),
 ('10000000', 1),
 ('1001', 2),
 ('100200k', 1),
 ('10022', 1),
 ('100f', 1),
 ('100k', 11),
 ('

Convert each text entry into a part-of-speech tag vector

In [42]:
from nltk import pos_tag

#Use pre-trained part of speech tagger
#Casting to string object
text_tagged = pos_tag(word_tokenize(str(text_data)))

#Show parts of speech
text_tagged

[('749532', 'CD'),
 ('i', 'JJ'),
 ('think', 'VBP'),
 ('we', 'PRP'),
 ('should', 'MD'),
 ('look', 'VB'),
 ('into', 'IN'),
 ('the', 'DT'),
 ('speech', 'NN'),
 ('that', 'IN'),
 ('wa', 'NN'),
 ('...', ':'),
 ('769468', 'CD'),
 ('when', 'WRB'),
 ('did', 'VBD'),
 ('i', 'VB'),
 ('say', 'VBP'),
 ('i', 'JJ'),
 ('thought', 'VBD'),
 ('that', 'IN'),
 ('your', 'PRP$'),
 ('make', 'NN'),
 ('shit', 'NN'),
 ('u', 'NN'),
 ('...', ':'),
 ('814707', 'CD'),
 ('pleas', 'NNS'),
 ('dont', 'JJ'),
 ('771241', 'CD'),
 ('and', 'CC'),
 ('sinc', 'VB'),
 ('he', 'PRP'),
 ('do', 'VB'),
 ('thi', 'VB'),
 ('on', 'IN'),
 ('hi', 'JJ'),
 ('vacat', 'NN'),
 ('day', 'NN'),
 ('it', 'PRP'),
 ('cost', 'VBD'),
 ('us', 'PRP'),
 ('...', ':'),
 ('845809', 'CD'),
 ('there', 'EX'),
 ('is', 'VBZ'),
 ('also', 'RB'),
 ('the', 'DT'),
 ('invers', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('silver', 'NN'),
 ('line', 'NN'),
 ('ca', 'MD'),
 ('...', ':'),
 ('...', ':'),
 ('381443', 'CD'),
 ('thi', 'NN'),
 ('is', 'VBZ'),
 ('all', 'DT'),
 ('becaus'

Convert each entry into a term frequency-inverse document frequency (tf-idf) vector

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
#Create the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [45]:
#Show tf-idf feature matrix
#Using sorted on keys to preserve the same order as count.get_feature_names()
#feature_names = sorted(tfidf.vocabulary_.keys())
#feature_arr = feature_matrix.toarray()
#pd.DataFrame(feature_arr,columns=feature_names)
list(zip(tfidf.get_feature_names(), feature_matrix.sum(0).getA1()))

[('00', 0.8232030499681574),
 ('000', 0.8837024905654752),
 ('00000038', 0.08366034999796579),
 ('000327', 0.11932289049751407),
 ('0013157894736842', 0.42160000004428994),
 ('005', 0.42005800769076085),
 ('006', 0.25801503204430043),
 ('00712', 0.6403297409233798),
 ('00811', 0.6403297409233798),
 ('009', 0.3544273066413389),
 ('00946', 0.6403297409233798),
 ('01', 0.8439840043509931),
 ('010', 0.4238356951648405),
 ('016', 0.24065478576430033),
 ('01664', 0.6403297409233798),
 ('01857', 0.6403297409233798),
 ('02', 0.0649774910163946),
 ('025c', 0.1484287986596958),
 ('02683', 0.6403297409233798),
 ('03', 1.15579878424359),
 ('0311', 0.0675032842217596),
 ('03285', 0.6403297409233798),
 ('037', 0.4528034497749471),
 ('04', 0.3052684898286773),
 ('04583', 0.6403297409233798),
 ('04917', 0.6403297409233798),
 ('05', 0.45833139526004135),
 ('05078', 0.6403297409233798),
 ('05092', 0.6403297409233798),
 ('05112', 0.6403297409233798),
 ('05143', 0.6403297409233798),
 ('05153', 0.640329740

For the three techniques in problem (2) above, give an example where each would be useful.

1. Word-count vector

This technique is useful with categorical data that does not have a sort of ordering (nominal). In terms of application, since it gives us counts for the terms in a document, we can use those counts to compare documents and gauge how similar they are based on the number of times words appear (or don't appear) in them. It can be used as a form of document classification.

2. Part-of-speech tag vector

This technique is especially useful with decoding words that can have different parts-of-speech in a language. For example, the word "help" can be both a verb and a noun in English. Therefore, by having this tagging, we can more fully understand what the word is representing in the document and give it more linguistic understanding. 

3. TF-IDF feature matrix

This technique can be used in visualizations for depicting documents in relation to their most distinctive keywords. It can give meaning to a document in an appealing and easy-to-understand way by knowing the terms in a document that are distinctively frequent in a dcoument, when that document is compared with other documents. 