In [1]:
import pandas as pd
import numpy as np
import jsonlines
import re
import sys
import json
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import DictVectorizer
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the JSON Lines file: one JSON document per physical line.
# - The encoding is pinned to UTF-8 instead of relying on the platform default.
# - Iterating the file object splits on newlines only; str.splitlines() also breaks on
#   characters such as U+2028 / U+0085, which may appear unescaped inside a JSON string
#   and would cut a record in half.
# - Blank lines are skipped so that json.loads does not fail on them later.
with open("controversial-comments.jsonl", encoding="utf-8") as f:
    lines = [line.rstrip("\n") for line in f if line.strip()]
# Preview a few records instead of echoing the entire list into the cell output.
lines[:5]

['{"con": 0, "txt": "Well it\'s great that he did something about those beliefs while he was in office. I doubt Trump would fight the UN for it so I\'m really really happy Obama did something while he could.......oh wait...."}',
 '{"con": 0, "txt": "You are right Mr. President."}',
 '{"con": 0, "txt": "You have given no input apart from saying I am wrong. You have no argument clearly."}',
 '{"con": 0, "txt": "I get the frustration but the reason they want them to do it that way is because its the foundation for more complex problems as they advance in grades. I can get a decent grade on an SAT type test for math, but I don\'t really understand a lot of the mathematical ways to get the right answer. Lots of times you can figure out a \\"common sense\\" work around to a lot of the questions, but I would be ill prepared to take college level math courses despite the above average math score. \\n\\nThey\'re not just trying to bust the kids balls."}',
 '{"con": 0, "txt": "I am far from an e

In [3]:
# Wrap the raw JSON strings in a one-column frame; each row holds one unparsed record
df_inter = pd.DataFrame(lines, columns=['json_element'])
df_inter.head()

Unnamed: 0,json_element
0,"{""con"": 0, ""txt"": ""Well it's great that he did..."
1,"{""con"": 0, ""txt"": ""You are right Mr. President.""}"
2,"{""con"": 0, ""txt"": ""You have given no input apa..."
3,"{""con"": 0, ""txt"": ""I get the frustration but t..."
4,"{""con"": 0, ""txt"": ""I am far from an expert on ..."


In [4]:
# Separate the con field and the txt field by parsing every JSON string with json.loads
# (the result is only displayed here, not stored)
df_inter['json_element'].map(json.loads)

0         {'con': 0, 'txt': 'Well it's great that he did...
1         {'con': 0, 'txt': 'You are right Mr. President.'}
2         {'con': 0, 'txt': 'You have given no input apa...
3         {'con': 0, 'txt': 'I get the frustration but t...
4         {'con': 0, 'txt': 'I am far from an expert on ...
                                ...                        
949995    {'con': 0, 'txt': 'I genuinely can't understan...
949996    {'con': 0, 'txt': 'As a reminder, this subredd...
949997    {'con': 0, 'txt': 'K. Don't explain why or any...
949998                       {'con': 0, 'txt': '[deleted]'}
949999    {'con': 0, 'txt': 'Ya, sociopaths are known fo...
Name: json_element, Length: 950000, dtype: object

In [5]:
# Parse each JSON string into a dict, then expand the dicts into one column per key
# so the comment text is available as its own txt field.
parsed_records = df_inter['json_element'].apply(json.loads)
dfTwitter = pd.json_normalize(parsed_records)
dfTwitter.head()

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...


In [6]:
# Lowercase every entry of the txt field (the column is overwritten with the result)
dfTwitter['txt'] = dfTwitter['txt'].str.lower()

In [7]:
# Preview the first rows to confirm the txt field is now lowercase
dfTwitter.head()

Unnamed: 0,con,txt
0,0,well it's great that he did something about th...
1,0,you are right mr. president.
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


In [8]:
# Remove all punctuation from the txt field: drop every character that is neither a
# word character nor whitespace.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged. The raw string avoids
# the invalid-escape-sequence warning that the unprefixed pattern triggers.
dfTwitter["txt"] = dfTwitter['txt'].str.replace(r'[^\w\s]', '', regex=True)
dfTwitter.head()

Unnamed: 0,con,txt
0,0,well its great that he did something about tho...
1,0,you are right mr president
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


In [9]:
# Split each comment into a list of word tokens.
# The function is applied to the txt column only; DataFrame.apply(axis=1) builds a
# Series for every row, which is needlessly slow for a single-column transform on a
# frame of this size. str() keeps the original handling of non-string entries.
dfTwitter['txt'] = dfTwitter['txt'].apply(lambda text: word_tokenize(str(text)))
dfTwitter.head()

Unnamed: 0,con,txt
0,0,"[well, its, great, that, he, did, something, a..."
1,0,"[you, are, right, mr, president]"
2,0,"[you, have, given, no, input, apart, from, say..."
3,0,"[i, get, the, frustration, but, the, reason, t..."
4,0,"[i, am, far, from, an, expert, on, tpp, and, i..."


In [10]:
# Load the English stop words as defined by NLTK; used below to filter the token lists
stop = stopwords.words('english')

In [11]:
# Remove stopwords from the token list of every row.
# Membership is tested against a set: checking `word not in stop` against the stopword
# list rescans the whole list for every single token.
stop_lookup = set(stop)
dfTwitter['txt'] = dfTwitter['txt'].apply(
    lambda tokens: [word for word in tokens if word not in stop_lookup]
)
dfTwitter.head()

Unnamed: 0,con,txt
0,0,"[well, great, something, beliefs, office, doub..."
1,0,"[right, mr, president]"
2,0,"[given, input, apart, saying, wrong, argument,..."
3,0,"[get, frustration, reason, want, way, foundati..."
4,0,"[far, expert, tpp, would, tend, agree, lot, pr..."


In [12]:
# Instantiate the Porter stemmer used to reduce tokens to their stems
ps = PorterStemmer()

In [13]:
# Replace every token in the txt field with its Porter stem
dfTwitter['txt'] = dfTwitter['txt'].map(lambda tokens: list(map(ps.stem, tokens)))

In [14]:
# Preview the stemmed token lists
dfTwitter.head()

Unnamed: 0,con,txt
0,0,"[well, great, someth, belief, offic, doubt, tr..."
1,0,"[right, mr, presid]"
2,0,"[given, input, apart, say, wrong, argument, cl..."
3,0,"[get, frustrat, reason, want, way, foundat, co..."
4,0,"[far, expert, tpp, would, tend, agre, lot, pro..."


In [15]:
# Flatten the per-comment token lists into a single list of words.
wordDict = dfTwitter['txt'].to_dict()
# Iterate the values directly instead of looking up keys 0..n-1: the positional lookup
# raised KeyError whenever the frame's index was not the default 0..n-1 range
# (for example after rows had been filtered out).
wordArray = [word for tokens in wordDict.values() for word in tokens]

In [16]:
#Create a word count vector
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Instantiate the CountVectorizer object with its default settings
cv = CountVectorizer()

In [18]:
# Create the count array: one row per comment, one column per vocabulary term.
# The vectorizer expects an iterable of documents, so the stemmed tokens of each comment
# are re-joined into one string. Passing the flat word list made every individual word
# its own "document", so each row held at most a single non-zero count.
# Only a sample of comments is used because .toarray() builds a dense matrix, and the
# full corpus does not fit in memory.
N_COUNT_SAMPLE = 1000
count_docs = dfTwitter['txt'].iloc[:N_COUNT_SAMPLE].str.join(' ')
cdict = cv.fit_transform(count_docs).toarray()

In [19]:
# Display the count matrix built in the previous step
cdict

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Create a part of speech tag vector: for each of the first 100 rows, keep only the
# tag of every (word, tag) pair returned by pos_tag.
# NOTE(review): the tokens were already lowercased, stop-word filtered and stemmed
# before reaching this point, which presumably reduces tagging accuracy; confirm whether
# tagging the raw text was intended.
tweetList = dfTwitter['txt'][0:100].to_list()
tagged_tweets = [[tag for _, tag in pos_tag(tweet)] for tweet in tweetList]

In [21]:
# Convert tags into features: one binary indicator column per part-of-speech tag
one_hot_multi = MultiLabelBinarizer()
pos_features = one_hot_multi.fit_transform(tagged_tweets)
pos_features

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
# Show the classes (part-of-speech tags) learned by the binarizer for the tagging matrix
one_hot_multi.classes_

array(['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN',
       'NNP', 'NNS', 'PRP', 'PRP$', 'RB', 'RBR', 'RP', 'UH', 'VB', 'VBD',
       'VBN', 'VBP', 'VBZ', 'WDT', 'WP'], dtype=object)

In [23]:
# Create a TfidfVectorizer object with its default settings
tfidf = TfidfVectorizer()

In [24]:
# Create the TF-IDF matrix: one row per comment, one column per vocabulary term.
# Each comment's stemmed tokens are re-joined into one document string. Fitting on the
# flat word list treated every word as its own document, so term frequency and inverse
# document frequency carried no information and the rows degenerated to one-hot values.
# Only a sample of comments is used because .toarray() builds a dense matrix, and the
# full corpus does not fit in memory.
N_TFIDF_SAMPLE = 1000
tfidf_docs = dfTwitter['txt'].iloc[:N_TFIDF_SAMPLE].str.join(' ')
feature_matrix = tfidf.fit_transform(tfidf_docs).toarray()
feature_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
'''Follow up question
For 2B, that has a great use case where it can be seen already in use in industry
by looking at warranty claim data for vehicles. Most vehicle manufacturers want to know
what happened when their vehicle failed. The warranty claims have text in them that
gives the group a look into what happened. By having a bag of words approach like this, 
we can see highlighted issues like "fire", "electrical fault", etc.

Following up on my previous example, we could also use a term frequency-inverse document
frequency (TF-IDF) matrix to estimate how likely a claim is to be a false positive. For example, take "fire".
If that phrase is entered once, but there are other key phrases like "electrical fault" 
and "brake failure", then those two would be the more important phrases in the claim,
and the claim would therefore be bucketed under either brake failure or electrical fault instead of fire.

A word count vector lets someone see how often a word is brought up. 
When doing some type of analysis on text, you could use keyword counts to weight 
a training dataset appropriately. 
'''

'Follow up question\nFor 2B, that has a great use case where it can be seen already in use in industry\nby looking at warranty claim data for vehicles. Most vehicle manufacturers want to know\nwhat happened when their vehicle failed. The warranty claims have text in them that\ngives the group a look into the what happened. By having a bag of words approach like this, \nwe can see highlighted issues like "fire", "electrical fault", etc.\n\nFollowing up on my previous example, we could also use a term frequency document\nto calculate how strong a chance of a claim being a false positive. For example, looking at "fire".\nIf the phrase is entered once, but there are other key phrases like "electrical fault" \nand "break failure", then those two would be the more important phrases in the claim\nand therefore would be bucketed in either break failure or electrical fault instead of fire.\n\nThe use case for a word count vector would allow someone to see how often a word is brought up. \nWhen 