In [1]:
import module.readData.main as jsonHandler
import module.filterData.main as filterData
import module.filterData.constant_REs as con_REs

In [2]:
# only once
#import nltk
#nltk.download('stopwords')

___

## Query Data Live From Twitter
source: [@Amazon](https://twitter.com/amazon)

It is out of this notebook's scope

___

## Read Data

In [3]:
dataDict = jsonHandler.readJson("./amazon.json")

___

## Data Preprocessing

### Fetch Tweets Text List

In [4]:
# inspect raw data
dataDict

{'data': [{'id': '1337214201622827010',
   'text': '@amazon Her name is Izzy! :)'},
  {'id': '1337898856483188736',
   'text': "Got some amazon packages and had sibling open them and put them under the tree for me. Thank you santa's elves #tbtbsanta"},
  {'id': '1337922297617809408',
   'text': "@amazon Hahaha she's super creative. Loves to make stuff, arts &amp; crafts, drawing, painting, etc. She's a smart cookie for sure 😂❤"},
  {'id': '1337878827121328128',
   'text': 'I love Amazon Fresh. I can have all my groceries delivered in two hours 🙌🏻 https://t.co/8qcBdOh3wy'},
  {'id': '1336235270245650432',
   'text': "This purple wig is so pretty! Amazon is awesome for wigs. I'm barely getting by mentally at work and doing this is an amazing release for me. https://t.co/mqQaSWCf6a"},
  {'id': '1337899140055875587',
   'text': 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages

In [5]:
id_text_list = dataDict['data']
text_list = [x['text'] for x in id_text_list]

In [6]:
# list of tweets texts
text_list

['@amazon Her name is Izzy! :)',
 "Got some amazon packages and had sibling open them and put them under the tree for me. Thank you santa's elves #tbtbsanta",
 "@amazon Hahaha she's super creative. Loves to make stuff, arts &amp; crafts, drawing, painting, etc. She's a smart cookie for sure 😂❤",
 'I love Amazon Fresh. I can have all my groceries delivered in two hours 🙌🏻 https://t.co/8qcBdOh3wy',
 "This purple wig is so pretty! Amazon is awesome for wigs. I'm barely getting by mentally at work and doing this is an amazing release for me. https://t.co/mqQaSWCf6a",
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you don’t want to ruin Christmas https://t.co/rDikf3T7qe',
 'Amazon never fail me ain’t lying 🤣🥴.',
 'Christmas time is oh so cozy at my apartment! 1st Christmas here&amp; I absolutely love it! ❤️🌟🎄✨ My Christmas tree skirt is from @amazon 🥰 https://t.co/NUd

### To Be Replaced
- New lines, i.e "\n", with fullstops

### To Be Distilled
- Mentions, e.g @amazon
- Links
- Emojis
- Numbers
- Special Characters, e.g ' ! ', ' : ', ' ) ', ' " ', ' / ', and ' # '. They do not include ".", ",", or " ' "

In [7]:
# replace new lines with a fullstop
text_list = [filterData.replaceNewLinesWith(x, '.') for x in text_list]

# alternatively, replace with a space

In [8]:
# Distill
temReLis = [con_REs.MENTIONS, con_REs.LINKS, con_REs.HASHTAG, con_REs.EMOJIS, con_REs.NUMBERS, con_REs.SPECIAL_CHAR]
text_list = [filterData.filterStrFromReList(temReLis, x) for x in text_list]

In [9]:
text_list

[' Her name is Izzy ',
 'Got some amazon packages and had sibling open them and put them under the tree for me. Thank you santas elves ',
 ' Hahaha shes super creative. Loves to make stuff arts amp crafts drawing painting etc. Shes a smart cookie for sure ',
 'I love Amazon Fresh. I can have all my groceries delivered in two hours  ',
 'This purple wig is so pretty Amazon is awesome for wigs. Im barely getting by mentally at work and doing this is an amazing release for me. ',
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you dont want to ruin Christmas ',
 'Amazon never fail me aint lying .',
 'Christmas time is oh so cozy at my apartment st Christmas hereamp I absolutely love it  My Christmas tree skirt is from   ',
 'Working on school stuff with my yo daughter.yo How do you spell want.Me Try to sound it out please..yo ALEXA HOW DO YOU SPELL WANT.Alexa Want is

In [10]:
text_list = [filterData.stripBegEnd(x) for x in text_list]

In [11]:
text_list

['Her name is Izzy',
 'Got some amazon packages and had sibling open them and put them under the tree for me. Thank you santas elves',
 'Hahaha shes super creative. Loves to make stuff arts amp crafts drawing painting etc. Shes a smart cookie for sure',
 'I love Amazon Fresh. I can have all my groceries delivered in two hours',
 'This purple wig is so pretty Amazon is awesome for wigs. Im barely getting by mentally at work and doing this is an amazing release for me',
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you dont want to ruin Christmas',
 'Amazon never fail me aint lying',
 'Christmas time is oh so cozy at my apartment st Christmas hereamp I absolutely love it  My Christmas tree skirt is from',
 'Working on school stuff with my yo daughter.yo How do you spell want.Me Try to sound it out please..yo ALEXA HOW DO YOU SPELL WANT.Alexa Want is spelled... wan

In [12]:
# don't use dot, i.e period "." here
text_list = [filterData.removeDuplicatedCharList(x, [' ']) for x in text_list]
text_list = [filterData.removeDuplicatedPeriods(x) for x in text_list]

In [13]:
text_list

['Her name is Izzy',
 'Got some amazon packages and had sibling open them and put them under the tree for me. Thank you santas elves',
 'Hahaha shes super creative. Loves to make stuff arts amp crafts drawing painting etc. Shes a smart cookie for sure',
 'I love Amazon Fresh. I can have all my groceries delivered in two hours',
 'This purple wig is so pretty Amazon is awesome for wigs. Im barely getting by mentally at work and doing this is an amazing release for me',
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you dont want to ruin Christmas',
 'Amazon never fail me aint lying',
 'Christmas time is oh so cozy at my apartment st Christmas hereamp I absolutely love it My Christmas tree skirt is from',
 'Working on school stuff with my yo daughter.yo How do you spell want.Me Try to sound it out please.yo ALEXA HOW DO YOU SPELL WANT.Alexa Want is spelled. want.Me

In [14]:
text_list = [filterData.removeDuplicatedCharSeq(x) for x in text_list]

In [15]:
text_list

['Her name is Izzy',
 'Got some amazon packages and had sibling open them and put them under the tree for me.Thank you santas elves',
 'Hahaha shes super creative.Loves to make stuff arts amp crafts drawing painting etc.Shes a smart cookie for sure',
 'I love Amazon Fresh.I can have all my groceries delivered in two hours',
 'This purple wig is so pretty Amazon is awesome for wigs.Im barely getting by mentally at work and doing this is an amazing release for me',
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you dont want to ruin Christmas',
 'Amazon never fail me aint lying',
 'Christmas time is oh so cozy at my apartment st Christmas hereamp I absolutely love it My Christmas tree skirt is from',
 'Working on school stuff with my yo daughter.yo How do you spell want.Me Try to sound it out please.yo ALEXA HOW DO YOU SPELL WANT.Alexa Want is spelled.want.Me.Alexa

### Chunk Tweets of Multiple Sentences
by separating on fullstops

In [16]:
new_text_list = []
for el in text_list:
    new_text_list = new_text_list + el.split('.')

In [17]:
new_text_list

['Her name is Izzy',
 'Got some amazon packages and had sibling open them and put them under the tree for me',
 'Thank you santas elves',
 'Hahaha shes super creative',
 'Loves to make stuff arts amp crafts drawing painting etc',
 'Shes a smart cookie for sure',
 'I love Amazon Fresh',
 'I can have all my groceries delivered in two hours',
 'This purple wig is so pretty Amazon is awesome for wigs',
 'Im barely getting by mentally at work and doing this is an amazing release for me',
 'When your parents live out of state and they bought your presents on Amazon but you also buy shit on Amazon so you have to ask permission to open your own packages cause you dont want to ruin Christmas',
 'Amazon never fail me aint lying',
 'Christmas time is oh so cozy at my apartment st Christmas hereamp I absolutely love it My Christmas tree skirt is from',
 'Working on school stuff with my yo daughter',
 'yo How do you spell want',
 'Me Try to sound it out please',
 'yo ALEXA HOW DO YOU SPELL WANT',
 

In [19]:
new_text_list = filterData.lowerAndTermsList_docList_in(new_text_list)

In [20]:
new_text_list = filterData.removeStopwordsFromDocList(new_text_list)

In [21]:
new_text_list

[['name', 'izzy'],
 ['got', 'amazon', 'packages', 'sibling', 'open', 'put', 'tree'],
 ['thank', 'santas', 'elves'],
 ['hahaha', 'shes', 'super', 'creative'],
 ['loves',
  'make',
  'stuff',
  'arts',
  'amp',
  'crafts',
  'drawing',
  'painting',
  'etc'],
 ['shes', 'smart', 'cookie', 'sure'],
 ['love', 'amazon', 'fresh'],
 ['groceries', 'delivered', 'two', 'hours'],
 ['purple', 'wig', 'pretty', 'amazon', 'awesome', 'wigs'],
 ['im', 'barely', 'getting', 'mentally', 'work', 'amazing', 'release'],
 ['parents',
  'live',
  'state',
  'bought',
  'presents',
  'amazon',
  'also',
  'buy',
  'shit',
  'amazon',
  'ask',
  'permission',
  'open',
  'packages',
  'cause',
  'dont',
  'want',
  'ruin',
  'christmas'],
 ['amazon', 'never', 'fail', 'aint', 'lying'],
 ['christmas',
  'time',
  'oh',
  'cozy',
  'apartment',
  'st',
  'christmas',
  'hereamp',
  'absolutely',
  'love',
  'christmas',
  'tree',
  'skirt'],
 ['working', 'school', 'stuff', 'yo', 'daughter'],
 ['yo', 'spell', 'want']