# Twitch NLP Pipeline

In [56]:
import numpy as np
import pandas as pd
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import string

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()



In [2]:
df = pd.read_pickle("../chat_log_data/pickle/twitch_chat_df.pkl")

In [3]:
df.head()

Unnamed: 0,index,game,channel,timestamp,user_name,chat_msg
0,55734,DOTA,admiralbulldog,2019-05-31 09:48:00,cy_dota2,Krappa
1,55735,DOTA,admiralbulldog,2019-05-31 09:48:00,asavell,ON STREAM :)
2,55758,DOTA,admiralbulldog,2019-05-31 09:48:00,gandor87,WHY IS IT PAUSED PepeHands
3,55759,DOTA,admiralbulldog,2019-05-31 09:48:00,nobody1564,LULW
4,55760,DOTA,admiralbulldog,2019-05-31 09:48:00,not_that_guy,@AdmiralBulldog what is the site where you can...


In [69]:
channel_list = df['channel'].unique()
channel_list

array(['admiralbulldog', 'dota2ruhub', 'esl_dota2', 'playhearthstone',
       'playhearthstoneru', 'solaryhs', 'edisonparklive'], dtype=object)

In [15]:
admiralbulldog = df[df['channel']=='admiralbulldog']
admiralbulldog.head()

Unnamed: 0,index,game,channel,timestamp,user_name,chat_msg
0,55734,DOTA,admiralbulldog,2019-05-31 09:48:00,cy_dota2,Krappa
1,55735,DOTA,admiralbulldog,2019-05-31 09:48:00,asavell,ON STREAM :)
2,55758,DOTA,admiralbulldog,2019-05-31 09:48:00,gandor87,WHY IS IT PAUSED PepeHands
3,55759,DOTA,admiralbulldog,2019-05-31 09:48:00,nobody1564,LULW
4,55760,DOTA,admiralbulldog,2019-05-31 09:48:00,not_that_guy,@AdmiralBulldog what is the site where you can...


In [71]:
df_channel = df[df['channel']=='edisonparklive']
df_channel.head()

Unnamed: 0,index,game,channel,timestamp,user_name,chat_msg
82082,83319,OTHER,edisonparklive,2019-05-31 09:57:00,alex182736,that one had a helmet LUL
82083,83320,OTHER,edisonparklive,2019-05-31 09:57:00,bloggerama,"okay these headshots are p sick, but doesnt ma..."
82084,83321,OTHER,edisonparklive,2019-05-31 09:57:00,theindomitablepenguin,OOF
82085,83322,OTHER,edisonparklive,2019-05-31 09:57:00,juroph,LUL
82086,83323,OTHER,edisonparklive,2019-05-31 09:57:00,humblememefarmer_,D:


### tokenize chat into words and lowercase

In [72]:
# tokens = [sent for sent in map(word_tokenize, sent_tokens)]
df_channel['word_chat_msg'] = df_channel['chat_msg'].apply(lambda x: x.lower().split())
df_channel['word_chat_msg'][:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


82082                     [that, one, had, a, helmet, lul]
82083    [okay, these, headshots, are, p, sick,, but, d...
82084                                                [oof]
82085                                                [lul]
82086                                                 [d:]
82087                                             [monkaw]
82088                                           [omegalul]
82089                                         [jesus, man]
82090                                             [monkaw]
82091                                             [monkaw]
Name: word_chat_msg, dtype: object

### Remove stopwords and punctuations

In [5]:
stopwords_ = set(stopwords.words('english'))
punctuation_ = set(string.punctuation)

In [73]:
def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

tokens_filtered = list(map(filter_tokens, df_channel['word_chat_msg']))
tokens_filtered

[['one', 'helmet', 'lul'],
 ['okay',
  'headshots',
  'p',
  'sick,',
  'doesnt',
  'make',
  'civ',
  'killing',
  'lul'],
 ['oof'],
 ['lul'],
 ['d:'],
 ['monkaw'],
 ['omegalul'],
 ['jesus', 'man'],
 ['monkaw'],
 ['monkaw'],
 ['ahaahahahah'],
 ['d:'],
 ['lulw'],
 ['f'],
 ['oof'],
 ['meemss'],
 ['lulw'],
 ['ahhahahahahahah'],
 ['kek'],
 ['lul', 'omg'],
 ['hah'],
 ['hahahaha'],
 ['lulw'],
 ['ahhahahahahahahaha'],
 ['lulw'],
 ['lul'],
 ['iam', 'crying'],
 ['lul'],
 ['bring', 'lock', 'pick'],
 ['@bloggerama', 'see', 'thats', 'slaying.', 'clean'],
 ['holy.', 'crap', 'dude'],
 ['worth'],
 ['d:'],
 ['lulw'],
 ['pog', 'clap'],
 ['bushlul'],
 ['karma'],
 ['lulw', 'start', 'lockpick'],
 ['lulw', 'hpyerclap'],
 ['lul'],
 ['hero', 'arrives'],
 ['omfg'],
 ['start', 'lockpick'],
 ['ugh!'],
 ['start', 'mission', 'lockpic'],
 ['get', 'lockpick', '4head'],
 ['d:'],
 ['lul'],
 ['take', 'lockpick'],
 ['bring', 'lock', 'pick'],
 ['@edisonparklive', 'need', 'lockpick'],
 ['hahaha'],
 ['master', 'difficult

In [74]:
df_channel['filtered_chat_msg'] = df_channel['word_chat_msg'].apply(lambda x: filter_tokens(x))
df_channel['filtered_chat_msg'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


82082                                   [one, helmet, lul]
82083    [okay, headshots, p, sick,, doesnt, make, civ,...
82084                                                [oof]
82085                                                [lul]
82086                                                 [d:]
Name: filtered_chat_msg, dtype: object

Stemming/Lemmatization
====================================================

Now that we have seen how to create a list of documents (lists of strings which are tokens), let's go through and stem / lemmatize each token. See [stemming](http://www.nltk.org/howto/stem.html) page and [lemmatization](http://www.nltk.org/_modules/nltk/stem/wordnet.html) page.

```python
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()
```

1. Try running both stemmers and the lemmatizer on the documents. They only modify one word at a time, so you'll need to do a double for loop to apply the stemmer/lemmatizer to each word in all the documents.

    Save the results in 3 separate variables.

2. Compare the results. What do you notice are the differences between the two stemmers and the lemmatizer? Write your results as comments in your code!

3. Choose one of the 3 to use from here on out.

In [33]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

docs_porter = [[porter.stem(word) for word in words]
               for words in admiralbulldog['filtered_chat_msg']]
docs_snowball = [[snowball.stem(word) for word in words]
                 for words in admiralbulldog['filtered_chat_msg']]
docs_wordnet = [[wordnet.lemmatize(word) for word in words]
                for words in admiralbulldog['filtered_chat_msg']]

In [52]:
## Print the stemmed and lemmatized words from the first document
print("%16s %16s %16s %16s" % ("word", "porter", "snowball", "lemmatizer"))
for i in range(min(len(docs_porter[0]), len(docs_snowball[0]), len(docs_wordnet[0]))):
    p, s, w = docs_porter[0][i], docs_snowball[0][i], docs_wordnet[0][i]
    if len(set((p, s, w))) != 1:
        print("%16s %16s %16s %16s" % (docs[0][i], p, s, w))

            word           porter         snowball       lemmatizer


In [50]:
min(len(docs_porter), len(docs_snowball), len(docs_wordnet))

10777

In [49]:
len(docs_porter)

10777

In [42]:
for words in admiralbulldog['filtered_chat_msg']:
    print(words)

['krappa']
['stream', ':)']
['paused', 'pepehands']
['lulw']
['@admiralbulldog', 'site', 'check', 'gift?', 'tell']
['d:']
['pog']
['krappa']
['wtff']
['feelsbadman']
['bulldog', 'jewish?']
['pog']
['d:', 'xic']
['bought', ':)']
['krappa']
['lulw']
['pog']
['krappa']
['lul']
['omegalul', 'ld']
['poh']
['krappa']
['lulw']
['item', 'karappa']
["he's", 'selling', 'offstream', 'omegalul']
['krappa', 'clap']
['pogchamp']
['lul']
['krappa']
['krappa']
['balancelot!!']
['omegalul']
['pog']
['krappa']
['l', 'omegalul', 'omegalul', 'k']
['krappa']
['krappa']
['lulw']
['krappa']
['krappa']
['krappa']
['wallet', 'krappa']
['buying', 'video', 'games', 'omegalul']
['pepehands']
['gaben']
['lulw']
['luuk']
['pogu']
['krappa']
['krappa']
['krappa']
['pepehands', 'pixels']
['brought', 'admiralkristin']
['weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp', 'weirdchamp']
['yes', 'true', 'pogu']
['pog', 'professional', 'business', 'man!']
['krappa

['hows', 'alliance', 'tourney?', '@admiralbulldog']
['monkas']
['lul', 'lul']
['omegaez']
['pepega']
['ban']
['dean', 'pepehands']
['im', 'alive', 'trihard']
['@zevegas', 'peeporip']
['lul']
['!love', 'mom']
['@zevegas', 'omegalul']
['peeporip']
['monkah']
['peeporip']
['kkool']
['dean', 'pepehands']
['xxx']
['@zevegas', 'pepega', 'omegalul']
["there's", '88%', '<3', 'hamagori7', 'mom', 'feelsokayman', ':backhand_index_pointing_right:', '<3']
['kkool']
['ban', 'now!!!!!!!!!']
['get', 'rekt', 'lul']
['@zevegas', 'lulw']
['pound', 'daddy']
['pepelaugh']
['peeporip']
['sacrifice']
['kkool']
['@zevegas', 'omegalul']
['supernatural', 'song', 'pogu']
['@admiralbulldog', 'ban', 'rat', 'sub', ':)']
['omegalul']
['monkah']
['monkah']
['red']
['red']
['kkool', 'guitartime', 'supernatural']
[':)']
['ulol']
['@chadha123', '@zevegas', 'omegalul']
['monkah']
['@zevegas', 'omegalul']
['lol']
['poggies']
['@zevegas', 'lulw']
['ret+']
['reed']
['@chadha123', '@akabane85', '@zevegas', 'lulw']
['@admiral

['zulul']
['gwa', 'zulul']
['zulul']
['zulul']
['zulul', 'yuganda']
['wtff']
['zulul']
['play', ':backhand_index_pointing_right:', 'minglul']
['pepescoots']
['zulul']
['zulul']
['zulul']
['spoiders', 'pepescoots']
['zulul']
['zulul']
['zulul']
['cmonbruh']
['zulul']
['zulul']
['monkagiga']
['wtff', '?????']
['zulul']
['zulul']
['!playsound', 'uganda']
['pissed?']
['zulul']
['play', 'bulldog', 'too,', 'hes', 'beast', 'hero', 'lulw']
['gwa', 'gwa', 'zulul']
['zulul']
['play', 'inquisitor', 'martyr', 'pog', '@admiralbulldog']
['elvis']
['zulul']
['wtff']
['prostom']
['!points']
['zulul']
['lulw']
['goblins', 'dragons', 'chat?']
['rat', 'subs', 'weirdchamp']
['krappa']
['rog', 'makes', 'best', 'hardware', 'pc', 'gaming,', 'esports,', 'overclocking.', 'get', 'gear', 'here:', 'http://us.rog.gg/admiral']
['krappa']
['mingbruh']
['lul']
['krappa']
['zulul']
['rome', '2', 'lul']
['rat', 'subs', 'feelsgoodman']
['!playsound']
['lul']
['4head']
['4head']
['4head']
['4head']
['!playsound', 'uh']
[

['jebaited', 'jebaited', 'jebaited']
['jebaited']
['jebaited']
['pepehands']
['jebaited', 'clap']
['bye', 'chat', ':)']
['jebaited']
['acting', 'weirdchamp', 'acting', 'weirdchamp', 'acting', 'weirdchamp', 'acting', 'weirdchamp']
['jebaited']
['lepo', 'lepo', 'pepehands']
['lepo', 'lepo']
['weirdchamp']
['biblethump', 'biblethump', 'biblethump']
['go', 'play', 'mordhau']
['keepo']
['wtff']
['jebaited', 'jebaited', 'jebaited', 'jebaited']
['feelsweirdman']
['omegalul']
['jebaited']
['krappa', ':handshake:', 'jebaited', 'krappa', ':handshake:', 'jebaited']
['lepo', 'lepo', 'pepehands']
['pepehands']
['jebaited']
['pogu', 'acting']
['jebaited']
['low', 'prio']
['weirdchamp', 'pretending']
['lepo', 'lepo', 'biblethump']
['feelsweirdman', 'feelsweirdman', 'feelsweirdman']
['lepo', 'pepehands']
['krappa']
['sir', 'stop', 'pretending', '@admiralbulldog']
['pepehands']
['!playsound', 'bulldogaaah']
['jebaited', 'jebaited']
['pretending', 'weirdchamp', 'pretending', 'weirdchamp', 'pretending', 

['lulw']
['3', 'pepega']
['\x01action', 'opieop', "here's", 'hint,', '"_"', 'opieop\x01']
['8']
['pi']
['7']
['12']
['9']
['3']
['10']
['\x01action', 'minglee', 'one', 'could', 'answer', 'trivia!', 'answer', '"6"', 'minglee.', 'since', 'useless,', 'datguy', 'gets', 'one', 'point.\x01']
['6']
['\x01action', 'pogchamp', 'new', 'question', 'begun!', 'category', '"geography",', 'question', '"what', 'capital', 'armenia?"', ':thinking_face:\x01']
['yerevan']
['\x01action', 'mytwitchidlul', 'got', 'answer', 'right!', 'answer', 'yerevan', 'feelsgoodman', 'get', '20', 'points!', 'pogchamp\x01']
['azerbaijan']
['\x01action', 'pogchamp', 'new', 'question', 'begun!', 'category', '"entertainment",', 'question', '"what', 'first', 'movie', 'feature', 'al', 'pacino', 'robert', 'deniro', 'screen', 'together?"', ':thinking_face:\x01']
['hear']
['heat']
['\x01action', 'mytwitchidlul', 'got', 'answer', 'right!', 'answer', 'heat', 'feelsgoodman', 'get', '20', 'points!', 'pogchamp\x01']
['\x01action', 'pogc

## Bag of Words and TFIDF

In [75]:
vocab_set = set()
[[vocab_set.add(token) for token in tokens] for tokens in df_channel['filtered_chat_msg']]
vocab = list(vocab_set)

In [54]:
vocab_dict = {word: i for i, word in enumerate(vocab)}

In [57]:
word_counts = np.zeros((len(docs), len(vocab)))

for doc_id, words in enumerate(my_docs):
    for word in words:
        word_id = vocab_dict[word]
        word_counts[doc_id][word_id] += 1

NameError: name 'docs' is not defined

In [76]:
flat_list = [item for sublist in df_channel['filtered_chat_msg'] for item in sublist]

In [77]:
from collections import Counter

Counter(flat_list).most_common()

[('lul', 155),
 ('bush7', 128),
 ('d:', 125),
 ('lulw', 121),
 ('pog', 65),
 ('clap', 58),
 ('keep', 58),
 ('following', 56),
 ('dreams', 48),
 ('ez', 47),
 ('pogey', 47),
 ('edison', 46),
 ('@edisonparklive', 35),
 ('<3', 34),
 ('monkaw', 33),
 ('bushcomfy', 32),
 ('omegalul', 29),
 ('kappa', 23),
 ('lol', 23),
 ('seconds', 19),
 ('raid', 18),
 ('game', 17),
 ('fusheart', 16),
 (':)', 16),
 ('bushhypers', 16),
 ('oh', 16),
 ('edisonparklive', 15),
 ('hours,', 15),
 ('kill', 15),
 ('monkas', 15),
 ('shred', 15),
 ('bye', 15),
 ('!uptime', 14),
 ('online', 14),
 ('nice', 14),
 ('pepehands', 14),
 ('go', 14),
 ('time', 14),
 ('wtf', 14),
 ('agane', 14),
 ('loves', 14),
 ('get', 13),
 ('minutes,', 13),
 ('guy', 13),
 ('dont', 12),
 ('10', 12),
 ('jebaited', 12),
 ('cocaine', 12),
 ('kangaroos.', 12),
 ('lmao', 11),
 ('hitman', 11),
 ('like', 11),
 ('good', 11),
 ('super', 11),
 ('jake', 11),
 ('lolz', 10),
 ('hide', 10),
 ('monkers', 10),
 ('hours', 10),
 ('yeah', 10),
 ('2', 10),
 ('kkoo