## Examples

### Tokenizing the data
* Processing language to make it easier for the machine to understand
* Splits strings into smaller pieces called tokens 

In [1]:
# download Training Data
from nltk.corpus import twitter_samples

# used to filter tokens for meaningless words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# tags each token with its part of speech (noun, verb, adjective, ...) based on its context in the sentence
from nltk.tag import pos_tag

# groups different forms of a word into one token (e.g. different verb conjugations, plurals)
from nltk.stem.wordnet import WordNetLemmatizer

# helps determine the most common words
from nltk import FreqDist

# used to shuffle dataset for model
import random

# used to strip unwanted patterns (via regular expressions) and punctuation from tokens
import re, string

# building and testing the model
from nltk import NaiveBayesClassifier  # algorithm which classifies a tweet as positive or negative
from nltk import classify  # used to determine accuracy of classifier

In [2]:
# viewing json files inside twitter_samples data
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [3]:
# display the raw JSON data of one example tweet
import json
for tweet in twitter_samples.docs("positive_tweets.json"):
    print(json.dumps(tweet, indent=2))
    break

{
  "contributors": null,
  "coordinates": null,
  "text": "#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)",
  "user": {
    "time_zone": "Paris",
    "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/784477066/7a8d261ef8d27f2bdf08fadac65bea7b.jpeg",
    "geo_enabled": true,
    "profile_image_url_https": "https://pbs.twimg.com/profile_images/567331322830413825/bqH6u2DO_normal.jpeg",
    "url": "http://t.co/iY2ZZAJY1Y",
    "profile_text_color": "000000",
    "entities": {
      "url": {
        "urls": [
          {
            "url": "http://t.co/iY2ZZAJY1Y",
            "indices": [
              0,
              22
            ],
            "expanded_url": "http://www.international.cci-paris-idf.fr",
            "display_url": "international.cci-paris-idf.fr"
          }
        ]
      },
      "description": {
        "urls": [
          {
            "url": "http://t.co/wGg73YM5yh",
 

In [None]:
# converting tweets to string for easier processing (not used in script)
twitter_samples.strings("positive_tweets.json")[0]

In [None]:
# tokenizing tweets for easier processing
twitter_samples.tokenized("positive_tweets.json")[0]

In [None]:
# lemmatizing a sentence: normalizing tweet by combining like words (Ex: being=be, members=member)
def lemmatize_sentence(tokens):
    """Return the lemmatized form of every token in a tokenized sentence.

    Each token is tagged with ``pos_tag``; the tag is reduced to the
    part-of-speech code passed to ``WordNetLemmatizer.lemmatize``:
    tags starting with ``NN`` become ``'n'``, tags starting with ``VB``
    become ``'v'``, and every other tag becomes ``'a'``.

    Args:
        tokens: the tokens of one sentence, as accepted by ``pos_tag``.

    Returns:
        A list with one lemmatized word per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Collapse the tagger's tag into the code handed to the lemmatizer.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(word, wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

print(lemmatize_sentence(positive_tokens[0]))

In [None]:
# tagging tokens with part-of-speech tags (example of the tags used during cleaning)
pos_tag(positive_tokens[0])

In [None]:
# removes all undesired tokens (links, @mentions, punctuation, stop words) and lemmatizes the rest
def clean_data(tweet_tokens, stop_words=()):
    """Clean one tokenized tweet into a list of lowercase, lemmatized tokens.

    For every token, in order:
      1. hyperlinks and @mentions are stripped with regular expressions;
      2. the token is lemmatized, using a part-of-speech code derived from
         its ``pos_tag`` tag (``NN*`` -> ``'n'``, ``VB*`` -> ``'v'``,
         anything else -> ``'a'``);
      3. the token is kept only if it is non-empty, is not found in
         ``string.punctuation``, and its lowercase form is not in
         ``stop_words``.

    Args:
        tweet_tokens: the tokens of a single tweet, as accepted by ``pos_tag``.
        stop_words: collection of lowercase words to drop. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        A list of the surviving tokens, lowercased, in their original order.
    """
    # Raw strings keep the regex backslashes literal instead of relying on
    # Python passing unknown escapes such as "\(" through (which warns).
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # One lemmatizer for the whole tweet rather than a new one per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # removing unwanted patterns (links, @mentions) from the token
        token = re.sub(url_pattern, "", token)
        token = re.sub(mention_pattern, "", token)

        # assigning the part-of-speech code expected by the lemmatizer
        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"

        # lemmatizing tokens (running=run)
        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # dropping empty tokens, punctuation and stop words.
        # Note: `in string.punctuation` is a substring test, so any token that
        # appears as a contiguous run inside string.punctuation is dropped too.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [None]:
# words converted to lower case, punctuation and links removed. Text is also lemmatized
# NOTE(review): positive_tokens is not defined in the cells shown here — confirm it is
# created (e.g. from twitter_samples.tokenized) before this cell runs.
cleaned_tokens = clean_data(positive_tokens[0], stop_words)
print(f"Original Tweet: {positive_tokens[0]}")
print(f"Cleaned Example: {cleaned_tokens}")

In [None]:
# Determining word counts

# compile all cleaned words into one large distribution for analysis
def compile_keywords(cleaned_tokens_list):
    """Lazily flatten a collection of token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: an iterable whose items are themselves
            iterables of tokens (one inner iterable per tweet).

    Yields:
        Every token of every inner iterable, in order.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

# combining all tokens into one iterable object (generator)
# NOTE(review): positive_cleaned_tokens_list / negative_cleaned_tokens_list are not
# defined in the cells shown here — confirm they are built before this cell runs.
all_positive_words = compile_keywords(positive_cleaned_tokens_list)
all_negative_words = compile_keywords(negative_cleaned_tokens_list)

# find the most common words using frequency distribution
# (compile_keywords returns a generator, so each stream can only be iterated once;
# call compile_keywords again if the words are needed a second time)
freq_dist_pos = FreqDist(all_positive_words)
freq_dist_neg = FreqDist(all_negative_words)
# show the five most frequent tokens of each class with their counts
print(f"Positive: {freq_dist_pos.most_common(5)}")
print(f"Negative: {freq_dist_neg.most_common(5)}")