## NLP Basics Tutorial
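* Based on the DigitalOcean tutorial [How To Work with Language Data in Python 3 using the Natural Language Toolkit (NLTK)](https://www.digitalocean.com/community/tutorials/how-to-work-with-language-data-in-python-3-using-the-natural-language-toolkit-nltk)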

In [1]:
import nltk

In [3]:
# Download the "twitter_samples" corpus, which is read further down via nltk.corpus.twitter_samples
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/loganbon/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [5]:
# Download the part-of-speech (POS) tagger model.
# A POS tagger labels each word with a grammatical tag: noun, verb, adjective, adverb, etc.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/loganbon/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [10]:
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

In [7]:
# List the JSON files available in the twitter_samples corpus
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [8]:
# Preview the tweet texts stored in one of the corpus JSON files.
# Only the first 10 are displayed: showing the full list floods the notebook output.
twitter_samples.strings("tweets.20150430-223406.json")[:10]

['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP',
 'VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY',
 'RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…',
 'RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1',
 "RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants.  http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…",
 'RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2',
 'RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?!  #bbcqt vote @Conservatives',
 "RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",
 "RT

## Analysis

In [15]:
# Load the positive tweets both as raw strings and as lists of tokens
# NOTE(review): positive_tweets_raw is not used by any of the cells visible here
positive_tweets_raw = twitter_samples.strings("positive_tweets.json")
positive_tweets_tokenized = twitter_samples.tokenized("positive_tweets.json")
# Tag each token with its part of speech (e.g. JJ = adjective, NN = singular noun)
tagged_tweets = pos_tag_sents(positive_tweets_tokenized)
# Show the (token, tag) pairs of the first tweet as a quick sanity check
tagged_tweets[0]

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

In [20]:
# Tally adjectives and singular nouns across all tagged tweets.
# Only the exact tags 'JJ' and 'NN' are matched; related tags such as
# 'NNS' or 'NNP' are not included in either total.
jj_count = nn_count = 0
for tagged_tweet in tagged_tweets:
    # Keep just the tag (second element) of every (token, tag) pair in this tweet
    tags = [token_tag[1] for token_tag in tagged_tweet]
    jj_count += tags.count('JJ')
    nn_count += tags.count('NN')

In [21]:
# Number of tokens tagged 'JJ' (adjective) across the positive tweets
jj_count

6092

In [22]:
# Number of tokens tagged 'NN' (singular noun) across the positive tweets
nn_count

13181