## Imports

In [None]:
import nltk, re, string, random
from nltk.tag import pos_tag
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Load Data

In [None]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

text = twitter_samples.strings('tweets.20150430-223406.json')
tokens = twitter_samples.tokenized('positive_tweets.json')

In [None]:
positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [None]:
tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [None]:
negative_tweets[4]

'Dang starting next week I have "work" :('

In [None]:
len(positive_tweets),len(negative_tweets)

(5000, 5000)

In [None]:
text[0] #Doesn't Classify

'RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP'

## PreProcessing

In [None]:
#POS Tagging  --> take tokenize as a argument
pos_tag(tokens[0])

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

In [None]:
#Using Lemmatizer to get the base form (e.g. running --> run)
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []

    for word,tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word,pos))
    return lemmatized_sentence

In [None]:
print('Real Word:      ',tokens[0])
print('Lemmatize Word: ',lemmatize_sentence(tokens[0]))

Real Word:       ['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
Lemmatize Word:  ['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [None]:
def remove_noise(tweet_tokens):

    stop_words = set(stopwords.words('english'))
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):

        token = re.sub('[^A-Za-z\s0-9]', '', token )

        token = re.sub('http\S+|www\.\S+', '', token)
        token = re.sub('@\w+', '', token)
        token = re.sub('\s+', ' ', token)


        if tag.startswith('NN'):
            pos = 'n'

        elif tag.startswith('VB'):
            pos = 'v'

        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:

            cleaned_tokens.append(token.lower())

    return cleaned_tokens

In [None]:
remove_noise(tokens[300])

['stats', 'day', 'arrive', '2', 'new', 'follower', 'unfollowers', 'via']

In [None]:
tokens[300]

['Stats',
 'for',
 'the',
 'day',
 'have',
 'arrived',
 '.',
 '2',
 'new',
 'followers',
 'and',
 'NO',
 'unfollowers',
 ':)',
 'via',
 'http://t.co/xxlXs6xYwe',
 '.']

In [None]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [None]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []


for tokens in positive_tweet_tokens:

    positive_cleaned_tokens_list.append(remove_noise(tokens))

for tokens in negative_tweet_tokens:

    negative_cleaned_tokens_list.append(remove_noise(tokens))

In [None]:
positive_tweet_tokens[500], positive_cleaned_tokens_list[500]

(['Dang',
  'that',
  'is',
  'some',
  'rad',
  '@AbzuGame',
  '#fanart',
  '!',
  ':D',
  'https://t.co/bI8k8tb9ht'],
 ['dang', 'rad', 'abzugame', 'fanart'])

In [None]:
def get_all_words(cleaned_tokens_list):

    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [None]:
all_pos_words

<generator object get_all_words at 0x7f5a73b66180>

In [None]:
feq_dist_pos = FreqDist(all_pos_words)
print(feq_dist_pos.most_common(10))

[('thanks', 389), ('follow', 361), ('love', 335), ('good', 283), ('get', 263), ('thank', 253), ('u', 245), ('day', 242), ('im', 237), ('like', 229)]


In [None]:
def get_tweets_for_model(cleaned_tokens_list):


    for tweet_tokens in cleaned_tokens_list:
            yield dict(  [token, True] for token in tweet_tokens  )


positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [None]:
positive_tokens_for_model

<generator object get_tweets_for_model at 0x7f5a61897e60>

In [None]:
negative_tokens_for_model

<generator object get_tweets_for_model at 0x7f5a61897840>

In [None]:
positive_dataset = [ (tweet_dict,"Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [ (tweet_dict,"Negative") for tweet_dict in negative_tokens_for_model]

In [None]:
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

In [None]:
dataset[10]

({'guevarawr': True,
  'pretty': True,
  'sure': True,
  'drown': True,
  'ice': True,
  'cream': True},
 'Negative')

## Build Model

In [None]:
train_data = dataset[:7000]
test_data = dataset[7000:]

In [None]:
classifir = NaiveBayesClassifier.train(train_data)

## Evaluate

In [None]:
classify.accuracy(classifir,test_data)

0.7516666666666667

In [None]:
classifir.show_most_informative_features(10)

Most Informative Features
                follower = True           Positi : Negati =     38.5 : 1.0
                     sad = True           Negati : Positi =     25.3 : 1.0
                      ff = True           Positi : Negati =     24.0 : 1.0
                     bam = True           Positi : Negati =     22.7 : 1.0
                  arrive = True           Positi : Negati =     19.1 : 1.0
                     pls = True           Negati : Positi =     18.6 : 1.0
                       p = True           Positi : Negati =     18.6 : 1.0
               community = True           Positi : Negati =     16.1 : 1.0
                followed = True           Negati : Positi =     15.6 : 1.0
              appreciate = True           Positi : Negati =     15.5 : 1.0


## Predict

In [None]:
random_tweet = "@omar I'm happy to connect you!"
cleaning = remove_noise(word_tokenize(random_tweet))

print(cleaning)
print(classifir.classify( dict([token, True] for token in  cleaning )))

['omar', 'happy', 'connect']
Positive


In [None]:
random_tweet = "I'm very upsate"
cleaning = remove_noise(word_tokenize(random_tweet))

print(cleaning)
print(classifir.classify( dict([token, True] for token in  cleaning )))

['upsate']
Positive
