### Packages Required

In [1]:
import csv
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

### Data Preprocessing

In [2]:
def processTweet(tweet):
    """Normalize a raw tweet string ahead of feature extraction.

    Steps, in order:
      1. lowercase the whole text;
      2. replace ``www.*`` and ``http(s)://*`` links with the token ``URL``;
      3. replace ``@username`` mentions with the token ``AT_USER``;
      4. collapse every run of whitespace (newlines included) to one space;
      5. rewrite ``#tag`` as ``HASHtag`` (the hashtag text is kept, prefixed);
      6. delete the characters ``.``, ``!``, ``:``, ``?``, ``-``, single and
         double quotes, backslash and forward slash.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The normalized tweet. Leading/trailing spaces are NOT removed, so a
        line that ended with a newline comes back with one trailing space.
    """
    # Lowercasing happens before the placeholders are inserted, so URL,
    # AT_USER and HASH remain upper-case in the result.
    tweet = tweet.lower()
    # Raw-string patterns: `\.` and `\s` are invalid escapes in ordinary
    # string literals and trigger warnings on current Python versions.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #  #nike ---> HASHnike
    tweet = re.sub(r'#([^\s]+)', r'HASH\1', tweet)
    # The matched characters are deleted, not replaced by a space, so text
    # such as "a...b" becomes "ab".
    tweet = re.sub(r'[\.!:\?\-\'\"\\/]', r'', tweet)
    # Kept from the original: strips quote characters only. The substitution
    # above has already removed every quote, so this leaves the text as is.
    tweet = tweet.strip('\'"')
    return tweet
#end

# Read the sample tweets line by line and print each one after normalization.
# The `with` block closes the file even if processTweet raises part-way through.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
with open('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/SampleTweets.txt', 'r') as fp:
    # Iterating the file yields one line (split on \n) at a time until EOF.
    for line in fp:
        processedTweet = processTweet(line)
        print(processedTweet)
#end loop

AT_USER hey cici sweetheart just wanted to let u know i luv u oh and will the mixtape drop soon fantasy ride may 5th 
 
AT_USER i heard about that contest congrats girl URL 
 
unc HASHncaa champs franklin st i was there wild and crazy nothing like itever URL 
 
do you share more HASHjokes HASHquotes HASHmusic HASHphotos or HASHnews HASHarticles on HASHfacebook or HASHtwitter 
 
good night HASHtwitter and HASHthelegionofthefallen 545am cimes awfully early 
 
i just finished a 266 mi run with a pace of 1114mi with nike+ gps HASHnikeplus HASHmakeitcount 
 
disappointing day attended a car boot sale to raise some funds for the sanctuary, made a total of 88p after the entry fee  sigh 
 
 
 
no more taking irish car bombs with strange australian women who can drink like rockstarsmy head hurts 
 
just had some bloodwork done my arm arm arm arm hurts


### Feature Vector

In [3]:
# Start with an empty stop-word list; it is filled from the stop-word file further down.
stopWords = list()

#start replaceTwoOrMore
def replaceTwoOrMore(s):
    """Shorten any character repeated three or more times in a row to two.

    For example ``"sooooo"`` becomes ``"soo"``; runs of one or two identical
    characters are left untouched. ``re.DOTALL`` lets ``.`` match newlines, so
    runs of newline characters are shortened as well.

    Args:
        s: Input string.

    Returns:
        The string with every run of 3+ identical characters cut down to 2.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", s, flags=re.DOTALL)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    """Build the stop-word list from a text file with one word per line.

    The list always starts with the placeholder tokens ``'AT_USER'`` and
    ``'URL'``, followed by every line of the file with surrounding whitespace
    stripped. Blank lines are kept and show up as empty-string entries.

    Args:
        stopWordListFileName: Path of the stop-word file.

    Returns:
        list[str]: the placeholder tokens followed by the file's words, in
        file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    stopWords = ['AT_USER', 'URL']
    # The context manager guarantees the handle is closed even if reading fails.
    with open(stopWordListFileName, 'r') as fp:
        stopWords.extend(line.strip() for line in fp)
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet,stopWords):
    """Turn a processed tweet into its list of feature words.

    The tweet is split on whitespace. Each token has runs of repeated
    characters shortened by ``replaceTwoOrMore`` and the characters
    ``' " ? , .`` stripped from both ends. A token is kept only if it is not
    in ``stopWords`` and consists solely of ASCII letters/digits beginning
    with a letter; kept tokens are lowercased.

    Args:
        tweet: Tweet text, split on whitespace.
        stopWords: Collection of words to drop (exact, case-sensitive match).

    Returns:
        list[str]: the surviving tokens in original order, duplicates kept.
    """
    keptWords = []
    for rawToken in tweet.split():
        token = replaceTwoOrMore(rawToken).strip('\'"?,.')
        # Drop stop words.
        if token in stopWords:
            continue
        # Drop anything that is not purely alphanumeric starting with a letter.
        if re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", token) is None:
            continue
        keptWords.append(token.lower())
    return keptWords
#end

# Read the sample tweets one by one, normalize each and print its feature vector.
# The original also opened the stop-word file into an unused handle (`st`) that
# was never closed; getStopWordList opens the file itself, so that handle is gone.
stopWords = getStopWordList('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/StopWords.txt')

# The `with` block closes the tweet file even if processing raises.
with open('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/SampleTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = processTweet(line)
        featureVector = getFeatureVector(processedTweet,stopWords)
        print(featureVector)
#end loop

['hey', 'cici', 'sweetheart', 'luv', 'oh', 'mixtape', 'drop', 'soon', 'fantasy', 'ride']
[]
['heard', 'contest', 'congrats', 'girl']
[]
['unc', 'hashncaa', 'champs', 'franklin', 'st', 'wild', 'crazy', 'itever']
[]
['share', 'hashjokes', 'hashquotes', 'hashmusic', 'hashphotos', 'hashnews', 'hasharticles', 'hashfacebook', 'hashtwitter']
[]
['night', 'hashtwitter', 'hashthelegionofthefallen', 'cimes', 'awfully']
[]
['finished', 'mi', 'run', 'pace', 'gps', 'hashnikeplus', 'hashmakeitcount']
[]
['disappointing', 'day', 'attended', 'car', 'boot', 'sale', 'raise', 'funds', 'sanctuary', 'total', 'entry', 'fee', 'sigh']
[]
[]
[]
['taking', 'irish', 'car', 'bombs', 'strange', 'australian', 'women', 'drink', 'rockstarsmy', 'head', 'hurts']
[]
['bloodwork', 'arm', 'arm', 'arm', 'arm', 'hurts']


In [4]:
# For a bigger training dataset
# Read the labelled tweets one by one and store (featureVector, sentiment) pairs.
tweets = []
# newline='' is what the csv module asks for on file objects; the `with` block
# closes the file once every row has been consumed.
with open('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/SampleTrainingData.csv', 'r', newline='') as trainingFile:
    inpTweets = csv.reader(trainingFile, delimiter=',')
    for row in inpTweets:
        # Blank or truncated rows have no label/text pair to use.
        if len(row) < 2:
            continue
        sentiment = row[0]
        tweet = row[1]
        processedTweet = processTweet(tweet)
        featureVector = getFeatureVector(processedTweet,stopWords)
        tweets.append((featureVector, sentiment))
#end loop

In [5]:
def extract_features(tweet, feature_words=None):
    """Map a tweet's words onto a boolean bag-of-words feature dict.

    Args:
        tweet: Iterable of words belonging to one tweet.
        feature_words: Optional iterable of vocabulary words. When omitted,
            the module-level ``featureList`` is used, so existing one-argument
            callers behave exactly as before.

    Returns:
        dict[str, bool]: one ``'contains(<word>)'`` key per vocabulary word,
        True when that word occurs in ``tweet``.

    Raises:
        NameError: if ``feature_words`` is omitted and ``featureList`` has not
            been defined yet.
    """
    if feature_words is None:
        feature_words = featureList
    # Set membership keeps each per-word check constant time.
    tweet_words = set(tweet)
    return {'contains(%s)' % word: (word in tweet_words) for word in feature_words}
#end

### Bulk Extraction of Features

In [6]:
# Read the labelled training tweets one by one, collecting both the per-tweet
# feature vectors (tweets) and the corpus-wide vocabulary (featureList).
stopWords = getStopWordList('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/StopWords.txt')
featureList = []

# Get tweet words
tweets = []
# NOTE(review): quotechar='|' differs from the default quoting used when this
# same file is read earlier in the notebook, and the predictions printed below
# carry literal double quotes (e.g. "positive"). That suggests the file is
# double-quote delimited, in which case labels keep their quotes and tweet text
# is cut at its first comma. Confirm against the data before changing it.
# newline='' is what the csv module asks for on file objects; the `with` block
# closes the file once every row has been consumed.
with open('C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Datasets/SampleTrainingData.csv', 'r', newline='') as trainingFile:
    inpTweets = csv.reader(trainingFile, delimiter=',', quotechar='|')
    for row in inpTweets:
        # Blank or truncated rows have no label/text pair to use.
        if len(row) < 2:
            continue
        sentiment = row[0]
        tweet = row[1]
        processedTweet = processTweet(tweet)
        featureVector = getFeatureVector(processedTweet, stopWords)
        featureList.extend(featureVector)
        tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates; sorting makes the vocabulary order the same on
# every run (plain set iteration order for strings varies between interpreter runs).
featureList = sorted(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)

In [7]:
# Display the training set returned by apply_features (bare expression -> cell output).
training_set



### Naive Bayes Classifier

**-- Train**

In [8]:
# Train a Naive Bayes classifier on training_set.
nb_class = NaiveBayesClassifier.train(training_set)

In [10]:
#Pickle
# Save the trained classifier to disk; the `with` block closes the file even
# if pickle.dump raises.
with open("C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Pickle/NBC_Save.pickle","wb") as pickle_out:
    pickle.dump(nb_class, pickle_out)

In [11]:
# Reload the saved classifier. The original never closed this handle; the
# `with` block closes it as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
with open("C:/Users/Nithin/Downloads/Twitter sentiment analysis _ML_Text minning/Pickle/NBC_Save.pickle","rb") as pickle_in:
    NBClassifier = pickle.load(pickle_in)

**-- Test**

In [12]:
# Hand-written sample texts (test_1 .. test_8) that the cells below pass
# through processTweet -> getFeatureVector -> extract_features -> classify.
# The strings are classifier inputs and are left exactly as written.
test_1 = "I am so glad to use this service. Extremely satisfied and happy with @gateway"
test_2 = "pathetic service by @jetairways. Seat belts aren't proper!"
test_3 = 'Pathetic staff, worse service. Never flying with #AirIndianaJones'
test_4 = "I bought onepluse6t product on 1st Nov and today my phone cameras are not working . Please consider my experience with OnePlus before you buying this product."
test_5 = "Awsome Service No Food was there Waited for two hours. Great!"
test_6 = "The experience was not very good. The servive was OK while the staff was't very great"
test_7 = "Ordered from here before also. Was very happy earlier. Recently they have increased the prices by about 50%, reduced quantity, and quality has also gone down somewhat. No longer value for money great food."
test_8 = "Wonderful"

In [13]:
# Normalize test_1, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_1)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"positive"


In [14]:
# Normalize test_2, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_2)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [15]:
# Normalize test_3, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_3)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [16]:
# Normalize test_4, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_4)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [17]:
# Normalize test_5, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_5)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [18]:
# Normalize test_6, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_6)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [19]:
# Normalize test_7, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_7)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"negative"


In [20]:
# Normalize test_8, build its feature dict, and print the predicted label.
processedTestTweet = processTweet(test_8)
print(
    NBClassifier.classify(
        extract_features(
            getFeatureVector(processedTestTweet, stopWords)
        )
    )
)

"positive"
