## Importing the necessary libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
#import sys
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from pprint import pprint
import os
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
import json
#from sklearn.decomposition import PCA, FastICA
#n_comp=10
from sklearn.utils import shuffle
import time



## Reading the training data using pandas

In [2]:
columns=["Sentiment","Unknown","Date and time","Query","Tweeter","Tweet"]
train=pd.read_csv("training.csv", names=columns , encoding='latin-1')
train.head()

Unnamed: 0,Sentiment,Unknown,Date and time,Query,Tweeter,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Value uptil 799999 are negative and rest are positive

In [3]:
train=train.drop(["Unknown","Date and time","Query","Tweeter"],axis=1)
print(train.iloc[799999]['Sentiment'])
print(train.iloc[800000]['Sentiment'])

0
4


In [4]:
train.head()

Unnamed: 0,Sentiment,Tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Language Processing
Using regular expressions to remove unnecessary expressiona and spaces. NLTK stemmer and stopwords to remove and merge certain words. The countvectorizer generates features based on the presence and absence of a word.
### INCREASE VALUE OF "count" VARIABLE TO INCREASE TRAINING DATASET.

In [5]:
count=50000

kickdesc_negative = pd.Series(train.iloc[0:count,:]['Tweet'].tolist()).astype(str)
kickdesc_positive=pd.Series(train.iloc[800000:800000+count,:]['Tweet'].tolist()).astype(str)
# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower
def desc_clean(word):
    p1 = re.sub(pattern='(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    p1 = p1.lower()
    return p1
kickdesc_positive = kickdesc_positive.map(desc_clean)
kickdesc_negative = kickdesc_negative.map(desc_clean)

stop = set(stopwords.words('english'))

kickdesc_positive = [[x for x in x.split() if x not in stop] for x in kickdesc_positive]
kickdesc_negative = [[x for x in x.split() if x not in stop] for x in kickdesc_negative]

stemmer = SnowballStemmer(language='english')

kickdesc_positive = [[stemmer.stem(x) for x in x] for x in kickdesc_positive]
kickdesc_negative = [[stemmer.stem(x) for x in x] for x in kickdesc_negative]

kickdesc_positive = [[x for x in x if len(x) > 2] for x in kickdesc_positive]
kickdesc_negative = [[x for x in x if len(x) > 2] for x in kickdesc_negative]

kickdesc_positive = [' '.join(x) for x in kickdesc_positive]
kickdesc_negative = [' '.join(x) for x in kickdesc_negative]

alldesc=kickdesc_negative+kickdesc_positive

cv = CountVectorizer(max_features=100)

#alldesc_positive = cv.fit_transform(kickdesc_positive).todense()
#alldesc_negative = cv.fit_transform(kickdesc_negative).todense()
alldesc = cv.fit_transform(alldesc).todense()
combine = pd.DataFrame(alldesc)

combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)
combine.head()

Unnamed: 0,variable_0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,...,variable_90,variable_91,variable_92,variable_93,variable_94,variable_95,variable_96,variable_97,variable_98,variable_99
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Shuffling and converting dataset to binary format

In [6]:
list_=[]
for c in range(0,count):
    list_.append(0)
for c in range(0,count):
    list_.append(1)
train=pd.DataFrame()
train["Sentiment"]=list_
train=pd.concat([train,combine],axis=1)

train = shuffle(train)
train.head()

Unnamed: 0,Sentiment,variable_0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,...,variable_90,variable_91,variable_92,variable_93,variable_94,variable_95,variable_96,variable_97,variable_98,variable_99
38735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50451,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98518,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59451,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


## XGboost
Using Gradient boosting framework for training first for early stopping rounds and then training with full dataset.

In [7]:
y_train = train["Sentiment"]
print("Training Started")

dtrain = xgb.DMatrix(train.drop('Sentiment', axis=1).iloc[0:int(count*0.8),:], y_train.iloc[0:int(count*0.8)])
dtrain_all= xgb.DMatrix(train.drop('Sentiment', axis=1), y_train)
dval = xgb.DMatrix(train.drop('Sentiment', axis=1).iloc[int(count*0.8):,:], y_train.iloc[int(count*0.8):])
xgb_params = {
    'eta': 0.005,
    'max_depth': 12,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'silent': 1
}

partial_model = xgb.train(xgb_params, dtrain, num_boost_round=3000, evals=[(dval, 'val')],
                      early_stopping_rounds=20, verbose_eval=20)
num_boost_round = partial_model.best_iteration

model = xgb.train(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round)

Training Started
[0]	val-rmse:0.499793
Will train until val-rmse hasn't improved in 20 rounds.
[20]	val-rmse:0.495553
[40]	val-rmse:0.491975
[60]	val-rmse:0.488699
[80]	val-rmse:0.485955
[100]	val-rmse:0.483328
[120]	val-rmse:0.480956
[140]	val-rmse:0.478887
[160]	val-rmse:0.477023
[180]	val-rmse:0.475343
[200]	val-rmse:0.473853
[220]	val-rmse:0.472498
[240]	val-rmse:0.471258
[260]	val-rmse:0.470136
[280]	val-rmse:0.469108
[300]	val-rmse:0.468166
[320]	val-rmse:0.467275
[340]	val-rmse:0.466475
[360]	val-rmse:0.465741
[380]	val-rmse:0.465052
[400]	val-rmse:0.464426
[420]	val-rmse:0.46382
[440]	val-rmse:0.463256
[460]	val-rmse:0.462719
[480]	val-rmse:0.462219
[500]	val-rmse:0.461749
[520]	val-rmse:0.461294
[540]	val-rmse:0.460869
[560]	val-rmse:0.460483
[580]	val-rmse:0.460125
[600]	val-rmse:0.459773
[620]	val-rmse:0.45943
[640]	val-rmse:0.459128
[660]	val-rmse:0.458822
[680]	val-rmse:0.458543
[700]	val-rmse:0.458275
[720]	val-rmse:0.458016
[740]	val-rmse:0.457783
[760]	val-rmse:0.457547

## Function to extract Sentiment of twitter Data and Class handling the input of data from tweepy.

In [8]:
def sentiment(text):
    temporary=text
    text = text.map(desc_clean)
    text = [[x for x in x.split() if x not in stop] for x in text]
    text = [[stemmer.stem(x) for x in x] for x in text]
    text = [[x for x in x if len(x) > 2] for x in text]
    text = [' '.join(x) for x in text]
    text = cv.fit_transform(text).todense()
    combine_1 = pd.DataFrame(text)
    combine_1.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)
    test=xgb.DMatrix(combine_1)
    preds=model.predict(test)
    y_pred=[]
    for c in preds:
        if(c>=0.5):
            y_pred.append("Positive")
        else:
            y_pred.append("Negative")
    i=0
    while(i<len(temporary)):
        print(temporary[i],"   ",y_pred[i],"  ",preds[i],"\n\n")
        i+=1

class StdOutListener(StreamListener):

    tweet_number=0   # class variable

    def __init__(self,max_tweets):
        self.max_tweets=max_tweets # max number of tweets

    def on_data(self, data):
        self.tweet_number+=1   
        try:
            decoded = json.loads(data)
            #print(decoded['text'].encode('latin1', 'ignore'))
            A.append(decoded['text'].encode('latin1', 'ignore'))
            #sentiment([decoded['text'].encode('latin1', 'ignore')])
            #print(decoded['text'].encode('latin1', 'ignore'))
            
        except BaseException:
            print('Error')
            pass
        except IncompleteRead:
        # Oh well, reconnect and keep trucking
            print('Error')
            pass
        except ProtocolError:
        # Oh well, reconnect and keep trucking
            print('Error')
            pass
        except KeyboardInterrupt:
        # Or however you want to exit this loop
            stream.disconnect()
        if self.tweet_number>=self.max_tweets:
            #sys.exit('Limit of '+str(self.max_tweets)+' tweets reached.')
            if(len(A)>=self.max_tweets):
                return False
    def on_error(self, status):
        print ("Error " + str(status))
        if status == 420:
            print("Rate Limited")
            return False

## Using Tweepy
Get consumer key,consumer_key_secret,accces_token and accent_token_secret from twitter developer page.
The values vary as :- 0 is negative and 1 is postive and intermediary values have mixed emotions.

In [9]:
consumer_key=""
consumer_secret=""

access_token= ""
access_token_secret= ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

A=[]
i=0
while(i<4):
    A=[]
    l = StdOutListener(20)
    stream = Stream(auth, l)
    stream.filter(languages=["en"],track=["a", "the","you"])
    sentiment(pd.Series(A).astype(str))
    time.sleep(40)
    i+=1

b'@Josewittaph Hi I have a question'     Positive    0.750285 


b'RT @syeddoha: Ro Nay San Lwin, if you are lecturing #Bangladesh, please leave the country as no one is interested in your message '     Positive    0.710377 


b'I love you dad https://t.co/YEEPPW0E9N'     Positive    0.892878 


b'@mindingthegaap Like, I\'m pretty sure it\'s like "haha, look, a sausage pizza! We\'re having a--"\n\nNo.\n\nWHY MUST WE GENDER EVERYTHING'     Negative    0.229331 


b'RT @hemlockspidey: one year and nine episodes later: \n- went from an asshole to a mother of four \n- nancy doesnt deserve him \n- had o '     Positive    0.787309 


b'When the teacher born and raised IE https://t.co/jijsLXqmpJ'     Positive    0.516939 


b'RT @rallystarters: @exoticgamora @TheSWPrincess @debbiesideris @SpockResists @MrScottLads @NatCookResists @Alyssa_Milano @Havok_2017 '     Negative    0.166474 


b'RT @ICHRI: #Iran\'s Intelligence Ministry is "inviting" Sunni politicians to hotels and tell them to stop 