In [1]:
import re

In [76]:
class NBAData(object):
    """Load tweets, label them from emoticons, and score a TextBlob baseline.

    All input files are resolved by concatenating ``self.path`` with a file
    name, so ``self.path`` must end with a path separator.
    """

    # Emoticons treated as a positive signal, e.g. ":)", ":-D", "(:", "<3".
    _POS_EMOJI = re.compile(r'(:-?(?:\)+|D+))|((?:\(+)-?:<?)|(<3+)')
    # Emoticons treated as a negative signal, e.g. ":(", ">:-(", "):", "D:".
    # NOTE(review): "D:" also occurs at the end of ordinary words followed by
    # a colon (e.g. "UPDATED:"), which will be counted as negative.
    _NEG_EMOJI = re.compile(r'(>?:-?(?:\(+))|((?:D+|\)+)-?:<?)')
    # A single http(s) URL token; stops at the first whitespace character.
    _URL = re.compile(r'https?://\S+')
    # Mentions, any character that is not alphanumeric/space/tab, and URLs.
    _CLEAN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def __init__(self):
        self.path = 'data/'

    def get_processed_data(self, dataset='train'):
        """Read an already-labelled dataset file from ``self.path``.

        Each non-blank line is split at its LAST comma: the part after it is
        parsed as an integer label, and the part before it has its first and
        last character dropped (presumably surrounding quotes -- confirm
        against the files that are actually written).

        Args:
            dataset: File name inside ``self.path``.

        Returns:
            list of ``(text, label)`` tuples, in file order.

        Raises:
            ValueError: if the text after the last comma is not an integer.
        """
        data = []
        with open('{}{}'.format(self.path, dataset), 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no record;
                    # int('') would otherwise raise ValueError.
                    continue
                idx = line.rfind(',')
                data.append((line[1:idx - 1], int(line[idx + 1:])))
        return data

    def get_data(self, filename='tweets.txt'):
        """Read raw tweets, one per line, from ``self.path``.

        Args:
            filename: File name inside ``self.path``. The default matches the
                previously hard-coded ``data/tweets.txt`` when ``self.path``
                is left at ``'data/'``.

        Returns:
            list of lines with surrounding whitespace stripped. Blank lines
            are kept as empty strings.
        """
        with open('{}{}'.format(self.path, filename), 'r') as f:
            return [entry.strip() for entry in f]

    def process_data(self, process_data):
        """Strip URLs and emoticons from tweets and derive a label from them.

        A tweet scores +1 if it contains at least one positive emoticon and
        -1 if it contains at least one negative one; a tweet with both nets
        out to 0. Matched emoticons are removed from the returned text.

        Args:
            process_data: Iterable of raw tweet strings.

        Returns:
            list of ``(text, sentiment)`` tuples with sentiment in {-1, 0, 1}.
        """
        data = []
        for entry in process_data:
            # Remove only the URL token itself. Matching to end-of-line would
            # also throw away any text or emoticon that follows a link.
            entry = self._URL.sub('', entry)
            sentiment = 0
            # Positive emoticons are removed first; the negative pattern is
            # then applied to the already-reduced text.
            entry, pos_hits = self._POS_EMOJI.subn('', entry)
            if pos_hits:
                sentiment += 1
            entry, neg_hits = self._NEG_EMOJI.subn('', entry)
            if neg_hits:
                sentiment -= 1
            data.append((entry, sentiment))
        return data

    def analyze_sentiment(self, data, verbose=True):
        """Compare the baseline prediction with each tweet's label.

        Prints a one-line accuracy summary (integer percentage).

        Args:
            data: Iterable of ``(tweet, sentiment)`` pairs.
            verbose: When true (the default), print every pair as it is
                processed.

        Returns:
            list whose first element is the header tuple
            ``('tweet', 'exp_sentiment', 'sentiment')`` followed by one
            ``(tweet, baseline_prediction, label)`` tuple per mismatch.
        """
        correct = 0
        attempts = 0
        incorrect = [('tweet', 'exp_sentiment', 'sentiment')]
        for line in data:
            if verbose:
                print(line)
            tweet, sentiment = line
            exp_sentiment = self.get_tweet_sentiment_baseline(tweet)
            if exp_sentiment == sentiment:
                correct += 1
            else:
                incorrect.append((tweet, exp_sentiment, sentiment))
            attempts += 1
        # Empty input: report 0% rather than dividing by zero.
        percent = correct * 100 // attempts if attempts else 0
        print('{}% successful: {} correct, {} attempts'.format(percent, correct, attempts))
        return incorrect

    def get_tweet_sentiment_baseline(self, tweet):
        """Classify a tweet as 1, -1 or 0 from the sign of TextBlob's polarity.

        The tweet is passed through ``clean_tweet`` before analysis.
        """
        # Imported here so the rest of the class is usable without textblob.
        from textblob import TextBlob
        polarity = TextBlob(self.clean_tweet(tweet)).sentiment.polarity
        if polarity > 0:
            return 1
        if polarity < 0:
            return -1
        return 0

    def clean_tweet(self, tweet):
        """Replace mentions, URLs and non-alphanumerics with spaces, then
        collapse whitespace runs into single spaces."""
        return ' '.join(self._CLEAN.sub(' ', tweet).split())
    

In [77]:
# Pipeline object; it decides where the input files live.
model = NBAData()

# Raw tweets, one string per input line.
data = model.get_data()

# Pair every tweet with its emoticon-derived sentiment label.
train_data = model.process_data(data)

# Score the baseline, then show the header row plus every mismatch.
misclassified = model.analyze_sentiment(train_data)
print(misclassified)

('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
('A special victory indeed. Well done, #TeamIndia! Super achievement @chetrisunil11 on your 100th appearance and the 2 goals  #INDvKEN', 1)
("i'm going to school sad, confused and scared  brb", -1)
("i'm going to school sad, confused and scared  brb", -1)
("i'm going to school sad, confused and scar