In [1]:
import json
import xlwt
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob
import nltk

In [2]:
# Verbose-mode pattern for simple western emoticons such as ":-)", "=D", ";P"
# or ":[".  It is compiled with re.VERBOSE, so the inline "#" comments and the
# layout whitespace below are ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth (both bracket directions, slashes, D, O, P)
    )"""

# Alternatives are tried in list order at each position, so the specific
# patterns (emoticons, tags, mentions, hashtags, URLs) must stay ahead of the
# generic word / single-character fallbacks.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags ("#" escaped because of re.VERBOSE)
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# tokens_re: a single capturing group around every alternative, so findall()
# returns the matched token strings.
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
# emoticon_re: anchored variant used to test whether a whole token is an emoticon.
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

# Tokenizer used by preprocess().
def tokenize(s):
    """Split the string ``s`` into tokens.

    Returns the list produced by running ``findall`` with the module-level
    ``tokens_re`` pattern over ``s``.
    """
    return re.findall(tokens_re, s)

# Tokens discarded by removeStopWords(): NLTK's English stop-word list, every
# character of string.punctuation, and the literal tokens 'rt' and 'via'.
punctuation = [char for char in string.punctuation]
stop = [*stopwords.words('english'), *punctuation, 'rt', 'via']

# Stop-word filter used by preprocess().
def removeStopWords(s):
    """Return the items of ``s`` that are not in the module-level ``stop`` list.

    ``s`` is an iterable of tokens; the relative order of the kept tokens is
    preserved and the result is always a new list.
    """
    kept = []
    for token in s:
        if token in stop:
            continue
        kept.append(token)
    return kept

# Single Porter stemmer instance shared by every stemming() call.
ps = nltk.PorterStemmer()
def stemming(s):
    """Return a list holding ``ps.stem(token)`` for each token of ``s``, in order."""
    return list(map(ps.stem, s))

# Full text-normalisation pipeline.
def preprocess(s, lowercase=False):
    """Tokenize ``s``, drop stop words, stem, and re-join with single spaces.

    Args:
        s: text to normalise; it is passed straight to ``tokenize``.
        lowercase: when true, every token is lowercased before stop-word
            removal, except tokens that ``emoticon_re`` recognises.

    Returns:
        The stemmed, stop-word-free tokens joined by a single space.
    """
    tokens = tokenize(s)
    if lowercase:
        lowered = []
        for token in tokens:
            # Emoticons keep their case so that ":D" does not turn into ":d".
            lowered.append(token if emoticon_re.search(token) else token.lower())
        tokens = lowered
    return " ".join(stemming(removeStopWords(tokens)))

# Sentiment labelling.
def sentiment_value(s):
    """Label the text ``s`` by the sign of its TextBlob polarity score.

    Returns:
        'positive' when the polarity is above zero, 'negative' when it is
        below zero, and 'neutral' otherwise.

    NOTE: the positive label used to be misspelled 'postive'; any consumer
    that matches on the old spelling must be updated.
    """
    # Build and analyse the blob once rather than once per comparison.
    polarity = TextBlob(s).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'
    
# Excel export.
def write_to_excel(df, path='labelled_tweets.xlsx', sheet_name='sheet2'):
    """Write ``df`` to one sheet of an Excel workbook.

    Args:
        df: DataFrame to export.
        path: destination workbook; defaults to 'labelled_tweets.xlsx'.
        sheet_name: sheet to write into; defaults to 'sheet2'.

    The writer is used as a context manager so the workbook is saved and the
    file handle closed even if ``to_excel`` raises. This also replaces the
    ``writer.save()`` call, which no longer exists in pandas 2.x.
    """
    with pd.ExcelWriter(path) as writer:
        # sheet_name is passed by keyword: positional use is deprecated.
        df.to_excel(writer, sheet_name=sheet_name)


In [3]:
# Load the line-delimited JSON tweet dump and keep only the tweet text.
df = pd.read_json('async_data_v2.json', lines=True)
dataset = df[['text']].copy()

# Normalised text: lowercased, stop words removed, Porter-stemmed.
dataset['processed_text'] = dataset['text'].apply(lambda text: preprocess(text, lowercase=True))

# Score sentiment on the ORIGINAL tweet text, not on processed_text.
# preprocess() strips stop words (NLTK's English list includes negations such
# as "not") and stems the rest into forms like "analysi" / "everyon".
# NOTE(review): TextBlob's default analyzer appears to match surface word
# forms, so scoring the stemmed text labels almost every row 'neutral' -
# confirm against the TextBlob docs.
dataset['sentiment'] = dataset['text'].apply(sentiment_value)

write_to_excel(dataset)
print(dataset.head(10))

                                                text  \
0  RT @ok_grow: GraphQL Schema Design: Building E...   
1  RT @SennoGroup: Sentiment Analysis for Everyon...   
2  RT @SennoGroup: Sentiment Analysis for Everyon...   
3  RT @SennoGroup: Sentiment Analysis for Everyon...   
4  RT @CrowdConscious: The team at @binance are s...   
5  RT @SMACCloud: You don't need to leave the #of...   
6  RT @SennoGroup: Sentiment Analysis for Everyon...   
7  RT @SennoGroup: Sentiment Analysis for Everyon...   
8  Exposing #AWS #Lambda #functions with the #API...   
9  RT @CrowdConscious: The team at @binance are s...   

                                      processed_text sentiment  
0  @ok_grow graphql schema design build evolv sch...   neutral  
1  @sennogroup sentiment analysi everyon » https:...   neutral  
2  @sennogroup sentiment analysi everyon » https:...   neutral  
3  @sennogroup sentiment analysi everyon » https:...   neutral  
4  @crowdconsci team @binanc shake thing #trade c...   neu