Load data before pre-process

In [None]:
import pandas as pd

# Load the tab-separated dump; malformed rows are skipped instead of aborting the read.
read_options = dict(sep='\t', encoding='latin-1', header=None)
try:
    # Spelling understood by pandas >= 1.3 (and the only one accepted from 2.0 on).
    df = pd.read_csv('full_text.txt', on_bad_lines='skip', **read_options)
except TypeError:
    # Older pandas releases reject the unknown keyword; use the legacy flag there.
    df = pd.read_csv('full_text.txt', error_bad_lines=False, **read_options)

# drop unused columns (the first three positional columns)
df = df.drop(columns=[0, 1, 2])

# rename columns
df.columns = ['lat', 'lon', 'text']

# keep only the first row for every (lat, lon) pair, then renumber the rows
df = df.drop_duplicates(subset=['lat', 'lon']).reset_index(drop=True)

# `data` is the working copy that the cleaning steps modify; `df` keeps the raw text
data = df.copy()

# select 10k samples
#data = df.sample(n = 10000, random_state = 100, axis=0, replace=False).reset_index(drop = True)

Process data

In [None]:
# lower casing
#data['text'] = data['text'].str.lower()
# removes emoticons
#data['text'] = data['text'].astype(str).apply(lambda x: " ".join(re.sub(r':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', x) for x in x.split()))

# Every pattern below is a regular expression, so regex=True is passed explicitly:
# pandas >= 2.0 treats str.replace patterns as literal text by default, which would
# silently turn these steps into no-ops. Raw strings avoid invalid-escape warnings.

# remove unicode strings: escaped \uXXXX sequences first, then any non-ASCII character
data['text'] = data['text'].str.replace(r"(\\u[0-9A-Fa-f]+)", "", regex=True)
data['text'] = data['text'].astype(str).str.replace(r"[^\x00-\x7f]", "", regex=True)

# convert any url to URL
data['text'] = data['text'].str.replace(r"((www\.[^\s]+)|(https?://[^\s]+))", "URL", regex=True)

# remove @Username and the retweet marker RT
data["text"] = data["text"].str.replace(r"@[^\s]+", "", regex=True)
# \b keeps the match to the standalone token, so words such as "START" stay intact
data["text"] = data["text"].str.replace(r"\bRT\b", "", regex=True)

# replace everything except letters and '#' (punctuation, digits, ...) with a space
data["text"] = data["text"].str.replace(r"[^a-zA-Z#]", " ", regex=True)

# remove duplicate character: a lowercase letter repeated four or more times in a row
# is deleted entirely (the whole run is replaced by an empty string)
data["text"] = data["text"].str.replace(r"([a-z])(\1{3,})", "", regex=True)

# collapse whitespace runs (newlines included) to one space, then drop a leading space
data["text"] = data["text"].str.replace(r"[\s]+", " ", regex=True)
data["text"] = data["text"].str.replace(r"^[\s]", "", regex=True)

# keep text having length more than 5
data = data[data['text'].str.len() > 5]

# drop every text that occurs more than once (keep=False removes all copies, not just the extras)
data = data.drop_duplicates(subset='text', keep=False)

# reset index
data = data.reset_index(drop=True)

# save dataframe to local
data.to_csv('dataset/cleaned.csv', sep=',', index=False)
#data = pd.read_csv('dataset/geo.csv')

# Check tweet length

# character count of every tweet; values are stringified first, so a missing
# text counts as the 3 characters of 'nan'
text_lengths = df['text'].astype(str).str.len()
tweet_length = text_lengths.tolist()

# average length of tweet, taken over all tweets measured above (before filtering)
avg_tweet_length = sum(tweet_length) / len(tweet_length) if tweet_length else 0.0

# keep only tweets longer than 15 characters; the boolean mask selects rows in one step
# and .copy() lets later cells add columns to df without chained-assignment warnings
df = df[text_lengths > 15].copy()

Calculate sentiment polarity

In [None]:
'''
vader
'''

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# a single analyzer instance shared by every call to calculate_vader
analyser = SentimentIntensityAnalyzer()


def calculate_vader(sentence):
    """Return the 'compound' entry of the VADER polarity scores for `sentence`.

    The input is converted with str() before it is scored.
    """
    return analyser.polarity_scores(str(sentence))['compound']


# one compound score per cleaned tweet
data['vader_polarity'] = data['text'].apply(calculate_vader)

'''
textblob
'''

from textblob import TextBlob

def calculate_blob(sentence):
    """Return TextBlob's sentiment polarity for `sentence` (converted with str() first)."""
    return TextBlob(str(sentence)).sentiment.polarity


# one TextBlob polarity per cleaned tweet
data['blob_polarity'] = data['text'].apply(calculate_blob)

'''
senticnet
'''

from senticnet.senticnet import SenticNet

sn = SenticNet()

def calculate_sentic(sentence):
    """Return SenticNet's polarity intensity for `sentence`.

    The argument is converted with str() and passed to `sn.polarity_intense`;
    previously the function ignored it and always scored the fixed text
    'i love you'.

    NOTE(review): SenticNet presumably looks entries up by concept key, so an
    arbitrary sentence may not be found - confirm against the senticnet docs.
    """
    return sn.polarity_intense(str(sentence))

# scratch check of the phrase-level API on a fixed example
from sentic import SenticPhrase
sp = SenticPhrase('i love u')
sp.get_polarity()

'''
watsons
'''

from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, SentimentOptions

import os

# The API key is read from the environment so it is never stored in the notebook;
# an unset variable gives the same empty string the notebook shipped with.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    iam_apikey=os.environ.get('WATSON_IAM_APIKEY', ''),
    # Base service URL only: the SDK adds the '/v1/analyze' path and the version
    # query itself, so including them here produces a malformed request URL.
    url='https://gateway.watsonplatform.net/natural-language-understanding/api'
)

def Sentiment_score(input_text):
    """Return the document-level sentiment score Watson reports for `input_text`.

    `input_text` may be a sentence, paragraph or document; it is converted with
    str() before being sent. The score is read from
    result['sentiment']['document']['score'] (between -1 and 1 according to the
    original author's note).
    """
    text_to_score = str(input_text)
    wanted = Features(sentiment=SentimentOptions())
    reply = natural_language_understanding.analyze(text=text_to_score, features=wanted)
    result = reply.get_result()
    return result.get('sentiment').get('document').get('score')

# Score every tweet exactly once. Series.apply returns a column aligned with df's
# index, so no per-row chained assignment is needed, and each tweet costs one API
# request instead of the two the earlier loop-plus-apply version issued.
df['watson_score'] = df['text'].apply(Sentiment_score)

def w_s(sen):
    """Return Watson's document-level sentiment score for `sen`.

    `sen` is converted with str() and sent to the analyze endpoint; the score is
    read from result['sentiment']['document']['score']. The function scores its
    own argument - it previously read the unrelated global loop variable `t`.
    """
    response = natural_language_understanding.analyze(text = str(sen),features = Features(sentiment=SentimentOptions())).get_result()
    score = response.get('sentiment').get('document').get('score')
    return score


'''
polyglot
'''

from polyglot.text import Text

def calculate_polyglot(sentence):
    """Return polyglot's polarity for `sentence` (converted with str() first)."""
    return Text(str(sentence)).polarity

def qc(sentence):
    """Return a (polarity, language code) pair computed by polyglot for `sentence`.

    The input is converted with str() before it is wrapped in polyglot's Text.
    """
    parsed = Text(str(sentence))
    polarity = parsed.polarity
    return polarity, parsed.language.code

# Compute the polyglot polarity once per tweet. The column was previously blanked
# and refilled row by row through chained `.loc` assignment, which repeats the work
# and is not guaranteed to write back into df.
df['polyglot_score'] = df['text'].apply(calculate_polyglot)

# drop rows whose TextBlob polarity is exactly 0
# ('blob_polarity' is the column written by calculate_blob; the boolean mask removes
#  all such rows in one step and needs neither a progress bar nor per-row drops)
data = data[data['blob_polarity'] != 0].copy()

Label sentiment

In [None]:
def classify_asentiment(polarity): # label not balanced
    """Map a polarity score to a class id using +/-0.5 cut-offs.

    Returns 2 (positive) for polarity >= 0.5, 0 (negative) for polarity <= -0.5
    and 1 (neutral) for everything else.
    """
    if polarity >= 0.5:
        return 2  # positive
    if polarity <= -0.5:
        return 0  # negative
    return 1  # neutral

def classify_sentiment(polarity):
    """Map a polarity score to a class id by its sign.

    Returns 2 (positive) for polarity > 0, 0 (negative) for polarity < 0 and
    1 (neutral) otherwise.
    """
    if polarity > 0:
        return 2  # positive
    if polarity < 0:
        return 0  # negative
    return 1  # neutral
    
# turn the VADER polarity into a sentiment class id and append it to each dataframe
data['sentiment'] = data['vader_polarity'].apply(classify_sentiment)

# Only `data` is scored with VADER earlier in the notebook, so compute the polarity
# for `df` here when the column is missing instead of failing with a KeyError.
if 'vader_polarity' not in df.columns:
    df['vader_polarity'] = df['text'].apply(calculate_vader)
df['sentiment'] = df['vader_polarity'].apply(classify_sentiment)

Location processing

In [None]:

'''
Nearby entities - Geonames
'''

from tqdm import tqdm
import requests, json

GEONAMES_URL = "http://api.geonames.org/findNearbyJSON"
GEONAMES_USERNAME = ''  # GeoNames account name; blank in the notebook, fill in before running


def fetch_nearby_feature_names(lat, lon, radius_km=0.3, username=GEONAMES_USERNAME):
    """Return the 'fcodeName' of every GeoNames feature reported near (lat, lon).

    Parameters
    ----------
    lat, lon : coordinates, sent as their str() form.
    radius_km : search radius in kilometres (0.3 matches the original request).
    username : GeoNames account name sent with the request.

    Returns
    -------
    list of str; empty when the reply carries no 'geonames' list. Entries
    without an 'fcodeName' field are skipped.
    """
    reply = requests.get(
        GEONAMES_URL,
        params={'lat': str(lat), 'lng': str(lon), 'radius': radius_km, 'username': username},
        timeout=30,  # do not let one stalled request hang the whole loop
    )
    # a distinct name keeps the JSON reply from overwriting the `data` dataframe
    payload = json.loads(reply.text)
    return [place['fcodeName'] for place in payload.get('geonames', []) if 'fcodeName' in place]


# create new column in dataframe; rows outside the batches below keep ''
df['geonames'] = ''

# the rows are processed in two positional batches
for start, stop in ((10500, 10750), (10750, 11000)):
    batch = df.iloc[start:stop]
    for label, lat, lon in tqdm(zip(batch.index, batch['lat'], batch['lon']), total=len(batch)):
        # .at addresses the row by its index label, which is what batch.index yields;
        # the labels are not positions once rows have been filtered out of df
        df.at[label, 'geonames'] = fetch_nearby_feature_names(lat, lon)

df.to_csv('neu.csv', sep=',', index=False)

'''
Nearby entities - Google 
'''
from tqdm import tqdm
import requests, json

API_KEY = ''  # Google Places API key; blank in the notebook, supply your own

# create new column in dataframe
df['google'] = ''

# loop through index label, lat, lon from dataframe
for label, lat, lon in tqdm(zip(df.index, df['lat'], df['lon']), total=len(df)):
    # request nearby places; radius is in meters (m)
    reply = requests.get(
        "https://maps.googleapis.com/maps/api/place/nearbysearch/json",
        params={'location': str(lat) + "," + str(lon), 'radius': 300, 'key': API_KEY},
        timeout=30,  # do not let one stalled request hang the whole loop
    )
    # a distinct name keeps the JSON reply from overwriting the `data` dataframe
    payload = json.loads(reply.text)
    # The Places reply lists its matches under 'results' (not 'geonames'); keep the
    # first type of each place and skip places that list no type at all.
    first_types = [place['types'][0] for place in payload.get('results', []) if place.get('types')]
    # write into the 'google' column created above, addressing the row by index label
    df.at[label, 'google'] = first_types
        
# clean location
import re  # needed by the cleaning below; imported here so the cell runs on a fresh kernel


def _clean_location(raw):
    """Strip list punctuation from the string form of a geonames cell.

    The value is split on whitespace; from every token the characters
    '[', ']' and "'" are removed, then one leading comma is removed. Tokens
    that end up empty are dropped and the rest are joined with single spaces.
    """
    kept = []
    for token in str(raw).split():
        token = re.sub(r"\[|'|]", '', token)
        token = re.sub(r"^,", '', token)
        if token:
            kept.append(token)
    return " ".join(kept)


df['geonames'] = df['geonames'].astype(str).apply(_clean_location)

Location processing with Ego Network 1.0

In [None]:
'''
network density = number of actual ties / (number of nodes * (number of nodes - 1) / 2)
closeness centrality = 1 / farness, where farness is the sum of distances to all other nodes
degree centrality = number of ties that touch a node
betweenness centrality = number of times a node lies on the path between two other nodes
'''

def ego_network_density(categories):
    """Return the density of a tweet's ego network.

    `categories` is a comma-separated string; every item is one node tied to
    the tweet, so the network has (count + 1) nodes and `count` actual ties.

        density = ties / (nodes * (nodes - 1) / 2)
                = count / ((count + 1) * count / 2)

    The parentheses around (count + 1) matter: without them the expression
    reduces to count / (1.5 * count), i.e. the constant 2/3 for every input.
    An empty string still counts as one category because ''.split(',')
    yields [''].
    """
    categories_count = len(categories.split(','))
    node_count = categories_count + 1  # + 1 is the tweet itself
    possible_ties = node_count * (node_count - 1) / 2
    density = categories_count / possible_ties
    return density

def ego_network_closeness(categories):
    """Return 1 divided by the number of comma-separated items in `categories`.

    An empty string counts as one item, giving 1.0.
    """
    item_total = categories.count(',') + 1  # same as len(categories.split(','))
    return 1 / item_total

def ego_network_degree(categories):
    """Return the number of comma-separated items in `categories`.

    An empty string counts as one item.
    """
    return categories.count(',') + 1  # same as len(categories.split(','))

# derive each ego-network column from the cleaned 'geonames' string
for column_name, metric in (
    ('ego_density', ego_network_density),
    ('ego_closeness', ego_network_closeness),
    ('ego_degree', ego_network_degree),
):
    df[column_name] = df['geonames'].apply(metric)

Categories count

In [None]:
import pandas as pd

df = pd.read_csv('microblog.csv')

# one indicator column per distinct category found in the comma-separated strings
loc = df.geonames.str.get_dummies(',')
place = list(loc)

# integer counters, one row per category and one column per sentiment class
loc2 = pd.DataFrame(0, index=place, columns=['positive', 'neutral', 'negative'])

# sentiment class id -> counter column
sentiment_columns = {0: 'negative', 1: 'neutral', 2: 'positive'}

for raw_categories, sentiment in zip(df['geonames'], df['sentiment']):
    column = sentiment_columns.get(sentiment)
    # an empty geonames cell is read back from the CSV as NaN (a float), which has
    # nothing to split; rows with an unknown sentiment id are ignored
    if column is None or not isinstance(raw_categories, str):
        continue
    for sub in raw_categories.split(','):
        # get_dummies creates no column for empty tokens, so they have no counter row
        if sub:
            # a single .loc[row, column] write updates loc2 itself; chained
            # loc2[column][row] indexing may update a temporary copy instead
            loc2.loc[sub, column] += 1