In [1]:
import json
import googlemaps # This had to be installed
import twitter # This should already be installed (but was additional)   
import urllib.parse as urllib
import pandas as pd
import nltk
import numpy as np
import itertools as itr
import re
import string
import random
import time
import datetime
import csv
import math
from twitter import *

In [2]:
# Import Bokeh Packages
from bokeh.layouts import row, column, widgetbox, layout
from bokeh.models.widgets import Button, TextInput, Select, Div, DataTable, TableColumn, NumberFormatter, Panel, Tabs
from bokeh.models import HoverTool, ColumnDataSource, GMapOptions
from bokeh.plotting import show, figure, gmap
from bokeh.io import show, push_notebook, output_notebook, reset_output
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.models.tiles import WMTSTileSource
from bokeh.document import Document
from bokeh.io import output_file, show
from bokeh.models.widgets import CheckboxGroup

In [3]:
# --- Notebook-wide configuration -------------------------------------------
# Path of the JSON credentials file; it is passed to getKeys() to build `keys`.
CREDFILE = 'OAuth_Keys.json'
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
GOOGLE_MAPS_API_URL = 'http://maps.googleapis.com/maps/api/geocode/json'
# processQuery() stops paging once it has issued this many search requests.
RATE_LIMIT = 25
# NOTE(review): the constants below (NO_FETCHES .. APP_HEIGHT) are not referenced
# in the visible notebook; NO_FETCHES is a string, not an int — confirm intent.
NO_FETCHES="5"
LOADING_IMAGE = 'data/Wedges-3s-200px.gif'
SAMPLEFILE = 'data/sentiment_sample.json'
DEFAULT_GEO = 'Syracuse, NY'
PADDING = 0.1
APP_WIDTH = 650
APP_HEIGHT = 700
# Module-level search box. NOTE(review): manageUi() builds and displays its own
# local `location` widget, so this instance does not appear to be shown — confirm.
location = TextInput(value="Syracuse", title='Search Location:',sizing_mode='scale_width')
# Emotion labels; a position in this list is the index used by the checkbox group.
emoList = ['joy','fear','anger','sadness','disgust','shame','guilt','neutral']
# Shared checkbox widget (all eight emotions ticked initially); read by update()
# and displayed by manageUi().
checkbox_group = CheckboxGroup(labels=emoList, active=[0, 1,2,3,4,5,6,7])

In [4]:
import pandas as pd
import numpy as np
from copy import copy
import pickle
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def build_dataset(df,emotion):
    """Return a copy of ``df`` relabelled for one-vs-rest training.

    Every row whose ``Emotion`` differs from ``emotion`` is relabelled as
    ``'no' + emotion``; rows that match keep their label. The resulting
    label counts are printed. The frame passed in is not modified.
    """
    relabelled = copy(df)
    negative_label = 'no'+ emotion
    other_rows = relabelled.Emotion != emotion
    relabelled.loc[other_rows, 'Emotion'] = negative_label
    label_counts = relabelled.Emotion.value_counts()
    print(label_counts)
    return relabelled

def stem_data(text_series):
    """Fit a stemming count vectorizer on ``text_series``.

    Returns a ``(counts, vectorizer)`` pair: the matrix produced by
    ``fit_transform`` and the fitted vectorizer, so the same vocabulary can
    later be applied to new text with ``vectorizer.transform``.
    """
    english_stemmer = SnowballStemmer("english", ignore_stopwords=True)

    # CountVectorizer variant whose analyzer stems each token it emits.
    class StemmedCountVectorizer(CountVectorizer):
        def build_analyzer(self):
            base_analyzer = super().build_analyzer()

            def stemmed_analyzer(doc):
                return [english_stemmer.stem(token) for token in base_analyzer(doc)]

            return stemmed_analyzer

    vectorizer = StemmedCountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(text_series)
    return counts, vectorizer

def tfidf_transform(X_counts):
    """Fit a ``TfidfTransformer`` on ``X_counts`` and apply it.

    Prints the shape of the transformed matrix and returns
    ``(transformed_matrix, fitted_transformer)``.
    """
    transformer = TfidfTransformer()
    weighted = transformer.fit_transform(X_counts)
    print(weighted.shape)
    return weighted, transformer

def over_sample(y, X_tfidf):
    """Resample ``(X_tfidf, y)`` with SMOTE using a fixed ``random_state`` of 42.

    Prints a ``{label: count}`` dictionary for the resampled labels and
    returns ``(X_resampled, y_resampled)``.
    """
    sampler = SMOTE(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(X_tfidf, y)
    labels, label_counts = np.unique(y_balanced, return_counts=True)
    print(dict(zip(labels, label_counts)))
    return X_balanced, y_balanced

def classify(X, y):
    """Fit a 100-tree ``RandomForestClassifier`` on ``(X, y)`` and return it."""
    print("Training the model")
    forest = RandomForestClassifier(n_estimators = 100)
    # fit() returns the estimator itself, so this is the fitted model.
    return forest.fit(X, y)


In [5]:
import os.path
def train_models(trainAll):
    """Fit the text features and (re)train the per-emotion models.

    The vectorizer and TF-IDF transformer are always fitted on the ``Text``
    column of ``DATA.csv``. For each emotion a model is trained and pickled
    to its ``model_<emotion>.pkl`` file when that file does not exist yet, or
    for every emotion when ``trainAll`` is truthy.

    Returns ``(stemmed_count_vect, tfidf_transformer)``, the fitted feature
    pipeline needed to transform new text for the saved models.
    """
    emotion_range = {'joy': 'model_joy.pkl', 'fear': 'model_fear.pkl', 'anger': 'model_anger.pkl', 'sadness': 'model_sadness.pkl',
                    'disgust': 'model_disgust.pkl', 'shame': 'model_shame.pkl', 'guilt': 'model_guilt.pkl'}
    df = pd.read_csv('DATA.csv', dtype='str')

    X_counts, stemmed_count_vect = stem_data(df.Text)
    X_tfidf, tfidf_transformer = tfidf_transform(X_counts)

    for emo in emotion_range:
        filename = emotion_range[emo]
        already_trained = os.path.isfile(filename)
        if already_trained and not trainAll:
            # A saved model exists and no full retrain was requested.
            continue
        emotion_df = build_dataset(df, emo)
        X, y = over_sample(emotion_df.Emotion, X_tfidf)
        model = classify(X, y)
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

    return stemmed_count_vect, tfidf_transformer

In [6]:
#main classification function which is called by the ui function
def emotion_Classifier(dataset):
    """Label every tweet in ``dataset`` with an emotion and count the labels.

    ``dataset`` must have a ``text`` column. An ``emotion`` column is inserted
    at position 2 of the frame passed in (the frame is modified in place) and
    filled row by row. Each of the seven saved one-vs-rest models votes on
    the tweet; the positive label with the highest probability wins, and the
    tweet is labelled ``'neutral'`` when every model predicts its negative
    (``'no<emotion>'``) class.

    Returns ``(result_dict, dataset)`` where ``result_dict`` maps each emotion
    (including ``'neutral'``) to the number of tweets given that label.
    """
    emotion_range = {'joy': 'model_joy.pkl', 'fear': 'model_fear.pkl', 'anger': 'model_anger.pkl', 'sadness': 'model_sadness.pkl',
                        'disgust': 'model_disgust.pkl', 'shame': 'model_shame.pkl', 'guilt': 'model_guilt.pkl'}
    result_dict = {'joy':0, 'fear':0, 'anger':0, 'sadness':0, 'disgust':0, 'shame':0, 'guilt':0, 'neutral':0}
    stemmed_count_vect, tfidf_transformer = train_models(False)

    # Patterns for @mentions and #hashtags, which are stripped before vectorizing.
    regex_tags = [r'@\w+',r'']
    regex_hashtag = [r'#\w+',r'']
    dataset.insert(2, 'emotion', value = None)

    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by train_models() on a trusted machine.
    pickle_model = {}
    for emo, filename in emotion_range.items():
        with open(filename, 'rb') as file:
            pickle_model[emo] = pickle.load(file)

    def _positive_score(label, scores):
        # Negative ('no<emotion>') predictions never win the vote.
        return 0 if label.startswith('no') else scores[label]

    for index, row in dataset.iterrows():
        cleaned_text = re.sub(regex_tags[0], regex_tags[1], row.text)
        cleaned_text = re.sub(regex_hashtag[0], regex_hashtag[1], cleaned_text)
        X_counts = stemmed_count_vect.transform(pd.Series(cleaned_text))
        X_features = tfidf_transformer.transform(X_counts)

        prediction_probabilities = {}
        for emo, model in pickle_model.items():
            prob = model.predict_proba(X_features)
            pred = model.predict(X_features)
            prediction_probabilities[pred[0]] = np.max(prob)

        result = max(prediction_probabilities,
                     key=lambda label: _positive_score(label, prediction_probabilities))
        if result.startswith('no'):
            result = 'neutral'
        result_dict[result] += 1
        # Write through .loc with both row and column: the previous
        # `dataset.loc[index].emotion = result` assigned to a temporary row
        # object, so the frame itself was not reliably updated.
        dataset.loc[index, 'emotion'] = result
    return result_dict, dataset

In [None]:
# Helper function that loads the API keys from the authentication file
def getKeys(filename):
    """Parse the JSON file at ``filename`` and return its decoded content."""
    with open(filename,'r') as key_file:
        return json.load(key_file)

In [None]:
# Load the credentials once at module level; initGoogle(), initTwitter() and
# plotGeo() read their API keys from this `keys` mapping.
keys = getKeys(CREDFILE)

In [7]:
def initGoogle():
    """Return a ``googlemaps.Client`` built with ``keys['GoogleKey']``."""
    return googlemaps.Client(key=keys['GoogleKey'])

In [8]:
def initTwitter():
    """Return a ``twitter.Api`` client built from the loaded credentials.

    Reads ``Key``, ``SKey``, ``Token`` and ``SToken`` from the module-level
    ``keys`` mapping and passes ``sleep_on_rate_limit=True`` to the client.
    """
    # The body previously mixed 5- and 4-space indentation, which Python
    # rejects with an IndentationError; it is now consistently 4 spaces.
    api = twitter.Api(consumer_key = keys['Key'],
                      consumer_secret = keys['SKey'],
                      access_token_key = keys['Token'],
                      access_token_secret = keys['SToken'],
                      sleep_on_rate_limit=True)

    return api

In [9]:
#update of ui
def manageUi(pieChart,graph,table):
    """Assemble the controls and result tabs and display them.

    Parameters
    ----------
    pieChart : Bokeh object shown in the "Pie chart" tab.
    graph : Bokeh object shown in the "map" tab; when it is ``None`` a
        message saying no tweet had coordinates is shown instead.
    table : Bokeh object shown in the "Table" tab.

    The layout is a column of controls (location box, "Process Tweets"
    button, the shared emotion checkboxes, "Process Emotions" button) next to
    the three tabs. Only the "Process Emotions" button gets a click handler
    (``update``); no handler is attached to "Process Tweets" here.
    """
    # Identity check: `is None` is the idiomatic singleton comparison and
    # cannot be affected by a custom __eq__ on the object.
    if graph is None:
        graph = Div(text='<div align="center" style="display:block"><h2>None of the tweets have location coordinates enabled</h2><br><br><br></div>', width=500)

    tab1 = Panel(child=graph, title="map")
    tab2 = Panel(child=pieChart, title="Pie chart")
    tab3 = Panel(child=table, title="Table")
    tabs = Tabs(tabs=[ tab1, tab2,tab3 ])
    location = TextInput(value="Syracuse", title='Search Location:',sizing_mode='scale_width')
    submit = Button(label='Process Tweets', button_type='success')
    processEmoB = Button(label='Process Emotions', button_type='success')
    processEmoB.on_click(update)
    show(row(column(location,submit,widgetbox(checkbox_group),processEmoB),tabs))


In [10]:
def initializeUi():
    """Configure Bokeh output and run the first full refresh of the app."""
    # Clear any previous output configuration FIRST, then select the target
    # file. In the old order reset_output() ran after output_file() and
    # discarded the "Working.html" setting that had just been made.
    reset_output()
    output_file("Working.html")
    update()

In [11]:
def rawQuery(lat, long, radius):
    """Return ``[lat, long, radius + "mi"]``; ``radius`` must be a string."""
    radius_in_miles = radius+"mi"
    return [lat, long, radius_in_miles]
#this helper method returns the geocode
def geocode(loc, api):
    """Look up ``loc`` with ``api.geocode`` and return the first hit's location.

    Returns ``result[0]['geometry']['location']`` of the response, or
    ``None`` when the response is empty/falsy.
    """
    matches = api.geocode(loc)
    if not matches:
        return None
    return matches[0]['geometry']['location']


#table creation to output emotion:tweet
def tableCreation(emoDataDic,value):
    """Build a two-column (Tweet, Emotion) table for the selected emotions.

    Parameters
    ----------
    emoDataDic : mapping of emotion name to a DataFrame with ``text`` and
        ``emotion`` columns.
    value : iterable of integer positions into the emotion list
        (0 = joy ... 7 = neutral) selecting which emotions to include.

    Returns the ``DataTable`` wrapped in a ``widgetbox``.
    """
    emoList = ['joy','fear','anger','sadness','disgust','shame','guilt','neutral']
    # Work on a copy: the previous implementation emptied the caller's list
    # with pop(0), silently destroying the selection that was passed in.
    selected = list(value)
    tweetList = []
    emotionList = []
    for position in selected:
        emotion_frame = emoDataDic[emoList[position]]
        tweetList.extend(emotion_frame["text"].tolist())
        emotionList.extend(emotion_frame["emotion"].tolist())
    data = dict(Tweet=tweetList,Emotion=emotionList)
    source = ColumnDataSource(data)
    columns = [
        TableColumn(field="Tweet", title="Tweet"),
        TableColumn(field="Emotion", title="Emotion"),]
    data_table = DataTable(source=source, columns=columns, width=1100,fit_columns=False,height=500)
    return widgetbox(data_table)

#process the twitter query
def processQuery(api, raw):
    """Run the search ``raw`` and follow its result pages.

    Calls ``api.GetSearch`` repeatedly, collecting every JSON response, and
    stops when a response has no ``next_results`` entry in its
    ``search_metadata`` or once ``RATE_LIMIT`` requests have been made.
    Returns the list of raw responses.
    """
    tweets = []
    fetch_count = 0
    while True:
        results = api.GetSearch(raw_query = raw, return_json=True)
        fetch_count += 1
        tweets.append(results)

        metadata = results['search_metadata']
        has_next_page = 'next_results' in metadata
        if has_next_page:
            # Drop the leading character of next_results, then rebuild the
            # query so that the '&q=' part comes first.
            next_query = metadata['next_results']
            pieces = next_query[1:].split('&q=')
            raw = '&q=' + pieces[1] + '&' + pieces[0]

        limit_reached = fetch_count == RATE_LIMIT
        if limit_reached or not has_next_page:
            break

    return tweets
# Extract the tweet text and geo data from the raw Twitter search output
def extractData(results):
    """Flatten raw search responses into a DataFrame of tweets.

    ``results`` is a sequence of responses, each with a ``statuses`` list.
    For every status the ``text`` field is used when present, otherwise
    ``full_text``; the ``geo`` field is copied as-is. Returns a DataFrame with
    the columns ``text`` and ``geo``.
    """
    labels = ['text','geo']
    records = []
    for page in results:
        for status in page['statuses']:
            if 'text' in status:
                text = status['text']
            else:
                text = status['full_text']
            records.append([text, status['geo']])
    return pd.DataFrame.from_records(records, columns = labels)

#plotting the piechart
def piechart(emoPieChart):
    """Build a Bokeh pie chart from an ``{emotion: count}`` mapping.

    Each wedge's angle is the emotion's share of the total count times
    ``2*pi``; colours come from the ``Category20c`` palette sized to the
    number of entries. Returns the figure.
    """
    counts = pd.Series(emoPieChart)
    data = counts.reset_index(name='value')
    data = data.rename(columns={'index':'emotion'})
    total = data['value'].sum()
    data['angle'] = data['value']/total * 2*pi
    data['color'] = Category20c[len(emoPieChart)]

    plot = figure(plot_height=350, title="Pie Chart", toolbar_location=None,
                tools="hover", tooltips="@emotion: @value")
    # Wedges are laid end to end: each starts where the running angle total
    # of the previous ones stops.
    wedge_start = cumsum('angle', include_zero=True)
    wedge_end = cumsum('angle')
    plot.wedge(x=0, y=1, radius=0.4,
                start_angle=wedge_start, end_angle=wedge_end,
                line_color="white", fill_color='color', legend='emotion', source=data)
    return plot
    

def plotGeo(dataset,latLong):
    """Split tweets by emotion and plot the geotagged ones on a Google map.

    Parameters
    ----------
    dataset : DataFrame with ``emotion`` and ``geo`` columns; non-empty
        ``geo`` values are expected to hold a ``'coordinates'`` pair that is
        read as ``[latitude, longitude]``.
    latLong : mapping with ``'lat'`` and ``'lng'`` used to centre the map.

    Returns ``(emoPieChart, emoDataDict, sMap)``: the tweet count per emotion,
    the per-emotion sub-frames, and the map figure (returned even when no
    tweet had coordinates).
    """
    map_options = GMapOptions(lat=latLong["lat"], lng=latLong["lng"], map_type="roadmap", zoom=11)
    sMap = gmap(keys['GoogleKey'], map_options, title="Map")
    emotionList = ["joy","fear","anger","sadness","disgust","shame", "guilt","neutral"]
    emoColourDict = {"joy":"red","fear":"blue","anger":"black","sadness":"red","disgust":"green","shame":"grey", "guilt":"red","neutral":"blue"}
    emoPieChart ={}   # emotion -> number of tweets
    emoDataDict ={}   # emotion -> rows of `dataset` with that emotion
    for emotion in emotionList:
        emoDataDict[emotion] = dataset[dataset['emotion'] == emotion]
        emoPieChart[emotion] = len(emoDataDict[emotion])

    for emotion in emotionList:
        # Keep only tweets that actually carry a geo entry.
        geotagged = list(filter(None, emoDataDict[emotion]['geo']))
        if not geotagged:
            continue
        # Fresh coordinate lists per emotion. The lists used to be shared
        # across the loop, so each later emotion's layer re-plotted every
        # earlier emotion's points in its own colour.
        latList = [geo['coordinates'][0] for geo in geotagged]
        longList = [geo['coordinates'][1] for geo in geotagged]
        source = ColumnDataSource(data=dict(lat=latList, lon=longList))
        sMap.circle(x="lon", y="lat", size=15, fill_color=emoColourDict[emotion], fill_alpha=0.5, source=source)
    return emoPieChart,emoDataDict,sMap

In [12]:
from math import pi

import pandas as pd
from bokeh.io import output_file, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum
from bokeh.models import Button
from bokeh.plotting import figure, curdoc

def update():
    """Fetch tweets, classify them and rebuild the whole UI.

    Steps: geocode the search location, query Twitter for recent English
    tweets within 5 miles of it, classify every tweet's emotion, then build
    the map, pie chart and table (the table is limited to the emotions
    currently ticked in ``checkbox_group``) and display them via ``manageUi``.

    Raises
    ------
    ValueError
        If the search location cannot be geocoded.
    """
    loc = "syracuse"
    # Snapshot the ticked checkbox positions as a plain list. The previous
    # `while len(value) != 0: emotions.append(value)` loop never shrank
    # `value`, so it spun forever whenever any checkbox was active.
    emotions = list(checkbox_group.active)
    tAPI = initTwitter()
    gAPI = initGoogle()
    latlong = geocode(loc, gAPI)
    if latlong is None:
        # geocode() returns None when nothing matched; fail with a clear
        # message instead of a TypeError on the subscripts below.
        raise ValueError("Could not geocode location: " + loc)
    raw='q=geocode%3A' + str(latlong['lat']) + '%2C' + str(latlong['lng']) + '%2C' + "5mi" + '&lang=en&result_type=recent&include_entities=true&count=50&tweet_mode=extended'
    results = processQuery(tAPI, raw)
    dataset = extractData(results)
    # NOTE(review): purpose of this pause is not evident from the notebook — confirm it is needed.
    time.sleep(5)
    result_dict, dataset = emotion_Classifier(dataset)
    # Named geo_map rather than `map` to avoid shadowing the builtin.
    emoPieChart, emoDataDict, geo_map = plotGeo(dataset, latlong)
    pieChartVar = piechart(emoPieChart)
    tableVar = tableCreation(emoDataDict, emotions)
    manageUi(pieChartVar, geo_map, tableVar)




In [None]:
# Entry point: configures Bokeh output and runs the first update(), which
# fetches tweets, classifies them and displays the UI.
initializeUi() 
