# Moving Our Twitter Inference Towards Production

![](https://www.softwareadvice.com/resources/wp-content/uploads/The-Best-Free-Tools-for-Twitter-Sentiment-Analysis-Tile.png)

# The Question:

### What is the probability that a tweet originating from within Colombia contains at least 1 occurrence of the word "yo", with any given composition of accents and of capital and lowercase letters?

# Gather data (Extract)

In [1]:
import os
import tweepy
from tweepy import Stream

### Load credentials

In [2]:
# Read the Twitter API credentials from environment variables so the secrets
# never live in the notebook or in version control.
# NOTE: any credentials that were previously stored here as string literals
# must be treated as compromised and regenerated.
# A variable that is not set falls back to an empty string; authentication
# cannot succeed until all four variables are defined.

CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET", "")

ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN", "")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

In [3]:
# Bounding box passed to the stream's `locations` filter further down.
# Presumably ordered [west longitude, south latitude, east longitude,
# north latitude] -- TODO confirm against the Twitter streaming API docs.
COLOMBIA_GEO_LOCATION_BOUNDING_BOX = [
    -78.31,
    0.44,
    -70.71,
    11.39,
]

### Create a connection with the Twitter API

In [4]:
# Build the OAuth handler from the application (consumer) credentials, then
# attach the user access token to it.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# API client built on the handler; `auth` itself is reused later when the
# streaming connection is created.
api = tweepy.API(auth)

# Define some cleaning functions (Transform)

In [5]:
from unidecode import unidecode


def make_lowercase(tweet):
    """Return a copy of *tweet* with all cased characters lowercased."""
    lowered = tweet.lower()
    return lowered


def remove_diacritics(tweet):
    """Return *tweet* after passing it through ``unidecode``.

    ``unidecode`` transliterates the text to plain ASCII, which is how
    accented letters lose their diacritics here.
    """
    transliterated = unidecode(tweet)
    return transliterated


def remove_non_alpha_characters(tweet):
    r"""Strip everything except letters from *tweet*, keeping word boundaries.

    Letters (``str.isalpha``) are kept unchanged. Every whitespace character
    (space, newline, tab, ...) is emitted as one plain space, so words that
    were separated by a line break are not fused together: ``"hola\nyo"``
    becomes ``"hola yo"`` rather than ``"holayo"``. All other characters
    (digits, punctuation, symbols) are dropped.

    Args:
        tweet: The text to clean.

    Returns:
        The cleaned text, containing only letters and plain spaces.
    """
    kept_characters = []
    for character in tweet:
        if character.isalpha():
            kept_characters.append(character)
        elif character.isspace():
            # Normalise any kind of whitespace to a plain space instead of
            # deleting it, otherwise neighbouring words would be merged.
            kept_characters.append(' ')
    return ''.join(kept_characters)

# Load Data

In [6]:
import pyodbc # Library to manage odbc connection in this case we work with SQl SERVER

# Database parameters.
SERVER = "bielondono"
DB_NAME = "twitter_inference"
TBL_NAME = "tweets"

# ODBC connection-string template; the {SERVER} and {DB_NAME} placeholders are
# filled in with str.format wherever a connection is opened.
# Trusted_Connection can also be swapped for UID={your username};PWD={your password}
CNXN_STRING = (
    "Driver=SQL Server;"
    "Server={SERVER};"
    "Database={DB_NAME};"
    "Trusted_Connection=Yes;"
)

# Bare expression so the notebook cell displays the formatted connection string.
CNXN_STRING.format(SERVER=SERVER, DB_NAME=DB_NAME)



'Driver=SQL Server;Server=bielondono;Database=twitter_inference;Trusted_Connection=Yes;'

### Optional: create the database and table from Python

In [12]:
# (Re)create the database.
# WARNING: an existing database with the same name is dropped first, so any
# tweets stored in it are lost.
# DB_NAME and TBL_NAME are module constants, not user input; they are
# formatted into the statements because they are identifiers, not values.

cnxn_master = pyodbc.connect(CNXN_STRING.format(SERVER=SERVER, DB_NAME="master"), autocommit=True)
try:
    cursor_master = cnxn_master.cursor()
    try:
        cursor_master.execute("IF EXISTS(SELECT * FROM sys.databases WHERE [name] = '{0}') DROP DATABASE {0}".format(DB_NAME))
        cursor_master.execute("CREATE DATABASE " + DB_NAME)
    finally:
        cursor_master.close()
finally:
    # Release the connection even when one of the statements fails.
    cnxn_master.close()

print("Database created")

# (Re)create the table that stores the tweets: the tweet id and its text.

Script_Create_TBL = "CREATE TABLE {0} (id_str varchar(50), text varchar(500))".format(TBL_NAME)

cnxn_db = pyodbc.connect(CNXN_STRING.format(SERVER=SERVER, DB_NAME=DB_NAME), autocommit=True)
try:
    cursor_db = cnxn_db.cursor()
    try:
        cursor_db.execute("IF EXISTS(select* from INFORMATION_SCHEMA.TABLES WHERE [TABLE_NAME] = '{0}') DROP TABLE {0}".format(TBL_NAME))
        cursor_db.execute(Script_Create_TBL)
    finally:
        cursor_db.close()
finally:
    # Release the connection even when one of the statements fails.
    cnxn_db.close()

print("New Table Created: {0}".format(TBL_NAME))



Database created
New Table Created: tweets


## Setting up the stream listener for the Twitter API

In [15]:
from tweepy import StreamListener


class PersistedStreamListener(StreamListener):
    """Stream listener that cleans each incoming tweet and stores it in the database.

    For every status received, the text is passed through the cleaning
    functions (lowercasing, diacritic removal, non-letter removal) and the
    result is inserted, together with the tweet id, into the ``TBL_NAME``
    table.
    """

    def __init__(self):
        # One connection per listener, opened with autocommit enabled.
        self._database_connection = pyodbc.connect(
            CNXN_STRING.format(SERVER=SERVER, DB_NAME=DB_NAME), autocommit=True)
        super().__init__()

    def on_status(self, status):
        """Clean the text of *status* and persist it along with its id."""
        raw_status_text = self._extract_status_text(status)
        cleaned_status_text = self._clean_status_text(raw_status_text)
        self._insert_status(id_str=status.id_str, text=cleaned_status_text)

    @staticmethod
    def _extract_status_text(status):
        """Return the most complete text available for *status*.

        Streamed tweets longer than the classic limit arrive with a truncated
        ``text`` and carry the complete text under
        ``extended_tweet["full_text"]``; that value is preferred when present,
        otherwise ``status.text`` is returned.
        """
        extended_tweet = getattr(status, "extended_tweet", None)
        if isinstance(extended_tweet, dict):
            return extended_tweet.get("full_text") or status.text
        return status.text

    def _clean_status_text(self, status_text):
        """Apply every cleaning function, in order, to *status_text*."""
        cleaned_status_text = status_text
        for cleaning_function in self._cleaning_functions:
            cleaned_status_text = cleaning_function(cleaned_status_text)
        return cleaned_status_text

    def _insert_status(self, id_str, text):
        """Insert one ``(id_str, text)`` row into the tweets table.

        The values are bound through ``?`` parameter markers instead of being
        formatted into the SQL text, so tweet content can never alter the
        statement. Only the table name, a module constant, is formatted in.
        """
        insert_statement = "INSERT INTO {table_name} VALUES (?, ?)".format(table_name=TBL_NAME)
        cursor = self._database_connection.cursor()
        try:
            cursor.execute(insert_statement, id_str, text)
            self._database_connection.commit()
        finally:
            # Close the cursor even when the insert fails.
            cursor.close()

    @property
    def _cleaning_functions(self):
        """Cleaning functions in the order they are applied."""
        return [make_lowercase, remove_diacritics, remove_non_alpha_characters]

## Activate the persisted stream listener (Load)

In [16]:
# Create the stream with a listener that persists every received tweet.
streaming_api = Stream(auth=auth, listener=PersistedStreamListener())

# Filter the stream by location.
# The keyword is `is_async`: `async` is a reserved word since Python 3.7, so
# writing `async=True` is a SyntaxError there, and tweepy renamed the
# parameter to `is_async` for that reason (requires tweepy >= 3.7).

streaming_api.filter(locations=COLOMBIA_GEO_LOCATION_BOUNDING_BOX, is_async=True)

# Modeling data in stream

In [17]:
import numpy as np
from scipy.stats import beta as beta_distribution

# 1000 evenly spaced points strictly inside (0, 1): linspace produces 1002
# points including both endpoints, and the slice drops the first and the last.
X_VALUES = np.linspace(0, 1, 1002)[1:-1]
# Module-level connection used as the default argument of fetch_tweets.
DATABASE_CONNECTION = pyodbc.connect(CNXN_STRING.format(SERVER=SERVER,DB_NAME=DB_NAME), autocommit=True)
# Default keyword looked for by compute_alpha_and_beta.
KEYWORD ="yo"

def fetch_tweets(database_connection=DATABASE_CONNECTION):
    """Return the text of every row in the tweets table.

    Args:
        database_connection: Open database connection to read from; defaults
            to the module-level ``DATABASE_CONNECTION``.

    Returns:
        A list with the ``text`` column of each stored row.
    """
    select_statement = """SELECT text FROM {table}""".format(table=TBL_NAME)
    cursor = database_connection.cursor()
    try:
        cursor.execute(select_statement)
        return [row[0] for row in cursor.fetchall()]
    finally:
        # This function is called repeatedly, so release the cursor each time.
        cursor.close()


def compute_alpha_and_beta(tweets, keyword=KEYWORD):
    """Compute the Beta parameters from how many tweets contain *keyword*.

    A tweet counts only when *keyword* appears as a whole,
    whitespace-delimited word (or word sequence): with keyword ``"yo"`` the
    tweet ``"yo soy"`` counts, while ``"apoyo"`` and ``"mayo"`` do not. The
    comparison is literal, so *keyword* must be in the same cleaned form as
    the tweets. Rows whose text is ``None`` are treated as empty.

    Args:
        tweets: Sized collection of tweet texts.
        keyword: Word to look for.

    Returns:
        ``(alpha, beta)`` where ``alpha`` is 1 plus the number of tweets
        containing the keyword and ``beta`` is 1 plus the number of tweets
        that do not.
    """
    # Pad both sides with single spaces so a plain substring test only
    # matches at word boundaries; split/join normalises runs of whitespace.
    padded_keyword = " {} ".format(" ".join(keyword.split()))
    number_of_occurrences = sum(
        padded_keyword in " {} ".format(" ".join((tweet or "").split()))
        for tweet in tweets
    )
    alpha = 1 + number_of_occurrences
    beta = 1 + (len(tweets) - number_of_occurrences)

    return alpha, beta


def compute_pdf_y_values(alpha, beta, x_values=X_VALUES):
    """Evaluate the pdf of the Beta(alpha, beta) distribution at *x_values*."""
    frozen_distribution = beta_distribution(alpha, beta)
    pdf_values = frozen_distribution.pdf(x_values)
    return pdf_values

# Monitoring data live

In [18]:
from bokeh.client import push_session
from bokeh.models import FixedTicker
from bokeh.plotting import figure, curdoc, reset_output

# reset output
reset_output()

# initialize alpha, beta from the tweets stored so far, and the matching pdf
tweets = fetch_tweets()
alpha, beta = compute_alpha_and_beta(tweets=tweets)
pdf_y_values = compute_pdf_y_values(alpha, beta)

# create bokeh figure
bokeh_figure = figure(
    title='PDF of True Probability of a Tweet Containing Keyword',
    x_axis_label='true_probability',
    y_axis_label='probability_density',
    width=1000,
    height=600
)
# 21 fixed ticks from 0 to 1, i.e. one tick every 0.05 on the x axis
bokeh_figure.xaxis[0].ticker=FixedTicker(ticks=list(np.linspace(0, 1, 21)))
# keep a handle on the line so update() can replace its y values later
bokeh_line = bokeh_figure.line(X_VALUES, pdf_y_values, color="navy", line_width=4)

# open a session to keep our local document in sync with server
# NOTE(review): Bokeh prints a warning for this push_session /
# loop_until_closed pattern (see the cell output below), recommending it for
# testing only.
session = push_session(curdoc())

def update():
    """Re-read all tweets, recompute the pdf and push the new y values to the plotted line."""
    tweets = fetch_tweets()
    alpha, beta = compute_alpha_and_beta(tweets=tweets)
    pdf_y_values = compute_pdf_y_values(alpha, beta)
    bokeh_line.data_source.data.update(y=pdf_y_values)

# run update() periodically; 100 is the period (presumably milliseconds --
# confirm against the Bokeh docs). Each run re-reads the whole tweets table.
curdoc().add_periodic_callback(update, 100)

session.show(bokeh_figure)

# presumably blocks until the session is closed -- confirm against the Bokeh docs
session.loop_until_closed()


    !!!! PLEASE NOTE !!!!

The use of `session.loop_until_closed` and `push_session` to run Bokeh
application code outside a Bokeh server is **HIGHLY DISCOURAGED** for any real
use.

Running application code outside a Bokeh server with bokeh.client in this way
has (and always will have) several intrinsic drawbacks:

* Fast binary array transport is NOT available! Base64 fallback is much slower
* All network traffic is DOUBLED due to extra hop between client and server
* Server *and* client process must be running at ALL TIMES for callbacks to work
* App code run outside the Bokeh server is NOT SCALABLE behind a load balancer

The bokeh.client API is recommended to use ONLY for testing, or for customizing
individual sessions running in a full Bokeh server, before passing on to viewers.

For information about different ways of running apps in a Bokeh server, see:

    http://bokeh.pydata.org/en/latest/docs/user_guide/server.html

