### DSPT6 - Adding Data Science to a Web Application

The purpose of this notebook is to demonstrate:
- Simple online analysis of data from a user of the Twitoff app or an API
- Training a more complicated model offline and serializing the result for online use

In [1]:
import os
import pickle
import sqlite3

import pandas as pd

In [2]:
# Location of the Twitoff SQLite database. Defaults to the original local
# path; set the TWITOFF_DB environment variable to point somewhere else so
# the notebook can run on another machine without editing this cell.
DB_PATH = os.environ.get('TWITOFF_DB',
                         'C:\\Users\\bruno\\Desktop\\twitoff.sqlite3')

# Connect to sqlite database
# NOTE: sqlite3.connect() creates a new, empty database if the file does not
# exist, so a wrong path only shows up later as a "no such table" error.
conn = sqlite3.connect(DB_PATH)

In [4]:
def get_data(query, conn, params=()):
    '''Run a SQL query on a SQLite connection and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    conn : sqlite3.Connection
        Open connection to run the statement on.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Defaults to no
        parameters, which matches the original two-argument call.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the result-set
        columns. Statements that produce no result set yield an empty frame.
    '''
    cursor = conn.cursor()
    try:
        rows = cursor.execute(query, params).fetchall()

        # cursor.description is None when the statement returns no result
        # set (e.g. DDL), so fall back to "no columns" instead of failing.
        columns = [col[0] for col in (cursor.description or ())]
    finally:
        # Release the cursor even if the query raises.
        cursor.close()

    return pd.DataFrame(data=rows, columns=columns)

In [9]:
import pickle

# Fetch every tweet together with the username of the user it is joined to.
sql = '''
SELECT 
	tweet.id,
	tweet.tweet, 
	tweet.embedding,
	user.username
FROM tweet
JOIN user on tweet.user_id = user.id;
'''

df = get_data(sql, conn)

# The embedding column is stored as pickled bytes; unpickle each value.
# SECURITY: pickle.loads can execute arbitrary code while unpickling, so only
# run this against a database whose contents you trust.
df['embedding_decoded'] = df.embedding.apply(pickle.loads)
print(df.shape)
df.head(3)

(12291, 5)


Unnamed: 0,id,tweet,embedding,username,embedding_decoded
0,23048456524,"Worth a read, I liked this Economist article a...",b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,billgates,"[-0.07537176, 0.27606362, -0.0547825, 0.014415..."
1,23048600560,"Another interesting article, this one on a pro...",b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,billgates,"[-0.14114963, 0.17626844, -0.18227336, 0.10240..."
2,24514937071,The Guardian has partnered with the foundation...,b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,billgates,"[-0.002723366, 0.10334747, -0.100752264, 0.025..."


In [10]:
# How many tweets are stored for each username.
df['username'].value_counts()

billgates      2889
barackobama    2766
jimmyfallon    2353
KingJames      2193
nasa           1693
elonmusk        397
Name: username, dtype: int64

In [13]:
import numpy as np

# The two users to tell apart, and the number of components per embedding.
USER1 = 'barackobama'
USER2 = 'jimmyfallon'
EMBEDDING_DIM = 300

user1_embeddings = df.embedding_decoded[df.username == USER1]
user2_embeddings = df.embedding_decoded[df.username == USER2]
embeddings = pd.concat([user1_embeddings, user2_embeddings])

# One row per tweet, one column per embedding component.
embeddings_df = pd.DataFrame(embeddings.to_list(),
                             columns=[f'dom{i}' for i in range(EMBEDDING_DIM)])

# Labels follow the concat order above: 1 for USER1 tweets, 0 for USER2 tweets.
labels = np.concatenate([np.ones(len(user1_embeddings)),
                         np.zeros(len(user2_embeddings))])

# Features and labels must stay row-aligned for the model to train correctly.
assert len(embeddings_df) == len(labels)
print(embeddings_df.shape, labels.shape)

(5119, 300) (5119,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the tweets for evaluation. The fixed random_state
# makes the split reproducible on re-run, and stratify keeps the ratio of the
# two users the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings_df, labels, test_size=0.25, random_state=42, stratify=labels)