In [1]:
import os
import re
import numpy as np
import pandas as pd
import gensim
import collections
from nltk.corpus import stopwords
import nltk

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists, drop_database
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

  "is going to be overriden.".format(identifier))


In [2]:
# using function from "Keeping API Keys Secret.ipynb"
# by https://github.com/dylburger
def get_file_contents(filename):
    """
    Return the text stored in `filename`, stripped of surrounding whitespace.

    The file is expected to hold a single line containing a key. When the
    file does not exist, a message is printed and None is returned instead
    of raising.
    """
    try:
        with open(filename, 'r') as key_file:
            raw_text = key_file.read()
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None
    return raw_text.strip()

In [3]:
# Connection settings for the PostgreSQL database used throughout this notebook.
# Name of the database to connect to.
dbname = 'insight_db'
username = 'postgres' # change this to your postgres username
# The password is read from a key file rather than written into the notebook.
# get_file_contents prints a message and returns None when that file is missing,
# so a missing key file leaves `password` set to None here.
password = get_file_contents('../keys/psql_key')

In [4]:
# 'engine' is a connection to a database
engine = create_engine('postgres://%s:%s@localhost/%s'%(username,password,dbname))
# load sql_magic so we can write SQL in Jupyter Notebooks
%load_ext sql_magic

# setup SQL connection to the postgreSQL engine we created
%config SQL.conn_name = 'engine'
print(engine.url)

postgres://postgres:postgres@localhost/insight_db


In [5]:
# open a psycopg2 connection for issuing queries directly
connection_settings = {
    'database': dbname,
    'host': 'localhost',
    'user': username,
    'password': password,
}
con = psycopg2.connect(**connection_settings)
# switch the connection to psycopg2's autocommit isolation level
con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

In [6]:
# create a new cursor object on the psycopg2 connection
# NOTE(review): `cur` is not referenced by any of the cells visible here — confirm it is still needed
cur = con.cursor()

In [7]:
# Load every column of the `videos` table into a DataFrame through the psycopg2 connection.
# NOTE(review): recent pandas releases warn when `con` is a raw DBAPI connection other than
# sqlite3; the SQLAlchemy `engine` may be the better argument — confirm against the pandas version in use.
data_df = pd.read_sql('SELECT * from videos', con=con)

In [32]:
# Keep only the columns needed for the text analysis. `.assign` builds a new
# frame, so the title clean-up never writes into a slice of data_df (the
# pattern behind pandas' SettingWithCopyWarning) and the cell can be re-run
# with the same result.
caption_columns = ['video_id', 'title', 'subtitles', 'description', 'primary_category']
video_captions = data_df[caption_columns].assign(
    # turn underscores in the titles into spaces; regex=False keeps the
    # replacement literal regardless of the pandas default
    title=lambda frame: frame['title'].str.replace('_', ' ', regex=False)
)
# list the distinct categories present in the data
video_captions['primary_category'].unique()

array(['repair_wood_fence', 'carpet_flooring', 'drywall_repair',
       'build_deck', 'laminate_flooring'], dtype=object)

In [25]:
%matplotlib inline

import scattertext as st
import io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy
import pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Widen the notebook container. The markup needs literal '<' characters: with
# the HTML entity '&lt;' the browser shows the tag as text and no style is applied.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [26]:
# Load the spaCy English pipeline; it is passed as `nlp` to the scattertext corpus builders below.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x installs where that link exists;
# spaCy 3 expects a full package name such as 'en_core_web_sm' — confirm the target spaCy version.
nlp = spacy.load('en')

In [27]:
# build a scattertext corpus from the video titles, categorised by primary_category
titles_builder = st.CorpusFromPandas(
    video_captions,
    category_col='primary_category',
    text_col='title',
    nlp=nlp,
)
titles_coprus = titles_builder.build()

In [28]:
# Show the first ten index entries (terms) of the table returned by
# get_scaled_f_scores_vs_background() for the titles corpus.
print(list(titles_coprus.get_scaled_f_scores_vs_background().index[:10]))

['mryoucandoityourself', 'bunnings', 'lifeproof', 'drywall', 'mender', 'skirtings', 'restain', 'thesarahsalvini', 'buildipedia', 'bpmi']


In [36]:
# Term-frequency table for the titles corpus; the per-category scaled F-score
# columns are added to this frame in the following cell.
term_freq_df = titles_coprus.get_term_freq_df()

In [37]:
# add one scaled F-score column per category, named by its display label
title_score_columns = [
    ('Drywall Repair', 'drywall_repair'),
    ('Wood Fencing', 'repair_wood_fence'),
    ('Carpet Flooring', 'carpet_flooring'),
    ('Deck', 'build_deck'),
    ('Laminate Flooring', 'laminate_flooring'),
]
for column_label, category_key in title_score_columns:
    term_freq_df[column_label] = titles_coprus.get_scaled_f_scores(category_key)

In [42]:
# print the ten highest-scoring title terms for each category column
title_report_order = ['Drywall Repair', 'Wood Fencing', 'Carpet Flooring', 'Laminate Flooring', 'Deck']
for position, column_label in enumerate(title_report_order):
    # every heading after the first is preceded by a blank line
    heading = column_label if position == 0 else "\n" + column_label
    print(heading)
    ranked_terms = term_freq_df.sort_values(by=column_label, ascending=False)
    print(list(ranked_terms.index[:10]))

Drywall Repair
['drywall', 'drywall repair', 'texture', 'coat', 'skim', 'to patch', 'a hole', 'patch', 'hole in', 'tape']

Wood Fencing
['a fence', 'fence post', 'fence', 'wood fence', 'gate', 'post', 'posts', 'chain', 'privacy fence', 'fencing']

Carpet Flooring
['install carpet', 'carpet installation', 'carpet', 'carpet tiles', 'a carpet', 'carpet to', 'boat', 'transition', 'tiles', 'rv']

Laminate Flooring
['laminate floor', 'laminate', 'install laminate', 'installing laminate', 'laminate flooring', 'vinyl plank', 'plank', 'flooring installation', 'flooring', 'plank flooring']

Deck
['deck', 'patio', 'a deck', 'modern', 'outdoor', 'building', 'railing', 'porch', 'build', 'to build']


In [43]:
# render the titles corpus as a scattertext explorer page: drywall repair versus every other category
titles_explorer_options = {
    'category': 'drywall_repair',
    'category_name': 'Drywall Repair',
    'not_category_name': 'Others',
    'width_in_pixels': 1000,
}
html = st.produce_scattertext_explorer(titles_coprus, **titles_explorer_options)

In [44]:
# Write the explorer page to disk as UTF-8 bytes. The `with` block closes the
# file handle as soon as the write finishes instead of leaving it open.
with open("Drywall-Repair-Titles-Visualization.html", 'wb') as titles_html_file:
    titles_bytes_written = titles_html_file.write(html.encode('utf-8'))
# keep the byte count as the cell's displayed result
titles_bytes_written

489519

In [None]:
## Repeat the same analysis on the video subtitles

In [46]:
# build a scattertext corpus from the video subtitles, categorised by primary_category
subtitles_builder = st.CorpusFromPandas(
    video_captions,
    category_col='primary_category',
    text_col='subtitles',
    nlp=nlp,
)
subtitles_corpus = subtitles_builder.build()

In [47]:
# Show the first ten index entries (terms) of the table returned by
# get_scaled_f_scores_vs_background() for the subtitles corpus.
print(list(subtitles_corpus.get_scaled_f_scores_vs_background().index[:10]))

['gon', 'drywall', 'joist', 'screws', 'heres', 'nt', 'plank', 'laminate', 'fence', 'planks']


In [48]:
# Term-frequency table for the subtitles corpus; the per-category scaled
# F-score columns are added to this frame in the following cell.
subtitle_term_freq_df = subtitles_corpus.get_term_freq_df()

In [49]:
# add one scaled F-score column per category, named by its display label
subtitle_score_columns = [
    ('Drywall Repair', 'drywall_repair'),
    ('Wood Fencing', 'repair_wood_fence'),
    ('Carpet Flooring', 'carpet_flooring'),
    ('Deck', 'build_deck'),
    ('Laminate Flooring', 'laminate_flooring'),
]
for column_label, category_key in subtitle_score_columns:
    subtitle_term_freq_df[column_label] = subtitles_corpus.get_scaled_f_scores(category_key)

In [50]:
# print the ten highest-scoring subtitle terms for each category column
subtitle_report_order = ['Drywall Repair', 'Wood Fencing', 'Carpet Flooring', 'Laminate Flooring', 'Deck']
for position, column_label in enumerate(subtitle_report_order):
    # every heading after the first is preceded by a blank line
    heading = column_label if position == 0 else "\n" + column_label
    print(heading)
    ranked_terms = subtitle_term_freq_df.sort_values(by=column_label, ascending=False)
    print(list(ranked_terms.index[:10]))

Drywall Repair
['joint compound', 'plaster', 'mud', 'drywall', 'the drywall', 'of drywall', 'compound', 'texture', 'patch', 'skim']

Wood Fencing
['fence post', 'fence', 'the fence', 'the gate', 'a fence', 'gate', 'your fence', 'pickets', 'post and', 'post']

Carpet Flooring
['carpet', 'the carpet', 'carpet and', 'your carpet', 'tack', 'stretch', 'tiles', 'boat', 'staples', 'staple']

Laminate Flooring
['laminate', 'laminate flooring', 'expansion', 'row', 'plank', 'tongue', 'the laminate', 'first row', 'planks', 'underlayment']

Deck
['the deck', 'deck', 'decking', 'your deck', 'a deck', 'patio', 'joist', 'joists', 'beam', 'ledger']


In [51]:
# render the subtitles corpus as a scattertext explorer page: drywall repair versus every other category
subtitles_explorer_options = {
    'category': 'drywall_repair',
    'category_name': 'Drywall Repair',
    'not_category_name': 'Others',
    'width_in_pixels': 1000,
}
subtitles_html = st.produce_scattertext_explorer(subtitles_corpus, **subtitles_explorer_options)

In [53]:
# Write the explorer page to disk as UTF-8 bytes. The `with` block closes the
# file handle as soon as the write finishes instead of leaving it open.
with open("Drywall-Repair-SubTitles-Visualization.html", 'wb') as subtitles_html_file:
    subtitles_bytes_written = subtitles_html_file.write(subtitles_html.encode('utf-8'))
# keep the byte count as the cell's displayed result
subtitles_bytes_written

11044879