# Exploring Scattertext library capabilities

Here's the documentation for what I'm attempting to adapt: https://github.com/JasonKessler/scattertext

In [1]:
# !pip install scattertext

In [2]:
# !pip install spacy

In [3]:
# !python -m spacy download en_core_web_sm

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import scattertext as st
# from pprint import pprint
import spacy
import en_core_web_sm

In [11]:
# Names of the available tweet CSV exports, one per query topic.
# Every file follows the pattern tweets_<topic>_2018.csv.
_tweet_topics = (
    'lovehatewords',
    'happysad',
    'musicwords',
    'moneywords',
    'nowords',
    'politicwords',
    'codingwords',
)
tweet_csv_files = [f'tweets_{topic}_2018.csv' for topic in _tweet_topics]

In [12]:
# Pick which tweet export to analyse; the shorthand is reused for output file names later on.
query_shorthand = 'musicwords'
filename = f'tweets_{query_shorthand}_2018.csv'

# Load the CSV, parse the timestamp column (format YYYYMMDD) into datetimes,
# and keep only the rows whose sentiment label is 'positive' or 'negative'.
df = (
    pd.read_csv(filename)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='%Y%m%d'))
    .loc[lambda frame: frame['sentiment'].isin(['positive', 'negative'])]
)

In [13]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,timestamp,text,sentiment,polarity,subjectivity,tally
2,2018-01-01,Starting 2018 taking my time and doing it righ...,positive,0.142857,0.317857,1
3,2018-01-01,WHRW In Between - Top of the Hour Music for Y...,positive,0.5,0.5,1
4,2018-01-01,YASSS It's time for a great show Studiomix #C...,positive,0.3,0.525,1
5,2018-01-01,Free music from DDJ's Productions for the next...,positive,0.274091,0.650909,1
7,2018-01-01,Preciate The Love I just wanted to try somethi...,positive,0.25,0.6,1


In [17]:
# Turn the data frame into a Scattertext Corpus to begin analyzing it.
#
# NOTE: building the corpus from scratch takes about 1-2 hours, so a previously
# pickled corpus is reused when one exists on disk. Delete the cache file to force
# a rebuild (e.g. after changing the input data).
#
# This cell used to be disabled with a bare `break`, which is a SyntaxError outside
# a loop and stopped "Restart & Run All"; the cache check replaces that guard.
nlp = en_core_web_sm.load()

corpus_cache_path = f"scattertext_{query_shorthand}_corpus.obj"
if os.path.exists(corpus_cache_path):
    # pickle can execute arbitrary code while loading: only reuse a cache file
    # that you created yourself.
    with open(corpus_cache_path, 'rb') as cache_file:
        corpus = pickle.load(cache_file)
else:
    corpus = st.CorpusFromPandas(data_frame=df,
                                 category_col='sentiment',
                                 text_col='text',
                                 nlp=nlp).build()

In [18]:
# Top 10 terms that differentiate the corpus from a general English corpus,
# taken from the head of the scaled F-score table computed against the background.
background_scores = corpus.get_scaled_f_scores_vs_background()
background_scores.index[:10].tolist()

['liked',
 'nowplaying',
 'dance',
 'spotify',
 'hiphop',
 'youtube',
 'gon',
 'music',
 'bts',
 'tunes']

In [19]:
# Top 10 terms most associated with the 'positive' category.
# Each term's scaled F-score for 'positive' is attached to the term-frequency table,
# then the table is ranked by that score, highest first.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Positivity_Score'] = corpus.get_scaled_f_scores('positive')

most_positive = term_freq_df.sort_values('Positivity_Score', ascending=False)
most_positive.head(10).index.tolist()

['incredible',
 'the best',
 'your favorite',
 'best music',
 'great music',
 'beautiful',
 'best',
 'wonderful',
 'great show',
 'awesome']

In [20]:
# Top 10 terms most associated with the 'negative' category.
# Each term's scaled F-score for 'negative' is attached to a freshly fetched
# term-frequency table, then the table is ranked by that score, highest first.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Negativity_Score'] = corpus.get_scaled_f_scores('negative')

most_negative = term_freq_df.sort_values('Negativity_Score', ascending=False)
most_negative.head(10).index.tolist()

['terrible',
 'horrible',
 'the worst',
 'i hate',
 'to sad',
 'awful',
 'worst',
 'annoying',
 'sad music',
 'insane']

In [21]:
# Create an HTML page for the interactive visualization
html = st.produce_scattertext_explorer(
    corpus,
    category='positive',
    category_name='Positive',
    not_category_name='Negative',
    # metadata=df['speaker'],
    minimum_term_frequency=25,               ### good value to allow page to load in <2 minutes
    minimum_not_category_term_frequency=25,  ### good value to allow page to load in <2 minutes
    # max_terms=5000,      ### needs to be much higher or else it's a weird graph
    max_snippets=50,
    show_characteristic=True,
    width_in_pixels=1000,
)

# Write the page through a context manager so the file handle is flushed and closed
# deterministically (the previous bare open(...).write(...) never closed it).
html_path = f"Tweet_{query_shorthand}_Visualization.html"
with open(html_path, 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))

# Keep the byte count as the cell's displayed output, as before.
bytes_written

30114518

### Saving the object for use in the next session

In [22]:
# Saving the `corpus` object for later use, so it does not have to be built again.
#
# This cell used to be disabled with a bare `break`, which is a SyntaxError outside
# a loop. It now runs safely by default: the pickle is written only when no saved
# copy exists yet. Set OVERWRITE_SAVED_CORPUS = True to replace an existing file.
OVERWRITE_SAVED_CORPUS = False

saved_corpus_path = f"scattertext_{query_shorthand}_corpus.obj"
if OVERWRITE_SAVED_CORPUS or not os.path.exists(saved_corpus_path):
    # The context manager closes the file even if pickling fails part-way.
    with open(saved_corpus_path, "wb") as corpus_file:
        pickle.dump(corpus, corpus_file)

### Loading the object back in for later use

In [None]:
# When this notebook is opened again, this cell reloads the saved corpus object
# into `corpus2`.
#
# This cell used to be disabled with a bare `break`, which is a SyntaxError outside
# a loop. It now checks for the saved file instead of relying on manual editing.
saved_corpus_path = f'scattertext_{query_shorthand}_corpus.obj'
if os.path.exists(saved_corpus_path):
    # SECURITY: pickle.load can execute arbitrary code; only load files you created.
    with open(saved_corpus_path, 'rb') as corpus_file:
        corpus2 = pickle.load(corpus_file)
else:
    print(f"No saved corpus found at {saved_corpus_path!r}; build and save the corpus first.")