# Visualization

This notebook will contain my visuals.

In [None]:
#imports
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import random
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from scipy.misc import imread
import imageio

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [None]:
# Read the data in: one CSV per sub, each cleaned and bound to its own
# module-level dataframe (mc_df, fb_df, mls_df, ssfc_df, tfr_df, tfys_df, bar_df).

sub_list = ['mc','fb','mls','ssfc','tfr','tfys', 'bar']

# Short sub name -> CSV file holding its data
csv_paths = {
    'mc': '../datasets/motorcycles.csv',
    'fb': '../datasets/soccer.csv',
    'mls': '../datasets/mls.csv',
    'ssfc': '../datasets/sounders.csv',
    'tfr': '../datasets/tfr.csv',
    'tfys': '../datasets/tfys.csv',
    'bar': '../datasets/bar.csv',
}

frames = {}
for sub in sub_list:
    csvfilename = csv_paths[sub]

    # Read the CSV in, drop the Unnamed column, drop the first row (which is
    # blank) and reset the index
    df = pd.read_csv(csvfilename)
    df.drop(columns=['Unnamed: 0'], inplace=True)
    df.drop(index=0, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Lowercase everything but the author
    df['title'] = df['title'].str.lower()

    # If the text (the actual post) is NaN, replace it with a space; spaces
    # won't affect the process. Assign the result back instead of calling
    # fillna(inplace=True) on the selected column: the in-place form on a
    # column selection is not guaranteed to update df in current pandas.
    df['text'] = df['text'].str.lower().fillna(' ')

    frames[sub] = df

# Put the data into one dataframe per sub, to facilitate analysis
mc_df = frames['mc']
fb_df = frames['fb']
mls_df = frames['mls']
ssfc_df = frames['ssfc']
tfr_df = frames['tfr']
tfys_df = frames['tfys']
bar_df = frames['bar']

In [None]:
# This function will take in a dirty pd series and return a clean corpus with no numbers or punctuation

def string_clean(series):
    """Return a cleaned corpus built from a series of strings.

    Each string is reduced to its runs of letters/underscores that are
    delimited by word boundaries; the surviving tokens are re-joined with
    single spaces. Digits and punctuation are dropped, and a token that has
    a digit glued to it (e.g. ``'a1b'``) is dropped entirely because there is
    no word boundary between the digit and the letters.

    Parameters
    ----------
    series : pd.Series or any iterable of str
        The raw documents. Values are iterated directly, so this does not
        depend on ``Series.iteritems()``, which newer pandas no longer has.

    Returns
    -------
    list of str
        One cleaned string per input value, in the same order.
    """
    return [' '.join(re.findall(r'\b[^\d\W]+\b', val)) for val in series]

In [None]:
# Get the vocabularies of the two corpora that are sent in. Returns three lists:
# the vocabulary of each corpus, and the words that exist in both vocabularies.

def vocab_lists(corp1, corp2):
    """Build the vocabularies of two corpora and the words they share.

    Each corpus is cleaned with ``string_clean`` and fitted with a
    ``CountVectorizer`` (English stop words removed, at most 8000 features).

    Parameters
    ----------
    corp1, corp2 : pd.Series (or whatever ``string_clean`` accepts)
        The raw documents of each corpus.

    Returns
    -------
    vocab1 : list of str
        Vocabulary fitted on ``corp1``.
    vocab2 : list of str
        Vocabulary fitted on ``corp2``.
    join : list of str
        Words of ``vocab1`` (in ``vocab1`` order) that are also in ``vocab2``.
    """
    cvec = CountVectorizer(stop_words='english', max_features=8000)

    def _vocab(corpus):
        # Only the fitted vocabulary is needed, so fit() is enough; the
        # document-term matrix from fit_transform() was never used.
        cvec.fit(string_clean(corpus))
        # get_feature_names() is gone from newer scikit-learn; prefer its
        # replacement and fall back for older installs.
        get_names = getattr(cvec, 'get_feature_names_out', None)
        if get_names is None:
            get_names = cvec.get_feature_names
        # Normalise to a plain list of str whichever accessor was used
        return [str(word) for word in get_names()]

    vocab1 = _vocab(corp1)
    vocab2 = _vocab(corp2)

    # Set membership keeps the intersection linear instead of quadratic
    in_vocab2 = set(vocab2)
    join = [word for word in vocab1 if word in in_vocab2]

    return vocab1, vocab2, join

### Vocabulary size and joint vocab size

In [None]:
# Title vocabularies of the mc and fb data, and the words they share
mc, fb = mc_df['title'], fb_df['title']
mc_v, fb_v, join = vocab_lists(mc, fb)

print(
    f'mc len: {len(mc_v)}',
    f'fb len: {len(fb_v)}',
    f'joint len: {len(join)}',
    f'mc total len: {mc_df.shape[0]}',
    f'fb total len: {fb_df.shape[0]}',
    sep='\n',
)

In [None]:
# Title vocabularies of the mls and fb data, and the words they share
mls, fb = mls_df['title'], fb_df['title']
mls_v, fb_v, join = vocab_lists(mls, fb)

print(
    f'mls len: {len(mls_v)}',
    f'fb len: {len(fb_v)}',
    f'joint len: {len(join)}',
    f'fb total len: {fb_df.shape[0]}',
    f'mls total len: {mls_df.shape[0]}',
    sep='\n',
)

In [None]:
# Title vocabularies of the mls and ssfc data, and the words they share
mls, ssfc = mls_df['title'], ssfc_df['title']
mls_v, ssfc_v, join = vocab_lists(mls, ssfc)

print(
    f'mls len: {len(mls_v)}',
    f'ssfc len: {len(ssfc_v)}',
    f'joint len: {len(join)}',
    f'mls total len: {mls_df.shape[0]}',
    f'ssfc total len: {ssfc_df.shape[0]}',
    sep='\n',
)

In [None]:
# Title vocabularies of the tfr and tfys data, and the words they share
tfr, tfys = tfr_df['title'], tfys_df['title']
tfr_v, tfys_v, join = vocab_lists(tfr, tfys)

print(
    f'tfr len: {len(tfr_v)}',
    f'tfys len: {len(tfys_v)}',
    f'joint len: {len(join)}',
    f'tfr total len: {tfr_df.shape[0]}',
    f'tfys total len: {tfys_df.shape[0]}',
    sep='\n',
)

In [None]:
# Title vocabularies of the tfys and bar data, and the words they share
tfys = tfys_df['title']
bar = bar_df['title']

# Compare tfys (not tfr) against bar, and bind the result to tfys_v so the
# 'tfys len' line below reports this comparison rather than a stale value.
tfys_v, bar_v, join = vocab_lists(tfys, bar)
print(f'tfys len: {len(tfys_v)}')
print(f'bar len: {len(bar_v)}')
print(f'joint len: {len(join)}')
print(f'tfys total len: {tfys_df.shape[0]}')
print(f'bar total len: {bar_df.shape[0]}')

In [None]:
# Same comparison as on titles alone, but on title + post text
tfys = pd.Series(tfys_df['title'] + ' ' + tfys_df['text'])
bar = pd.Series(bar_df['title'] + ' ' + bar_df['text'])

# Use the combined tfys series built above (the call previously passed tfr
# and bound tfr_v, so the combined tfys text was never analysed).
tfys_v, bar_v, join = vocab_lists(tfys, bar)
print(f'tfys len: {len(tfys_v)}')
print(f'bar len: {len(bar_v)}')
print(f'joint len: {len(join)}')
print(f'tfys total len: {tfys_df.shape[0]}')
print(f'bar total len: {bar_df.shape[0]}')

### Word clouds of happiness

In [None]:
# Stopword set for later use: lowercased NLTK English stopwords plus
# punctuation tokens. Shamelessly stolen from Mr. Charles Rice.
punct = {
    '"', "'", '.', ',', '-', '--', '!', ';', '?', ':',
    '(', ')', '``', "''", '``',
}
stop = {token.lower() for token in stopwords.words('english')} | punct

In [None]:
# One space-separated string of the mc vocabulary, as input for the word
# clouds. A single join is enough; looping over the words only rebuilt the
# same string each time and left mc_str undefined for an empty vocabulary.
mc_str = ' '.join(mc_v)

In [None]:
# Also from Charlie. Thanks!
# Motorcycle Word Cloud
wc = WordCloud(width=800, height=400, stopwords=stop)

wc.generate_from_text(mc_str)

plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis('off')

# JPEG quality is passed through pil_kwargs: savefig's own `quality` keyword
# was deprecated in Matplotlib 3.3 and later removed.
plt.savefig('../assets/motorcycle_wc.jpg', pil_kwargs={'quality': 95}, dpi=250)
plt.figure()

In [None]:
# One space-separated string of the fb vocabulary, as input for the word
# clouds. A single join is enough; looping over the words only rebuilt the
# same string each time and left fb_str undefined for an empty vocabulary.
fb_str = ' '.join(fb_v)

In [None]:
# Also from Charlie. Thanks!
# Football (soccer for you Americans) Word Cloud
wc = WordCloud(width=800, height=400, stopwords=stop)

wc.generate_from_text(fb_str)

plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis('off')

# JPEG quality is passed through pil_kwargs: savefig's own `quality` keyword
# was deprecated in Matplotlib 3.3 and later removed.
plt.savefig('../assets/football_wc.jpg', pil_kwargs={'quality': 95}, dpi=250)
plt.figure()

### Word cloud with a mask

In [None]:
# Load the image that is passed as `mask` to the motorcycle word cloud
bike_mask = imageio.imread('../assets/motorcycle_mask.jpg')

In [None]:
# Motorcycle word cloud drawn inside the bike mask, using wordcloud's
# built-in STOPWORDS and with collocations (word pairs) turned off
wc = WordCloud(background_color='black', stopwords=STOPWORDS, mask=bike_mask, collocations=False)

wc.generate_from_text(mc_str)

plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis('off')
# JPEG quality is passed through pil_kwargs: savefig's own `quality` keyword
# was deprecated in Matplotlib 3.3 and later removed.
plt.savefig('../assets/motorcyclemask_wc.jpg', pil_kwargs={'quality': 95}, dpi=250)
plt.figure()

In [None]:
# Load the image that is passed as `mask` to the football word cloud
ball_mask = imageio.imread('../assets/football_mask.jpg')

In [None]:
# Football word cloud drawn inside the ball mask, using wordcloud's
# built-in STOPWORDS and with collocations (word pairs) turned off
wc = WordCloud(background_color='black', mode='RGBA', stopwords=STOPWORDS, mask=ball_mask, collocations=False)

wc.generate_from_text(fb_str)

plt.figure(figsize=(15,15))
plt.imshow(wc)
plt.axis('off')
# JPEG quality is passed through pil_kwargs: savefig's own `quality` keyword
# was deprecated in Matplotlib 3.3 and later removed.
plt.savefig('../assets/footballmask_wc.jpg', pil_kwargs={'quality': 95}, dpi=250)
plt.figure()