# Data Exploration Exercises

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mason_functions as mf
import nltk
import unicodedata
import re
import acquire
import prepare

from nltk import sentiment
from wordcloud import WordCloud

Do your work for this exercise in a file named explore.

## Exercise I
Spam Data

    a. Load the spam data set.
    b. Create and explore bigrams for the spam data. Visualize them with a word cloud. How do they compare with the ham bigrams?
    c. Is there any overlap in the bigrams for the spam data and the ham data?
    d. Create and explore trigrams (i.e. an n-gram with an n of 3) for both the spam and ham data.

### a. 
Load the spam data set.

In [2]:
# read the cleaned spam/ham messages from the csv in the working directory
df = pd.read_csv(filepath_or_buffer='spam_clean.csv')

# peek at the first five rows
df.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# summarize the frame: number of rows, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# class balance of the label column: raw counts first, then proportions
(
    df['label'].value_counts(),
    df['label'].value_counts(normalize=True),
)

(ham     4825
 spam     747
 Name: label, dtype: int64,
 ham     0.865937
 spam    0.134063
 Name: label, dtype: float64)

In [5]:
# build one space-joined string of message text per label (plus one for the
# whole corpus) and pass each through prepare.basic_clean
ham_words = prepare.basic_clean(
    ' '.join(df.loc[df['label'] == 'ham', 'text'])
)
spam_words = prepare.basic_clean(
    ' '.join(df.loc[df['label'] == 'spam', 'text'])
)
all_words = prepare.basic_clean(
    ' '.join(df['text'])
)

In [6]:
# count how often each whitespace-separated token occurs in each corpus;
# the generator expression keeps the three computations in lockstep and
# does not leak its loop variable into the notebook namespace
ham_freq, spam_freq, all_freq = (
    pd.Series(corpus.split()).value_counts()
    for corpus in (ham_words, spam_words, all_words)
)

In [7]:
# display the per-word counts for ham messages (value_counts lists the
# most frequent words first)
ham_freq

i           2188
you         1837
to          1554
the         1118
a           1055
            ... 
overdid        1
bcaz           1
stunning       1
boutxx         1
tas            1
Length: 7560, dtype: int64

In [8]:
# display the per-word counts for spam messages (value_counts lists the
# most frequent words first)
spam_freq

to          686
a           376
call        347
you         287
your        263
           ... 
request       1
hmv1          1
woods         1
3xa150pw      1
pole          1
Length: 3033, dtype: int64

In [9]:
# display the per-word counts across ham and spam combined (value_counts
# lists the most frequent words first)
all_freq

to            2240
i             2233
you           2124
a             1431
the           1322
              ... 
lennon           1
blackim          1
dearme           1
someones         1
hesitation       1
Length: 9562, dtype: int64

In [10]:
# same token counts as before, but expressed as a proportion of all tokens
# in the respective corpus (normalize=True), so ham and spam are comparable
# despite their different sizes
ham_nfreq, spam_nfreq, all_nfreq = (
    pd.Series(corpus.split()).value_counts(normalize=True)
    for corpus in (ham_words, spam_words, all_words)
)

In [11]:
# display each word's share of all ham tokens
ham_nfreq

i           0.032405
you         0.027206
to          0.023015
the         0.016558
a           0.015625
              ...   
overdid     0.000015
bcaz        0.000015
stunning    0.000015
boutxx      0.000015
tas         0.000015
Length: 7560, dtype: float64

In [12]:
# display each word's share of all spam tokens
spam_nfreq

to          0.038991
a           0.021371
call        0.019723
you         0.016312
your        0.014948
              ...   
request     0.000057
hmv1        0.000057
woods       0.000057
3xa150pw    0.000057
pole        0.000057
Length: 3033, dtype: float64

In [13]:
# display each word's share of all tokens in the combined corpus
all_nfreq

to            0.026317
i             0.026235
you           0.024954
a             0.016813
the           0.015532
                ...   
lennon        0.000012
blackim       0.000012
dearme        0.000012
someones      0.000012
hesitation    0.000012
Length: 9562, dtype: float64

In [14]:
# line the three count series up side by side, one row per word; `keys`
# supplies the column labels directly. A word missing from one corpus comes
# out of the outer join as NaN, so fill with 0 and restore integer dtype.
word_counts = (
    pd.concat(
        [ham_freq, spam_freq, all_freq],
        axis=1,
        keys=['ham', 'spam', 'all'],
    )
    .fillna(0)
    .astype(int)
)
word_counts.head()

Unnamed: 0,ham,spam,all
i,2188,45,2233
you,1837,287,2124
to,1554,686,2240
the,1118,204,1322
a,1055,376,1431


In [15]:
# the twenty words with the highest combined (ham + spam) count
(
    word_counts
    .sort_values(by='all', ascending=False)
    .head(20)
)

Unnamed: 0,ham,spam,all
to,1554,686,2240
i,2188,45,2233
you,1837,287,2124
a,1055,376,1431
the,1118,204,1322
u,972,147,1119
and,848,122,970
is,728,158,886
in,811,73,884
me,756,30,786


In [16]:
# order words by lowest ham count first, breaking ties by highest spam count,
# so the words that are rarest in ham but most common in spam rise to the top
(
    word_counts
    .sort_values(by=['ham', 'spam'], ascending=[True, False])
    .head(50)
)

Unnamed: 0,ham,spam,all
claim,0,113,113
prize,0,92,92
won,0,73,73
guaranteed,0,50,50
tone,0,48,48
18,0,43,43
awarded,0,38,38
a1000,0,35,35
150ppm,0,34,34
a2000,0,31,31


### b. 
Create and explore bigrams for the spam data. Visualize them with a word cloud. How do they compare with the ham bigrams?

In [17]:
# pair every spam token with its successor, tally the pairs, and keep the
# twenty most frequent; nltk.bigrams yields lazily, so materialize it first
spam_top_20_bigrams = (
    pd.Series(list(nltk.bigrams(spam_words.split())))
    .value_counts()
    .head(20)
)
spam_top_20_bigrams

(you, have)           73
(have, won)           54
(your, mobile)        49
(to, claim)           46
(please, call)        44
(this, is)            40
(won, a)              40
(to, contact)         37
(you, are)            35
(stop, to)            28
(u, have)             27
(cash, or)            27
(will, be)            25
(a, a2000)            25
(or, a)               25
(po, box)             24
(contact, u)          24
(to, receive)         23
(a1000, cash)         23
(guaranteed, call)    23
dtype: int64

## Exercise II
Explore the blog articles using the techniques discussed in the exploration lesson.

## Exercise III
Explore the news articles using the techniques discussed in the exploration lesson. Use the category variable when exploring.