# Curriculum 

In [1]:
import re
import unicodedata
import pandas as pd
import nltk

# Extra tokens that clean() drops on top of NLTK's standard English stopword list.
ADDITIONAL_STOPWORDS = [
    "r",
    "u",
    "2",
    "ltgt",
]

def clean(text: str) -> list:
    """Normalize raw text and return its lemmatized, stopword-free tokens.

    Steps, in order:
      1. NFKD-normalize, drop every non-ASCII character, and lowercase.
      2. Remove every character that is neither a word character nor whitespace.
      3. Split on whitespace.
      4. Discard NLTK English stopwords and ``ADDITIONAL_STOPWORDS``.
      5. Lemmatize the surviving tokens with WordNet.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order (duplicates are kept).
    """
    wnl = nltk.stem.WordNetLemmatizer()

    # A set gives constant-time membership tests; a list would be re-scanned for every token.
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)

    text = (unicodedata.normalize("NFKD", text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())

    # \w keeps letters, digits and underscores; punctuation (apostrophes included) is deleted,
    # so a contraction such as "don't" becomes the single token "dont".
    words = re.sub(r'[^\w\s]', '', text).split()

    return [wnl.lemmatize(word) for word in words if word not in stopwords]

# Read the labelled messages from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="spam_clean.csv")

df.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**^^ 'ham' = good text; 'spam' = garbage text**

### First, we take a look at the number of spam vs. the number of ham texts

In [2]:
# Absolute count and relative share of each label, side by side.
# NOTE: "percent" holds a proportion between 0 and 1, not a value out of 100.
labels = pd.concat(
    [
        df["label"].value_counts(),
        df["label"].value_counts(normalize=True),
    ],
    axis=1,
    keys=["n", "percent"],
)

labels

Unnamed: 0,n,percent
ham,4825,0.865937
spam,747,0.134063


**^^ About 87% of the texts are ham (legitimate messages); about 13% are spam**

### Now we'll break the data into three different pieces: 

1.) the words that appear in legitimate text messages;

2.) the words that appear in spam text messages; and 

3.) All of the words

In [None]:
# Glue each subset's messages into one long string, then run it through clean()
# to get a flat list of words for that subset.
ham_words = clean(" ".join(df.loc[df["label"] == "ham", "text"]))
spam_words = clean(" ".join(df.loc[df["label"] == "spam", "text"]))
all_words = clean(" ".join(df["text"]))

### This gives us a list of words.  Now we can transform them into a pandas series which we can use to show how often each word occurs:


In [None]:
# One frequency table per word list: index = word, value = number of occurrences,
# ordered from most to least frequent.
ham_freq, spam_freq, all_freq = (
    pd.Series(word_list).value_counts()
    for word_list in (ham_words, spam_words, all_words)
)

In [None]:
ham_freq.head(5)  # the five most frequent words in ham texts, highest count first

In [None]:
spam_freq.head(5)  # the five most frequent words in spam texts, highest count first

In [None]:
all_freq.head(5)  # the five most frequent words across all texts, highest count first

### Now combine all those frequencies into one resulting dataframe that we can work with:

In [None]:
# Line the three frequency tables up side by side, one row per word (rows sorted
# alphabetically by sort=True). Column names are supplied through `keys=` rather
# than `set_axis(..., inplace=False)`, because the `inplace` keyword was removed
# from set_axis in pandas 2.0 and now raises a TypeError.
# A word missing from one table comes out of the concat as NaN; that means
# "never seen there", so it is filled with 0 before casting back to integers.
word_counts = (pd.concat([all_freq, ham_freq, spam_freq], axis=1,
                         keys=["all", "ham", "spam"], sort=True)
               .fillna(0)
               .astype(int))

word_counts.head()
               
               

### What are the most frequently occurring words?

In [None]:
# The ten rows with the largest overall count, highest first
word_counts.sort_values("all", ascending=False).iloc[:10]

**^^ 'Call' is seen 600 times, 241 times in approved texts, but 359 times in spam.  And on down the line.**

### Are there any words that uniquely identify a spam or ham message?

In [None]:
# Six most frequent ham-only words (never seen in spam) stacked on top of the six
# most frequent spam-only words (never seen in ham). Within each group the rows run
# from least to most frequent.
pd.concat([
    word_counts.loc[word_counts["spam"] == 0].sort_values("ham").iloc[-6:],
    word_counts.loc[word_counts["ham"] == 0].sort_values("spam").iloc[-6:],
])

**^^ Looks like 'awarded', '18', 'guaranteed', 'tone', 'prize', and 'claim' are all unique identifiers of spam, whereas the rest of the words are unique identifiers of ham.**

### Visualization

- I REALLY GOTTA GET BETTER AT THIS.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

#### To find out the percentage of 'spam' versus 'ham:'

In [None]:
# For the 20 words with the highest overall count, plot the share of each word's
# occurrences that fall in spam vs. ham as one stacked horizontal bar per word,
# ordered by the ham share.
(word_counts
 .assign(p_spam=lambda counts: counts["spam"] / counts["all"],
         p_ham=lambda counts: counts["ham"] / counts["all"])
 .sort_values("all")
 .loc[:, ["p_spam", "p_ham"]]
 .iloc[-20:]
 .sort_values("p_ham")
 .plot.barh(stacked=True))
plt.title("Proportion of Spam vs. Ham for the 20 most common words")

**^^ Nearly 80% of the occurrences of 'free' are in spam, whereas NEXT TO NO SPAM contains 'ill' (most likely "I'll" with its apostrophe stripped during cleaning, rather than the word for "sick").**

In [None]:
(word_counts
 .loc[(word_counts["spam"] > 10) & (word_counts["ham"] > 10)]
 .assign(ratio=lambda counts: counts["spam"] / (counts["ham"] + .01))
 .sort_values("ratio")
 .pipe(lambda ranked: pd.concat([ranked.head(), ranked.tail()])))

# In words: keep only the words that occur more than 10 times in spam AND more than 10 times in ham, then
# add a 'ratio' column equal to spam / (ham + .01). The .01 is a small constant added to the denominator,
# not a percentage. Finally, sort by that ratio and use .pipe to hand the sorted frame to a function that
# stacks its first five rows (lowest ratios) on top of its last five rows (highest ratios).

### Word Clouds

pip installed the following on my terminal: **python -m pip install --upgrade wordcloud**

- Wordclouds allow you to id the relative frequency of different keywords using an easily digestible visual

- The larger the word appears, the more frequent its appearance in the data

In [None]:
from wordcloud import WordCloud

sentence = "Mary had a little lamb, little lamb, little lamb.  Its fleece was white as snow."

# generate() builds the cloud image straight from the raw text
img = WordCloud(background_color="white").generate(sentence)

# Show the image; the axes are switched off because they carry no meaning for a word cloud
plt.imshow(img)
plt.axis("off")

### So using a wordcloud for our 'Spam V. Ham' debate:

In [None]:
# One cloud per word list. The "all" cloud is tall and narrow because it fills the
# whole left half of the figure; the other two share the right half.
all_cloud = WordCloud(background_color="white", height=1000, width=400).generate(" ".join(all_words))
ham_cloud = WordCloud(background_color="white", height=600, width=800).generate(" ".join(ham_words))
spam_cloud = WordCloud(background_color="white", height=600, width=800).generate(" ".join(spam_words))

plt.figure(figsize=(10, 8))

# Each rectangle is [left, bottom, width, height] as a fraction of the figure
axs = [
    plt.axes([0, 0, .5, 1]),      # left half
    plt.axes([.5, .5, .5, .5]),   # top-right quarter
    plt.axes([.5, 0, .5, .5]),    # bottom-right quarter
]

panels = zip(axs,
             (all_cloud, ham_cloud, spam_cloud),
             ("All Words", "Ham Words", "Spam Words"))
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.set_title(title)
    ax.axis("off")

### Bigrams:

- a way to combine two words together to measure the frequency an actual phrase appears

- uses nltk

In [None]:
sentence = "Mary had a little lamb"

# nltk.ngrams produces each run of 2 consecutive tokens; list() materializes them for display
tokens = sentence.split()
bigrams = nltk.ngrams(tokens, 2)
list(bigrams)

**Now find out which bigrams are the most frequently occurring:**

In [None]:
# Count every pair of adjacent ham words and keep the 20 most frequent pairs
top_20_ham_bigrams = (
    pd.Series(nltk.ngrams(ham_words, 2))
    .value_counts()
    .iloc[:20]
)

top_20_ham_bigrams

In [None]:
# Sort once and reuse the result for both the bars and their tick labels. Taking the
# labels from the unsorted (descending) series while the bars are drawn in ascending
# order would attach each label to the wrong bar.
ham_bigrams_sorted = top_20_ham_bigrams.sort_values()

ham_bigrams_sorted.plot.barh(color="pink", width=.9, figsize=(10, 6))

plt.title('20 Most Frequently Occuring Ham Bigrams')

plt.ylabel("Bigram")

plt.xlabel("# Occurences")

# make the labels better looking: show "word1 word2" instead of the tuple repr

ticks, _ = plt.yticks()
labels = pd.Series([' '.join(bigram) for bigram in ham_bigrams_sorted.index])
_ = plt.yticks(ticks, labels)

**Bigrams can be used to make a WordCloud as well:**

- we supply our own values to be used to determine how big the words (or phrases) should be by using the 'generate_from_frequencies' method

- the values we supply must be in the form of a dictionary where the keys are the words/phrases and the values are their corresponding numbers



1.) Convert series to dictionary, and the tuples that make up the index into a single string that holds each phrase

In [None]:
# generate_from_frequencies wants {phrase: weight}, so each (word1, word2) tuple key
# is flattened into the single string "word1 word2"
data = {
    " ".join(pair): count
    for pair, count in top_20_ham_bigrams.to_dict().items()
}

img = WordCloud(background_color="white", width=800, height=400).generate_from_frequencies(data)

plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis("off")


# Exercises:

### 1.) Spam Data:

        a.) Load the spam dataset
    
        b.) Create and explore bigrams for the spam data.  Visualize them with a word cloud.  How do they compare against the ham bigrams?
        
        c.) Is there any overlap in the bigrams for the spam data and the ham data?
        
        d.) Create and explore with trigrams (three-word phrases, or, n-grams with an n of 3) for both the spam and the ham data

In [None]:
# Exercise 1a: load the spam dataset (the same file read at the top of the notebook)

df = pd.read_csv(filepath_or_buffer="spam_clean.csv")

df.head(5)

In [None]:
# Exercise 1b: count every pair of adjacent spam words and keep the 20 most frequent pairs

top_20_spam_bigrams = (
    pd.Series(nltk.ngrams(spam_words, 2))
    .value_counts()
    .iloc[:20]
)

top_20_spam_bigrams

In [None]:
# Visualize the Spam Bigrams, then the Ham Bigrams.  Any overlap?
#
# Both clouds are built by the same steps, so they are drawn in one loop instead of
# two copy-pasted stanzas. After the loop, `data` and `img` hold the ham values.

for title, bigram_counts in [("Spam_Bigram", top_20_spam_bigrams),
                             ("Ham_Bigram", top_20_ham_bigrams)]:
    # generate_from_frequencies wants {phrase: weight}: flatten each tuple key to "word1 word2"
    data = {' '.join(bigram): count for bigram, count in bigram_counts.to_dict().items()}

    img = WordCloud(background_color="white", width=800, height=400).generate_from_frequencies(data)

    plt.figure(figsize=(8, 4))
    plt.imshow(img)
    plt.title(title)
    plt.axis("off")



In [None]:
# Exercise 1d (spam): count every run of three adjacent spam words and keep the 20 most frequent

top_20_spam_trigrams = (
    pd.Series(nltk.ngrams(spam_words, 3))
    .value_counts()
    .iloc[:20]
)

top_20_spam_trigrams

In [None]:
# Exercise 1d (ham): count every run of three adjacent ham words and keep the 20 most frequent

top_20_ham_trigrams = (
    pd.Series(nltk.ngrams(ham_words, 3))
    .value_counts()
    .iloc[:20]
)

top_20_ham_trigrams

### 2.) Explore the blog articles using the techniques discussed in the exploration lesson.

In [None]:
# Import the module by its name only: ".py" is the file extension, and writing
# `import acquire.py` asks Python for a submodule called `py` inside a package `acquire`.
# Presumably acquire.py sits next to this notebook; confirm before running.
import acquire