# Modeling Exercises

In [111]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from prepare import basic_clean, lemmatize, remove_stopwords, tokenize
import acquire
import seaborn as sns
from wordcloud import WordCloud
import prepare
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import math

## Spam Data

In [60]:
# Load the spam corpus from the local CSV; the preview below shows a
# `label` column (ham/spam) and a `text` column holding the raw message.
df = pd.read_csv('spam_clean.csv')
# Peek at the first rows to sanity-check the load.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [61]:
def clean(string, extra_words=None):
    """Normalize one message: basic_clean -> remove_stopwords -> lemmatize.

    Parameters
    ----------
    string : str
        Raw message text.
    extra_words : list of str, optional
        Additional tokens forwarded to ``remove_stopwords`` through its
        ``extra_words`` argument. When omitted, the notebook's original
        list is used: ``['r', 'u', '2', 'ltgt', "'", "''"]``.

    Returns
    -------
    Whatever ``lemmatize`` returns for the filtered text (the helpers come
    from the project's ``prepare`` module; presumably a cleaned string).
    """
    if extra_words is None:
        # Build the default per call so it is never shared between calls.
        extra_words = ['r', 'u', '2', 'ltgt', "'", "''"]
    normalized = basic_clean(string)
    filtered = remove_stopwords(normalized, extra_words=extra_words)
    return lemmatize(filtered)

In [62]:
# Replace every raw message with its cleaned form (see `clean`).
df['text'] = df['text'].apply(clean)

In [63]:
# Preview the frame again to confirm the `text` column now holds cleaned messages.
df.head()

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif oni
2,spam,free entry wkly comp win fa cup final tkts 21s...
3,ham,dun say early hor c already say
4,ham,nah think go usf life around though


In [None]:
# Term frequency per message: for each message, the share of its tokens
# taken up by each distinct word. One small frame per message is collected
# in `tfs`, tagged with the message's position in `df.text`.
tfs = []
for i, line in enumerate(df.text):
    message_tf = (
        pd.Series(line.split())
        .value_counts(normalize=True)
        # Name the columns explicitly instead of renaming 'index' / 0:
        # the default column names produced by reset_index() differ across
        # pandas versions, which made the old rename a silent no-op.
        .rename_axis('word')
        .reset_index(name='frequency')
    )
    # Stored as a string, exactly as before.
    message_tf['message_no'] = f'{i}'
    tfs.append(message_tf)

In [116]:
# Stack the per-message term-frequency frames collected in `tfs` into one
# long frame (one row per word per message).
words = pd.concat(tfs)    
# Show the first rows of the combined frame.
words.head()

Unnamed: 0,word,frequency,message_no
0,buffet,0.0625,0
1,wat,0.0625,0
2,go,0.0625,0
3,got,0.0625,0
4,la,0.0625,0


In [97]:
# Document frequency: for every word in `unique_words`, count how many
# messages contain it at least once.
# NOTE(review): `unique_words` is not defined anywhere in the visible
# notebook; it must already exist in the kernel -- confirm where it comes from.
#
# Each message is tokenised exactly once and reduced to its distinct tokens,
# so the counting is a single pass over the corpus instead of re-splitting
# every message for every vocabulary word.
docs_containing = {}
for line in df.text:
    for token in set(line.split()):
        docs_containing[token] = docs_containing.get(token, 0) + 1

# Keep the vocabulary in its original order; words that never occur get 0.
words = list(unique_words)
n_docs = [docs_containing.get(word, 0) for word in words]
    

In [98]:
# Pair each vocabulary word with the number of messages that contain it.
doc_freq = pd.DataFrame(dict(words=words, n_docs=n_docs))
doc_freq.head()

Unnamed: 0,words,n_docs
0,go,285
1,jurong,1
2,point,33
3,crazy,14
4,available,16


In [114]:
# Inverse document frequency: natural log of (total messages / messages
# containing the word).
doc_freq['total_docs'] = df.shape[0]
# Use the vectorised numpy log: a bare `log` is never imported in this
# notebook, so the previous `.apply(log)` raised NameError on a fresh kernel.
doc_freq['idf'] = np.log(doc_freq['total_docs'] / doc_freq['n_docs'])
doc_freq.head()

Unnamed: 0,words,n_docs,total_docs,idf
0,go,285,5572,2.97302
1,jurong,1,5572,8.625509
2,point,33,5572,5.129002
3,crazy,14,5572,5.986452
4,available,16,5572,5.852921


In [119]:
# Attach each word's document-frequency / idf columns to its per-message
# term-frequency rows.
# The long TF frame is rebuilt from `tfs` here because the name `words` is
# rebound to a plain list by the document-frequency cell, so `words.merge`
# fails when the notebook is run top to bottom.
word_stats = pd.concat(tfs).merge(doc_freq, left_on='word', right_on='words')
word_stats.head()

Unnamed: 0,word,frequency,message_no,words,n_docs,total_docs,idf
0,buffet,0.0625,0,buffet,2,5572,7.932362
1,buffet,0.2,389,buffet,2,5572,7.932362
2,wat,0.0625,0,wat,92,5572,4.103721
3,wat,0.142857,124,wat,92,5572,4.103721
4,wat,0.333333,368,wat,92,5572,4.103721


In [21]:
# Three tiny example documents used to walk through TF by hand.
documents = {
    'news': 'Codeup announced last thursday that they just launched a new data science program. It is 18 weeks long.',
    'description': 'Codeup\'s data science program teaches hands on skills using Python and pandas.',
    'context': 'Codeup\'s data science program was created in response to a percieved lack of data science talent, and growing demand.'
}

tfs = []
for doc, text in documents.items():
    # One frame per document with columns word / tf / doc.
    # - tf is the word's count divided by the document's TOTAL number of
    #   tokens (normalize=True), matching the spam section; dividing by the
    #   number of distinct words overstated tf whenever a word repeated.
    # - Columns are named via rename_axis/reset_index(name=...) rather than
    #   set_axis(..., inplace=False), whose `inplace` keyword no longer
    #   exists in current pandas.
    # - The frame is called `doc_tf` so it does not overwrite the spam `df`.
    doc_tf = (
        pd.Series(text.split())
        .value_counts(normalize=True)
        .rename_axis('word')
        .reset_index(name='tf')
        .assign(doc=doc)
    )
    # Then add that data frame to our list
    tfs.append(doc_tf)

In [32]:
# Print the raw token counts of each example document, one table per document.
for text in documents.values():
    token_counts = pd.Series(text.split()).value_counts()
    print(token_counts.reset_index())
    

        index  0
0    launched  1
1        just  1
2      Codeup  1
3        last  1
4        that  1
5       weeks  1
6    thursday  1
7           a  1
8   announced  1
9       long.  1
10         is  1
11       they  1
12        new  1
13       data  1
14         18  1
15         It  1
16    science  1
17   program.  1
       index  0
0   Codeup's  1
1    teaches  1
2    science  1
3         on  1
4    program  1
5     Python  1
6     skills  1
7        and  1
8      using  1
9    pandas.  1
10     hands  1
11      data  1
        index  0
0        data  2
1     science  2
2           a  1
3    Codeup's  1
4     created  1
5          in  1
6     program  1
7     demand.  1
8          of  1
9    response  1
10         to  1
11       lack  1
12  percieved  1
13    talent,  1
14    growing  1
15        and  1
16        was  1
