In [1]:
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from prepare import basic_clean, lemmatize

import re
import unicodedata
import pandas as pd
import nltk

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import classification_report

import acquire as a 
import prepare as p

In [2]:
# News sections to request; the list is handed as-is to the acquire helper.
categories = [
    "business",
    "sports",
    "technology",
    "entertainment",
]
news_df = a.get_all_news_articles(categories)

In [3]:
# Pass the raw articles through p.full_df (from the prepare module) and preview
# two rows of the result.
# NOTE(review): this rebinds news_df to full_df's output, so re-running the cell
# feeds the processed frame back into full_df — confirm that is safe.
news_df = p.full_df(news_df, "title")
news_df.head(2)

Unnamed: 0,original,category,clean,stemmed,lemmatized
0,Several companies in India have been offering ...,business,several companies india offering extra holiday...,sever compani india offer extra holiday employ...,several company india offering extra holiday e...
1,"Bill Gates' Cascade Investment, a holding comp...",business,bill gates ' cascade investment holding compan...,bill gate ' cascad invest hold compani gate cr...,bill gate ' cascade investment holding company...


In [4]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt', "'"]

def clean(text):
    """
    Normalize a string and return its lemmatized, stopword-free tokens.

    Steps, in order:
      1. NFKD-normalize, drop anything that cannot be encoded as ASCII,
         and lowercase.
      2. Strip every character that is neither a word character nor
         whitespace, then split on whitespace.
      3. Discard NLTK English stopwords plus ADDITIONAL_STOPWORDS.
      4. Lemmatize each surviving token with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Use a set: this function is called on whole corpora, and a list would
    # cost a linear scan of the stopword list for every single token.
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    # The stopword check runs on the raw token, before lemmatization.
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [5]:
# Join every lemmatized article into one corpus string, clean it, and
# re-join the resulting tokens with single spaces.
string = " ".join(
    clean(" ".join(news_df.lemmatized))
)

In [6]:
words = pd.Series(string.split())

# value_counts() supplies the raw term counts. The derived measures follow
# from it: each term's share of all tokens, and that share relative to the
# most frequent term.
(
    pd.DataFrame({'raw_count': words.value_counts()})
    .assign(
        frequency=lambda counts: counts['raw_count'] / counts['raw_count'].sum(),
        augmented_frequency=lambda counts: counts['frequency'] / counts['frequency'].max(),
    )
)

Unnamed: 0,raw_count,frequency,augmented_frequency
said,69,0.019387,1.000000
added,31,0.008710,0.449275
covid19,30,0.008429,0.434783
india,27,0.007586,0.391304
company,23,0.006462,0.333333
...,...,...,...
indiahas,1,0.000281,0.014493
lining,1,0.000281,0.014493
cain,1,0.000281,0.014493
future,1,0.000281,0.014493


In [7]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """
    Three-way split of ``df`` into train, validate, and test datasets.

    The test set is carved out of ``df`` first; the validate set is then
    carved out of what remains, so ``validate_size`` is a fraction of the
    post-test remainder, not of the full frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    stratify_by : str, optional
        Column name to stratify both splits on. When None (the default)
        the splits are not stratified.
    test_size : float, default 0.2
        Passed to the first train_test_split call.
    validate_size : float, default 0.3
        Passed to the second train_test_split call.
    random_state : int, default 123
        Seed used for both splits.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    # Guard the default: indexing df[None] would raise instead of simply
    # skipping stratification.
    strata = df[stratify_by] if stratify_by is not None else None
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=strata
    )

    # Recompute the strata from the reduced train frame so they line up with it.
    strata = train[stratify_by] if stratify_by is not None else None
    train, validate = train_test_split(
        train, test_size=validate_size, random_state=random_state, stratify=strata
    )

    return train, validate, test

In [8]:
# Three-way split of the articles, stratified on the category column.
train, validate, test = split(news_df, 'category')

In [9]:
# Preview the first two rows of the training split.
train.head(2)

Unnamed: 0,original,category,clean,stemmed,lemmatized
15,US lawmaker Pramila Jayapal responded to Tesla...,business,us lawmaker pramila jayapal responded tesla ce...,us lawmak pramila jayap respond tesla ceo worl...,u lawmaker pramila jayapal responded tesla ceo...
4,American biotechnology company Moderna has rai...,business,american biotechnology company moderna raised ...,american biotechnolog compani moderna rais 202...,american biotechnology company moderna raised ...


In [10]:
# Features for each split: the lemmatized text column.
X_train, X_validate, X_test = (
    part.lemmatized for part in (train, validate, test)
)

In [11]:
# Targets for each split: the category column.
y_train, y_validate, y_test = (
    part.category for part in (train, validate, test)
)

In [12]:
# Sanity-check the training features: first five lemmatized documents.
X_train.head()

15    u lawmaker pramila jayapal responded tesla ceo...
4     american biotechnology company moderna raised ...
20    fitch solution said resurgence covid19 case ex...
57    dogecoin soared around 14000 far year surpasse...
97    actor ajay devgn producer anand pandit set ope...
Name: lemmatized, dtype: object

In [13]:
# Build the vectorizer and fit it on the training text only, so nothing from
# validate/test influences the fitted state.
tfidf = TfidfVectorizer().fit(X_train)

# Transform every split with that same fitted vectorizer.
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(texts) for texts in (X_train, X_validate, X_test)
)

In [14]:
# Now that the text is vectorized, we can use standard classification tools.
# The model is fitted exactly once; fit() returns the estimator, so the cell
# still displays it.
lm = LogisticRegression()
lm.fit(X_train_vectorized, y_train)

LogisticRegression()

In [15]:
# One results frame per split, seeded with the true labels.
# Note: this rebinds train/validate/test, replacing the split frames they held.
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

In [16]:
# Add the model's predicted label as a new column on each results frame,
# using the vectorized features that belong to the same split.
train['predicted'] = lm.predict(X_train_vectorized)
validate["predicted"] = lm.predict(X_validate_vectorized)
test['predicted'] = lm.predict(X_test_vectorized)

In [17]:
# Preview actual vs. predicted labels for the first five training rows.
train.head()

Unnamed: 0,actual,predicted
15,business,business
4,business,business
20,business,business
57,technology,technology
97,entertainment,entertainment


In [18]:
# Train accuracy: share of rows whose prediction equals the true label
train.actual.eq(train.predicted).mean()

0.9821428571428571

In [19]:
# Validate accuracy: share of rows whose prediction equals the true label
validate.actual.eq(validate.predicted).mean()

0.7083333333333334

In [20]:
# Vectorize two ad-hoc lines of text with the already-fitted tfidf object.
lines = tfidf.transform(pd.Series([
    "Speaking about India's second COVID-19 wave",
    "marvel video titled marvel studio celebrates",
]))


In [21]:
# Predict a category for each of the two vectorized lines.
# NOTE(review): the model was trained on the lemmatized column, while the first
# line above is raw text that never went through that preparation — confirm
# this mismatch is intended.
lm.predict(lines)

array(['business', 'sports'], dtype=object)

In [22]:
 print(classification_report(train.actual, train.predicted))

               precision    recall  f1-score   support

     business       0.93      1.00      0.97        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      0.93      0.96        14

     accuracy                           0.98        56
    macro avg       0.98      0.98      0.98        56
 weighted avg       0.98      0.98      0.98        56



___
# Ham VS Spam

In [23]:
# Pull every row of the spam table, then index the frame by its id column.
sql = "SELECT * FROM spam"
df = a.get_data(sql, "spam_db")
df = df.set_index('id')

In [24]:
# Preview the first two messages with their labels.
df.head(2)

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [25]:
# Join every message into one corpus string, clean it, and re-join the
# resulting tokens with single spaces.
string = " ".join(
    clean(" ".join(df.text))
)

In [26]:
words = pd.Series(string.split())

# value_counts() supplies the raw term counts. The derived measures follow
# from it: each term's share of all tokens, and that share relative to the
# most frequent term.
(
    pd.DataFrame({'raw_count': words.value_counts()})
    .assign(
        frequency=lambda counts: counts['raw_count'] / counts['raw_count'].sum(),
        augmented_frequency=lambda counts: counts['frequency'] / counts['frequency'].max(),
    )
)

Unnamed: 0,raw_count,frequency,augmented_frequency
call,600,0.011927,1.000000
im,462,0.009184,0.770000
get,397,0.007892,0.661667
ur,384,0.007633,0.640000
go,304,0.006043,0.506667
...,...,...,...
denying,1,0.000020,0.001667
box334,1,0.000020,0.001667
08702840625,1,0.000020,0.001667
6cruel,1,0.000020,0.001667


In [28]:
# Reuse the notebook-level ADDITIONAL_STOPWORDS list rather than repeating the
# literal, so the stopwords used here and in clean() cannot drift apart.
# A copy is passed so the helper always receives a fresh list.
# NOTE(review): this rebinds df to the prepared frame, so re-running the cell
# feeds prepared data back into prep_article_data — confirm that is safe.
df = p.prep_article_data(df, "text", extra_words=list(ADDITIONAL_STOPWORDS), keeper_col="label")