In [45]:
import pandas as pd
import numpy as np

In [2]:
import acquire
import prepare

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [11]:
df = pd.read_csv('spam_clean.csv')

In [12]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression()
lm.fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)



In [15]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.44%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3857   112
spam          2   486
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       1.00      0.81      0.90       598

   micro avg       0.97      0.97      0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.98      0.97      0.97      4457



In [27]:
news = acquire.get_news_articles()

In [35]:
def clean(text):
    return prepare.remove_stopwords(prepare.lemmatize(prepare.basic_clean(text)))

In [36]:
df = pd.DataFrame(news)

In [37]:
df.content = df.content.apply(clean)

In [38]:
df.title = df.title.apply(clean)

In [39]:
df.head()

Unnamed: 0,category,content,title
0,business,world fourth richest person warren buffett cla...,4th richest man buffett say tesla cant sell in...
1,business,gujarat based diamond trader savji dholakia kn...,guj trader gifted car staff remove illegal roa...
2,business,world fourth richest person warren buffett ha ...,ill never hesitate fly 737 max buffett crash k...
3,business,activist hyderabad claimed infosys deducting p...,infosys make hyd staff pay parking activist ca...
4,business,indigo talk airbus add new plane including lon...,indigo talking airbus buy yet released a321 xl...


In [41]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.content)
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression()
lm.fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)



In [42]:
def get_confusion_metrics(cnf):
    FP = cnf.sum(axis=0) - np.diag(cnf)  
    FN = (cnf.sum(axis=1)) - np.diag(cnf)
    TP = np.diag(cnf)
    TN = cnf.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    print(f'Recall: {TPR}')
    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
    print(f'True Negative Rate: {TNR}')
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    print(f'Precision: {PPV}')
    # Negative predictive value
    NPV = TN/(TN+FN)
    print(f'Negative Predictive Value: {NPV}')
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    print(f'False positive Rate: {FPR}')
    # False negative rate
    FNR = FN/(TP+FN)
    print(f'False Negative Rate: {FNR}')
    # False discovery rate
    FDR = FP/(TP+FP)
    print(f'False Discovery Rate: {FDR}')
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)
    print(f'Overall Accuracy: {ACC}')

In [43]:
cnf = pd.crosstab(train.predicted, train.actual)

In [46]:
get_confusion_metrics(cnf)

Recall: predicted
business         1.000000
entertainment    1.000000
sports           1.000000
technology       0.952381
dtype: float64
True Negative Rate: actual
business         0.0
entertainment    NaN
sports           NaN
technology       1.0
dtype: float64
Precision: actual
business         0.95
entertainment    1.00
sports           1.00
technology       1.00
dtype: float64
Negative Predictive Value: actual
business         NaN
entertainment    NaN
sports           NaN
technology      -inf
dtype: float64
False positive Rate: actual
business         1.0
entertainment    NaN
sports           NaN
technology      -0.0
dtype: float64
False Negative Rate: predicted
business         0.000000
entertainment    0.000000
sports           0.000000
technology       0.047619
dtype: float64
False Discovery Rate: actual
business         0.05
entertainment    0.00
sports           0.00
technology       0.00
dtype: float64
Overall Accuracy: actual
business         0.95
entertainment    1.00
sport

In [47]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 98.72%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business             19              0       0           0
entertainment         0             19       0           0
sports                0              0      19           0
technology            1              0       0          20
---
               precision    recall  f1-score   support

     business       1.00      0.95      0.97        20
entertainment       1.00      1.00      1.00        19
       sports       1.00      1.00      1.00        19
   technology       0.95      1.00      0.98        20

    micro avg       0.99      0.99      0.99        78
    macro avg       0.99      0.99      0.99        78
 weighted avg       0.99      0.99      0.99        78

