In [None]:
import os
import re
import unicodedata

import bs4
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import acquire as a
import explore as ex
import modeling as m
import prepare as prep
from env import user, password, host

%matplotlib inline

In [None]:
# Section names passed to the acquire helper, in the order they are requested.
categories = [
    'india', 'business', 'sports', 'world',
    'politics', 'technology', 'startup', 'entertainment',
    'miscellaneous', 'hatke', 'science', 'automobile',
]

# Build the working DataFrame from the local acquire module.
# NOTE(review): later cells read df.body and df.category; the exact schema
# returned by get_inshorts_articles is not visible here — confirm in acquire.py.
df = a.get_inshorts_articles(categories)

In [None]:
# Run every article body through the project's cleaning helper
# (prep.ryans_clean, defined in prepare.py), then preview a single row.
df['body'] = df['body'].apply(prep.ryans_clean)
df.head(n=1)

In [None]:
# Remove the `date_modified` and `time_modified` columns.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution does not raise KeyError
# for columns that are already gone.
df = df.drop(columns=['date_modified', 'time_modified'], errors='ignore')

In [None]:
# Partition the articles into three frames with the project's split helper.
# NOTE(review): 'category' is presumably the stratification target — confirm
# against modeling.split.
splits = m.split(df, 'category')
train, validate, test = splits
train.head(n=5)

In [None]:
# Model input for each split: the article body text.
X_train, X_validate, X_test = (frame['body'] for frame in (train, validate, test))

In [None]:
# Target for each split: the article's category label.
y_train, y_validate, y_test = (frame['category'] for frame in (train, validate, test))

# TF-IDF Model

In [None]:
# TF-IDF assigns a weight to each word in each document, producing the numeric
# matrix that number-only models (such as the classifiers below) require.
# The vectorizer is fitted on the training text only; fit() returns the fitted
# vectorizer itself, so it can be bound in one step.
tfidf = TfidfVectorizer().fit(X_train)

# Encode all three splits with the vocabulary learned from train.
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(texts) for texts in (X_train, X_validate, X_test)
)

In [None]:
# Sparse vectors/matrices have tons of zeros.
# Densify only the first few rows: that is enough to see the zeros without
# materialising the full documents-by-vocabulary matrix in memory.
X_train_vectorized[:5].todense()

In [None]:
# Shape of the vectorized training data (rows, columns). The sparse matrix
# exposes .shape directly, so no dense copy is needed just to read it.
X_train_vectorized.shape

In [None]:
# With the text vectorized, a standard classifier can be trained on it.
# fit() returns the estimator, so construction and training are chained;
# the trailing bare name keeps the fitted model as the cell's displayed output.
lm = LogisticRegression().fit(X_train_vectorized, y_train)
lm

In [None]:
# One results frame per split, seeded with the true labels in an `actual` column.
# Note: this rebinds train/validate/test, replacing the article frames
# produced by the earlier split cell.
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

In [None]:
# Preview the first rows of the training results frame.
train.head(n=5)

In [None]:
# Add the fitted model's predicted label to each split's results frame,
# pairing every frame with its own vectorized inputs.
for results, features in (
    (train, X_train_vectorized),
    (validate, X_validate_vectorized),
    (test, X_test_vectorized),
):
    results['predicted'] = lm.predict(features)

In [None]:
# Preview the training results again, now with the `predicted` column attached.
train.head(n=5)

In [None]:
# Train accuracy: fraction of rows whose prediction equals the true label.
train['actual'].eq(train['predicted']).mean()

In [None]:
# Out-of-sample accuracy, measured on the validate split.
validate['actual'].eq(validate['predicted']).mean()

In [None]:
# Per-class precision / recall / F1 for the training split.
# classification_report comes from the import cell at the top of the notebook,
# so it is not re-imported here.
print(classification_report(train.actual, train.predicted))

In [None]:
# Category labels taken from the index of value_counts() on the true labels,
# i.e. ordered from most to least frequent in the training results.
category = train['actual'].value_counts().index.to_list()

In [None]:
# Display the list of category labels built from the training results.
category

In [None]:
# Per-category training accuracy: keep only the rows whose true label is the
# category, then report the share of those rows predicted correctly.
for label in category:
    rows = train.loc[train['actual'] == label]
    hit_rate = rows['actual'].eq(rows['predicted']).mean()
    print(f"Predicting {label} has {round(hit_rate, 2)}")

In [None]:
# Count of true labels per category, alongside the frame's (rows, columns).
(train['actual'].value_counts(), train.shape)

In [None]:
# Second pass: one stratified 80/20 train/test split over the whole corpus.
# The raw text is split *before* vectorizing and the vectorizer is fitted on
# the training text only, so test documents cannot leak into the vocabulary or
# IDF weights (matching the first TF-IDF pipeline, which also fits on train only).
# random_state pins the shuffle so the metrics reported below are reproducible.
# Note: this rebinds tfidf, X_train, X_test, y_train, y_test, train, test and lm.
y = df.category
body_train, body_test, y_train, y_test = train_test_split(
    df.body, y, stratify=y, test_size=.2, random_state=123
)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(body_train)
X_test = tfidf.transform(body_test)
# Whole corpus encoded with the training vocabulary; kept so `X` stays defined.
X = tfidf.transform(df.body)

# Results frames: true label now, prediction added once the model is fitted.
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [None]:
# Training scorecard: accuracy, confusion matrix (rows = predicted,
# columns = actual), then the per-class classification report.
print(
    f'Accuracy: {accuracy_score(train.actual, train.predicted):.2%}',
    '---',
    'Confusion Matrix',
    pd.crosstab(train.predicted, train.actual),
    '---',
    classification_report(train.actual, train.predicted),
    sep='\n',
)

In [None]:
# Test scorecard: accuracy, confusion matrix (rows = predicted,
# columns = actual), then the per-class classification report.
print(
    f'Accuracy: {accuracy_score(test.actual, test.predicted):.2%}',
    '---',
    'Confusion Matrix',
    pd.crosstab(test.predicted, test.actual),
    '---',
    classification_report(test.actual, test.predicted),
    sep='\n',
)


# More models

In [None]:
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Single-nearest-neighbour classifier with distance weighting, trained on the
# current X_train / y_train. fit() returns the estimator, so it is chained;
# the trailing bare name keeps the fitted model as the cell's displayed output.
knn = KNeighborsClassifier(weights='distance', n_neighbors=1).fit(X_train, y_train)
knn

In [None]:
# In-sample KNN output: hard label predictions and per-class probabilities.
y_pred, y_pred_proba = knn.predict(X_train), knn.predict_proba(X_train)

In [None]:

# Training-set scorecard for the KNN model: accuracy, confusion matrix and
# per-class report, all computed from the in-sample predictions in y_pred.
# k is read from the fitted estimator so the printed label always matches the
# model actually in use.
# confusion_matrix is imported from sklearn.metrics in the notebook's import cell.
print('Accuracy of KNN classifier on training set n_neighbors set to {}: {:.2f}'
     .format(knn.n_neighbors, knn.score(X_train, y_train)))
print(f'The confusion matrix:\n {confusion_matrix(y_train, y_pred)}\n')
print(f'Classification Report:\n{classification_report(y_train, y_pred)}\n')


In [None]:
# Compare in-sample and held-out accuracy for the KNN model.
# The held-out score uses X_test / y_test: they come from the same split and
# the same fitted vectorizer as the X_train the model was trained on.
# X_validate_vectorized belongs to the earlier pipeline (different vectorizer
# fit and different split), so its columns and rows do not match this model.
# k is read from the fitted estimator so the label always matches the model.
print('Accuracy of KNN classifier on training set n_neighbors set to {}: {:.2f}'
     .format(knn.n_neighbors, knn.score(X_train, y_train)))
print('Accuracy of KNN classifier on test set with n_neighbors set to {}: {:.2f}\n'
     .format(knn.n_neighbors, knn.score(X_test, y_test)))