# News Classification Model

### Step 1. Install Packages

In [1]:
# install all packages needed

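# %pip install --upgrade scikit-learn   # the PyPI package is "scikit-learn"; "sklearn" is only the import name
# pickle ships with the Python standard library and does not need to be installed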

In [1]:
# import all libraries needed

import pandas as pd
from pandas import read_csv
import os
import spacy
import numpy as np
import matplotlib.pyplot as plt
import langid

# nltk used for parsing and cleaning text
import nltk
import unicodedata
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
from scipy import spatial
from itertools import combinations

# used to access the SQL database
import pymysql
# library that helps turn dataframes into sql tables
from sqlalchemy import create_engine

'''# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models'''


import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression

import pickle

## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arminberger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arminberger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Capture the kernel's current working directory and echo it as the cell output.
# NOTE(review): the recorded output exposes an absolute local path; clear it before sharing.
current_dir = os.getcwd()
current_dir 

'/Users/arminberger/Documents/GitHub/iteration_1_model'

### Step 2. Read in Data from MySQL database

In [3]:
# create connection
# Connection settings come from the environment so that credentials are never
# stored in the notebook. NEWS_DB_PASSWORD is mandatory: if it is unset,
# os.environ[...] raises KeyError before any network call is attempted.
# Host, user and schema fall back to the literal values shown here.
# NOTE(review): a password was previously committed in this cell; rotate it.
connection = pymysql.connect(
    host=os.environ.get(
        'NEWS_DB_HOST',
        'news-data-rdb.cqsnaejqwcpu.ap-southeast-2.rds.amazonaws.com',
    ),
    user=os.environ.get('NEWS_DB_USER', 'admin'),
    password=os.environ['NEWS_DB_PASSWORD'],
    # `database` is the documented keyword; `db` is a deprecated alias for it
    database=os.environ.get('NEWS_DB_NAME', 'NewsData'),
)

In [4]:
# Retrieve every column of labeled_news_data through the open connection;
# read_sql_query returns the rows as a pandas DataFrame.
SQL_Query = pd.read_sql_query(
    sql='''select
        *
        from labeled_news_data''',
    con=connection,
)

In [5]:
# Build the working DataFrame from the query result, restricted to the
# news_id, text and label columns.
news_sql_df = pd.DataFrame(SQL_Query, columns=['news_id', 'text', 'label'])

In [6]:
# Display the loaded frame as the cell output.
news_sql_df

Unnamed: 0,news_id,text,label
0,0,house dem aide didnt even see comeys letter ja...,1
1,1,ever get feeling life circle roundabout rather...,0
2,2,truth might get fired october tension intellig...,1
3,3,video civilian killed single u airstrike ident...,1
4,4,print iranian woman sentenced six year prison ...,1
...,...,...,...
19871,19876,rapper unloaded black celebrity met donald tru...,0
19872,19877,green bay packer lost washington redskin week ...,0
19873,19878,macys today grew union several great name amer...,0
19874,19879,nato russia hold parallel exercise balkan 11 0...,1


### Step 3. Vectorize text and train the model

In [7]:
# Collect the article texts as a plain Python list for the vectorizer.
x_train_text = news_sql_df['text'].tolist()

In [8]:
# TF-IDF over word unigrams and bigrams; terms that occur in fewer than
# three documents are left out of the vocabulary.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    token_pattern=r'(?u)\b\w\w+\b',  # tokens of two or more word characters
    lowercase=True,
    ngram_range=(1, 2),
    min_df=3,
)

In [9]:
# Learn the TF-IDF vocabulary from the texts and transform them into the
# feature matrix in a single call.
# NOTE(review): every row of the table is used for fitting; no hold-out
# split is made anywhere in this notebook.
x_train = vectorizer.fit_transform(x_train_text)

In [10]:
# Target labels as a plain Python list, taken from the same frame (and so
# in the same row order) as the texts.
y_train = news_sql_df['label'].tolist()

In [11]:
# save the fitted vectorizer to disk
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes or fails, instead of leaving an open handle behind.
filename = 'vectorizer.pk'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [12]:
# Statistical model used in this assignment: LogisticRegression constructed
# with its default settings.
model = LogisticRegression()

In [13]:
# Fit the LogisticRegression on the TF-IDF matrix and the label list; the
# value returned by fit() is bound back to `model`.
model = model.fit(x_train, y_train)

In [14]:
# save the fitted classifier to disk
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes or fails, instead of leaving an open handle behind.
filename = 'basic_news_logistic_regression.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

### OPTIONAL TEST - Step 4. Predict the news

In [16]:
# Disabled manual test: the whole cell is a single string literal, so none of
# the code inside it is executed.
# NOTE(review): `futher_process_string` is not defined in any cell shown in
# this notebook — confirm where it lives before re-enabling.
'''user_news_input = input('Paste your text here: ')
user_news_input_processed = futher_process_string(user_news_input)
user_news_input_processed = ' '.join(user_news_input_processed)
user_news_input_vec = vectorizer.transform([user_news_input_processed])'''

"user_news_input = input('Paste your text here: ')\nuser_news_input_processed = futher_process_string(user_news_input)\nuser_news_input_processed = ' '.join(user_news_input_processed)\nuser_news_input_vec = vectorizer.transform([user_news_input_processed])"

In [17]:
# Disabled manual test: the whole cell is a single string literal, so none of
# the code inside it is executed.
# NOTE(review): `user_news_input_vec` is only assigned inside the preceding
# disabled cell, so the two cells would have to be re-enabled together.
'''# prediction of our target
prediction = model.predict(user_news_input_vec)

if prediction[0] == 1:
    print('The news is likely to be UNRELIABLE!')
else:
    print('The news is likely to be RELIABLE!')'''

"# prediction of our target\nprediction = model.predict(user_news_input_vec)\n\nif prediction[0] == 1:\n    print('The news is likely to be UNRELIABLE!')\nelse:\n    print('The news is likely to be RELIABLE!')"

In [2]:
import numpy as np

In [7]:
# Draw one integer from {0, 1} with NumPy's global generator (no seed is set
# in this notebook) and unwrap it from the length-1 array.
result = np.random.randint(low=0, high=2, size=1)[0]

In [8]:
# Echo the drawn value as the cell output.
result

0