# News Classification Model

### Step 1. Read in the data

In [None]:
# install all packages needed

#!pip install gensim
#!pip install pyLDAvis
#!pip3 install openpyxl --upgrade

In [220]:
# import all libraries needed

import pandas as pd
from pandas import read_csv
import os
import openpyxl
#import jupyter_resource_usage
import re
import math
import collections
import spacy
import de_core_news_sm
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import langid

# nltk used for parsing and cleaning text
import nltk
import unicodedata
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
from scipy import spatial
from itertools import combinations

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models


## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arminberger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arminberger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
current_dir = os.getcwd()
current_dir 

'/Users/arminberger/Desktop/Code/fit5120_code'

Data overview:

train.csv: A full training dataset with the following attributes:

- id: unique id for a news article
- title: the title of a news article
- author: author of the news article
- text: the text of the article; could be incomplete
- label: a label that marks the article as potentially unreliable

1: unreliable
0: reliable

test.csv: A testing training dataset with all the same attributes at train.csv without the label.

submit.csv: A sample submission that you can

In [177]:
# read in all three data sets

train = pd.read_csv(f'{current_dir }/data_fake_news/kaggle_news_dataset/train.csv')

test = pd.read_csv(f'{current_dir }/data_fake_news/kaggle_news_dataset/test.csv')

submit = pd.read_csv(f'{current_dir }/data_fake_news/kaggle_news_dataset/submit.csv')

In [178]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [179]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [180]:
train = train[['text','label']]
train

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1
...,...,...
20795,Rapper T. I. unloaded on black celebrities who...,0
20796,When the Green Bay Packers lost to the Washing...,0
20797,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [181]:
train.text.is_unique

False

In [182]:
train.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

1: unreliable
0: reliable

Since not all values are unique we will only keep observations.

In [183]:
train.drop_duplicates(inplace = True)

In [185]:
train.shape

(20387, 2)

In [186]:
# function which checks if text is not in english
def text_is_english(text):
    
    if isinstance(text, str):
        
        if langid.classify(text)[0]!='en':
 
            return True
        

In [187]:
# check if tweet is not in english ans sa
train['bool_true'] = train.text.apply(lambda x: text_is_english(x))

# get a list of all indicese that need to be dropped
drop_index = train.index[train.bool_true == True].to_list()

In [188]:
# drop the indicese 
train.drop(drop_index, inplace = True)

In [189]:
# drop the last column
train.drop(columns = 'bool_true', inplace = True)

In [190]:
train.shape

(19881, 2)

### Step 2. Preprocess the text data

In [191]:
# save list of all stopwords
english_stop_words = stopwords.words('english')

In [192]:
# function to remove unwanted characters from as string
def remove_char(text):
    
    # list of unwated charaters 
    remove_characters = ["'",';', ':', '!', "*", '/', '\\',"(", ")",",",".","-", "&","\n",'“','@', '–', '"', '+', '=', '[',']', '?', '”']
    
    # loop through all unwated characters 
    for character in remove_characters:
                         
        text = text.replace(character, " ")
                         
    return text

In [193]:
# function that replaces accentuated characters by their non-accentuated counterparts
# in a string
def remove_accents(text):
    
    text = unicodedata.normalize('NFKD', text)
    
    return "".join([c for c in text if not unicodedata.combining(c)]) 

In [194]:
# function to clean a string and turn it into a uniform format
# we can either keep numbers or remove them
def clean_string(text):
    
    text = str(text)
    
    text = text.lower()
        
    text = text.replace("'","")
    
    text = remove_char(text)

    text = text.strip(' ')
    
    text = remove_accents(text)

    return text
   


In [195]:
news_list_unprocessed = train.text.to_list()

In [196]:
## build a vocab for cleaned/standardized keyword token, then count them 

# set for all tokens 
news_tokens = []

# loop through each keyword name 
for i in news_list_unprocessed:
    
    # only edit strings 
    if type(i) == str:
    
        # split based on white spaces and create a list of tokens
        tokens = i.split(' ')

        # loop through all the tokens
        for x in tokens:

            # clean the string
            x = clean_string(x)
             
            # check if a token is a stopword or non
            if x not in english_stop_words and x is not None:
                
                # check if a token is a number or larger than 1
                if x.isnumeric() == False and len(x) > 1:

                    # append the cleaned string
                    news_tokens.append(x)

    
# print lenght
print(len(news_tokens))

# get the count of each token accross all documents
token_frequencey = FreqDist(news_tokens)

8494176


In [197]:
token_frequencey

FreqDist({'said': 79595, 'mr': 66177, 'trump': 43854, 'one': 37093, 'would': 36883, 'people': 34853, 'new': 29502, 'like': 25653, 'also': 25175, 'president': 22947, ...})

In [198]:
# function which turns an uncleaned sting containing a number of tokens into a list of cleaned tokens 
def futher_process_string(text):
    
    # ensure that the text is in string format
    text = str(text)
    
    # don't keep numbers
    if text.isnumeric() == False:
        
        # split the string into individual tokens
        text = text.split(' ')
        
        # save a list of strings
        final_string_list = []
        
        # loop through all tokens
        for token in text:
            
            # clean the string
            token = clean_string(token)
            
            # don't keep numbers
            if token.isnumeric() == False and token not in english_stop_words:

                # ensure that the token is not None
                if  token is not None and token != '':
                    
                    # lemmatize the token 
                    token = WordNetLemmatizer().lemmatize(token)

                    # append the cleaned and lemmatized token
                    final_string_list.append(token)
        
        # return the the final string list
        return final_string_list
        

In [199]:
%%time

# preprocess all news articles
news_list_processed = [futher_process_string(x)  for x in news_list_unprocessed]

CPU times: user 2min 9s, sys: 1.01 s, total: 2min 10s
Wall time: 2min 11s


## Step 3. Vectorize text

In [214]:
# train the word2vec model using the gensim library
model = gensim.models.Word2Vec(
        news_list_processed,  # corpus used for training
        window=300,           # size of the embeddig
        min_count=2,          # min token occurance
        workers=4)   

In [215]:
model.wv.most_similar('democrat')

[('democratic', 0.5454925298690796),
 ('schumer', 0.516161322593689),
 ('chuck', 0.47959956526756287),
 ('liberal', 0.47745364904403687),
 ('pelosi', 0.4707755446434021),
 ('reid', 0.45023971796035767),
 ('democrats’', 0.4495120048522949),
 ('dianne', 0.4466678500175476),
 ('feinstein', 0.4361237585544586),
 ('progressive', 0.43353983759880066)]

In [None]:
model.