In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import keras
from keras_preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [62]:
pd.set_option('display.max_rows', None)

In [63]:
data = pd.read_csv('../data/news_articles.csv')

In [64]:
data.columns

Index(['author', 'published', 'title', 'text', 'language', 'site_url',
       'main_img_url', 'type', 'label', 'title_without_stopwords',
       'text_without_stopwords', 'hasImage'],
      dtype='object')

In [65]:
data.head(5)

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   2096 non-null   object 
 1   published                2096 non-null   object 
 2   title                    2096 non-null   object 
 3   text                     2050 non-null   object 
 4   language                 2095 non-null   object 
 5   site_url                 2095 non-null   object 
 6   main_img_url             2095 non-null   object 
 7   type                     2095 non-null   object 
 8   label                    2095 non-null   object 
 9   title_without_stopwords  2094 non-null   object 
 10  text_without_stopwords   2046 non-null   object 
 11  hasImage                 2095 non-null   float64
dtypes: float64(1), object(11)
memory usage: 196.6+ KB


In [67]:
data.describe(include='object').T

Unnamed: 0,count,unique,top,freq
author,2096,491,No Author,505
published,2096,2006,2016-10-30T13:00:00.000+02:00,8
title,2096,1784,no title,186
text,2050,1941,notify me of followup comments by email notify...,6
language,2095,5,english,2017
site_url,2095,68,wnd.com,100
main_img_url,2095,1229,No Image URL,466
type,2095,8,bs,601
label,2095,2,Fake,1294
title_without_stopwords,2094,1780,title,187


In [68]:
# Inspect other languages
print(data['language'].value_counts())
data_other_language = data[data['language'] != 'english']
data_other_language

language
english    2017
german       72
ignore        3
french        2
spanish       1
Name: count, dtype: int64


Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
848,davidduke.com,http://davidduke.com/wp-content/uploads/2014/0...,hate,httpmediaarchivesgsradionetddukemp dr duke fa...,ignore,davidduke.com,http://davidduke.com/wp-content/uploads/2016/1...,hate,Real,billion dollars jewish lobby tip iceberg,views share remarkable articleand admissionapp...,1.0
850,Staff,2016-11-20T14:58:00.000+02:00,billion dollars for the jewish lobby just the ...,views share in a remarkable articleand admiss...,ignore,davidduke.com,http://davidduke.com/wp-content/uploads/2016/1...,hate,Real,dont want break families encourage unaccompani...,share national bugle radio first step towards ...,1.0
855,Dr. Patrick Slattery,2016-11-22T10:47:49.603+02:00,national bugle radio the first step towards ov...,share \ntulsi gabbard for secretary of state a...,ignore,davidduke.com,http://davidduke.com/wp-content/uploads/2016/1...,hate,Real,dr duke pastor dankof trumps america first for...,september new homes sales rise back level davi...,1.0
936,No Author,2016-11-23T12:44:09.173+02:00,sparks fly as bikers for trump meet protesters,umfrage ceta würde enorm an zustimmung gewinne...,german,der-postillon.com,https://1.bp.blogspot.com/-fX3NNr8NmaE/Uci9s8r...,bs,Fake,weil sie sich prügelten gefängnisdirektor erte...,morgen neu kiosk postillon sonntag außerdem di...,1.0
937,noreply@blogger.com (Der Postillon),2016-10-27T10:54:56.270+03:00,studie beweist indianer kennen doch schmerz,steht ganz neben sich klonforscher nach erfolg...,german,der-postillon.com,https://3.bp.blogspot.com/-AgNqsD25l6M/WBHXDg6...,bs,Fake,umfrage ceta würde enorm zustimmung gewinnen w...,samstag oktober autofahrer entlarvt geheimen z...,1.0
938,noreply@blogger.com (Der Postillon),2016-10-27T14:08:21.816+03:00,weil sie sich prügelten gefängnisdirektor erte...,morgen neu am kiosk postillon am sonntag auße...,german,der-postillon.com,https://2.bp.blogspot.com/-sZO1SbratVQ/WBIRTor...,bs,Fake,newsticker,freitag oktober alle machen jagd auf ihn horro...,1.0
939,noreply@blogger.com (Der Postillon),2016-10-27T18:05:26.351+03:00,umfrage ceta würde enorm an zustimmung gewinne...,samstag oktober autofahrer entlarvt geheimen...,german,der-postillon.com,https://2.bp.blogspot.com/-0mdp0nZiwMI/UYwYvex...,bs,Fake,morgen neu kiosk postillon sonntag,montag oktober gruselig fc bayern verkleidet s...,1.0
940,noreply@blogger.com (Der Postillon),2016-10-28T10:34:13.418+03:00,newsticker,freitag oktober alle machen jagd auf ihn hor...,german,der-postillon.com,https://2.bp.blogspot.com/-cU9TKlQVRRg/WBSAnXO...,bs,Fake,autofahrer entlarvt geheimen zahlentrick mit d...,rausgewunken betrunkener autofahrer reagiert f...,1.0
941,noreply@blogger.com (Der Postillon),2016-10-29T16:01:22.811+03:00,morgen neu am kiosk postillon am sonntag,montag oktober gruselig fc bayern verkleidet...,german,der-postillon.com,https://1.bp.blogspot.com/-M9-qQTag-wM/Vi-k87F...,bs,Fake,alle machen jagd auf ihn horrorclown traut sic...,sonntag oktober sonntagsfrage machen sie heute...,1.0
942,noreply@blogger.com (Der Postillon),2016-10-29T15:09:00.000+03:00,autofahrer entlarvt geheimen zahlentrick mit d...,rausgewunken betrunkener autofahrer reagiert f...,german,der-postillon.com,https://1.bp.blogspot.com/-dWo3ZWGvWpE/WBMxjRv...,bs,Fake,gruselig fc bayern verkleidet sich zu hallowee...,sonntag oktober wegen ruhestörung gerufene pol...,1.0


In [69]:
# clean data
data['language'] = data['language'].replace('spanish', 'english')
data['language'] = data['language'].replace('french', 'english')
data['language'] = data['language'].replace('ignore', 'english')
data = data[data['language'] != 'german']

data['language'].value_counts()

language
english    2023
Name: count, dtype: int64