<a href="https://colab.research.google.com/github/mhmoodlan/ABC-Artificial-Bee-Colony/blob/master/code/data/download-exported-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install newspaper3k
!pip install readability-lxml
!pip install langid

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.models import load_model
from newspaper import Article
from readability import Document
from bs4 import BeautifulSoup
# Seed TensorFlow's and NumPy's global random generators with 43.
# `tf.set_random_seed` only exists in TensorFlow 1.x; TensorFlow 2.x exposes
# `tf.random.set_seed` instead, so call whichever this runtime provides.
if hasattr(tf, 'random') and hasattr(tf.random, 'set_seed'):
  tf.random.set_seed(43)
else:
  tf.set_random_seed(43)
np.random.seed(43)

Using TensorFlow backend.


In [0]:
from google.colab import drive
# Mount Google Drive at the given mount point.
drive.mount(mountpoint='/content/drive')

# Base folder from which the file paths used in this notebook are built.
DATA_PATH = '/content/drive/My Drive/NLP project/data/raw-data'

In [0]:
# Load the exported article CSV and show which columns it provides.
articles_csv_path = DATA_PATH + '/our-data/hand-labeled-not-related-data/Article-2019-07-01.csv'
articles = pd.read_csv(filepath_or_buffer=articles_csv_path)
articles.columns

Index(['region', 'tags', 'id', 'url', 'title', 'created', 'category',
       'source__url', 'source__location', 'source__type', 'topic__name'],
      dtype='object')

In [0]:
import datetime
from dateutil import parser

def _is_skipped_date(created):
  """Return True when `created` parses to a date between the 1st and 24th of March (year ignored)."""
  stamp = parser.parse(created)
  return stamp.month == 3 and stamp.day <= 24

# Keep every row of `articles` whose 'created' date is outside the skipped window.
tf_articles = []
for index_, article in articles.iterrows():
  if not _is_skipped_date(article['created']):
    tf_articles.append(article)
  

In [0]:
# Rebuild a frame from the rows kept so far, then retain only the rows whose
# 'created' value starts (first 10 characters) with a date string that sorts
# at or before '2019-03-26'. The comparison is a plain string comparison.
article_columns = ['region', 'tags', 'id', 'url', 'title', 'created', 'category',
                   'source__url', 'source__location', 'source__type', 'topic__name']
tmp_df = pd.DataFrame(data=tf_articles, columns=article_columns)
tmp_articles = [
  row for i_, row in tmp_df.iterrows()
  if row['created'][:10] <= '2019-03-26'
]
# Number of rows that passed the filter (displayed as the cell's output).
tmp_counter = len(tmp_articles)
tmp_counter

386

In [0]:
# From here on `tf_articles` refers to the rows collected in `tmp_articles`.
tf_articles = tmp_articles

In [0]:
import langid
def is_arabic(text):
  """Return True when langid identifies `text` as Arabic.

  Args:
    text: the string to classify.

  Returns:
    bool: True if langid's top language code is 'ar'; False otherwise,
    including when `text` is not a string or contains only whitespace.
  """
  # Nothing to classify: avoid handing langid a non-string or blank value.
  if not isinstance(text, str) or not text.strip():
    return False
  lang, _ = langid.classify(text)
  return lang == 'ar'

In [0]:
# Download every remaining article, extract its readable title and body, and
# keep the ones whose title is Arabic as {'text', 'created'} records.
fetched_tf_articles = []
for i, article_row in enumerate(tf_articles):
  # Resolve the URL before the try-block so the failure message below can
  # always report it, even when constructing `Article` itself raises.
  article_url = article_row['url']
  print(i)
  try:
    article = Article(article_url)
    article.download()
    parsed_article = Document(article.html)
    article_title = parsed_article.title()
    # Language is judged on the title alone, so skip non-Arabic pages before
    # paying for the body extraction.
    if not is_arabic(article_title):
      continue
    article_content = BeautifulSoup(parsed_article.summary(), 'lxml').get_text()
    text = '{} {}'.format(article_title, article_content)
    fetched_tf_articles.append({'text': text, 'created': article_row['created']})
  except Exception as e:
    # Best-effort scrape: report the failure and move on to the next article.
    print('failed on article {}, error message: {}'.format(article_url, str(e)))

In [0]:
# Collect the scraped records into a frame and write it to CSV.
fetched_tf_articles_df = pd.DataFrame(columns=['text', 'created'], data=fetched_tf_articles)
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy. The row index is not written either way.
fetched_tf_articles_df.to_csv(DATA_PATH+'/our-data/hand-labeled-not-related-data/Article-2019-07-01-texts.csv', index=False)

In [0]:
# Read the scraped texts back from the CSV.
texts_csv_path = DATA_PATH + '/our-data/hand-labeled-not-related-data/Article-2019-07-01-texts.csv'
fetched_tf_articles_df = pd.read_csv(filepath_or_buffer=texts_csv_path)

In [0]:
# Row count first, then a preview of the first five rows.
print(fetched_tf_articles_df.shape[0])
fetched_tf_articles_df.head(n=5)

1674


Unnamed: 0,text,created
0,وسائل إعلام: 4 قتلى و8 جرحى جراء قصف الحوثيين ...,2019-06-30 22:39:12
1,أثار عاصفة سياسية.. اشتباك في جبل لبنان يودي ب...,2019-06-30 22:31:16
2,مقتل يمنيين في قصف حوثي على مدينة تعز عدن- ...,2019-06-30 21:06:54
3,غضب المتظاهرين في البصرة يطال منازل المسؤولين ...,2019-06-30 17:34:04
4,قتيلان في إطلاق نار على موكب وزير لبناني (فيدي...,2019-06-30 17:12:59


In [0]:
import nltk
# Newer NLTK releases make `word_tokenize` load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch both. 'punkt' stays last so the cell
# still displays the status of that download.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk import word_tokenize
from collections import defaultdict
import pickle

# Load the saved Keras classifier.
model = load_model(DATA_PATH + '/../models/old-classifier-from-github/classifier_v2.h5')
# NOTE(security): unpickling executes arbitrary code, so only load vocabulary
# files that come from a trusted source.
# The context manager closes the file handle that a bare `open(...)` would leak.
with open(DATA_PATH + '/../models/old-classifier-from-github/itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)
# Reverse lookup from each `itos` entry to its position; keys that are not in
# `itos` fall back to index 0.
stoi = defaultdict(lambda: 0, {token: idx for idx, token in enumerate(itos)})
def create_toks(data):
    """Split `data` into tokens with `word_tokenize` and return them."""
    tokens = word_tokenize(data)
    return tokens

def toks2ids(toks):
    """Map tokens to their vocabulary indices via the module-level `stoi` lookup.

    Args:
        toks: iterable of tokens to look up in `stoi`.

    Returns:
        np.ndarray: 1-D integer array with one index per token.
    """
    # Force an integer dtype: with no tokens NumPy would otherwise produce an
    # empty float64 array, which cannot be used as an index array.
    return np.array([stoi[tok] for tok in toks], dtype=int)

def vectorize_ids(ids, ndim=30002):
    """Build a 1 x `ndim` multi-hot row with 1.0 at every position listed in `ids`.

    Args:
        ids: integer indices to switch on; may be empty.
        ndim: width of the returned row.

    Returns:
        np.ndarray: float array of shape (1, ndim).
    """
    result = np.zeros((1, ndim))
    positions = np.asarray(ids)
    # An empty array defaults to float64 and NumPy refuses it as an index, so an
    # input with no ids simply yields the all-zero row.
    if positions.size == 0:
        return result
    result[0, positions] = 1.
    return result

def preprocess(data):
    """Chain `create_toks`, `toks2ids` and `vectorize_ids` over `data` and return the result."""
    return vectorize_ids(toks2ids(create_toks(data)))

def classify(input, threshold=0.75):
    """Run the model on one text and threshold its first output value.

    Args:
        input: text to classify; passed to `preprocess`. The parameter name is
            kept for backward compatibility although it shadows the builtin.
        threshold: minimum value of the model's first output that counts as a
            positive result. Defaults to 0.75, the previously hard-coded cut-off.

    Returns:
        Truthy when `model.predict(...)[0][0]` is >= `threshold`.
    """
    sample = preprocess(input)
    result = model.predict(sample)
    return result[0][0] >= threshold

In [0]:
# Attach a 'label' to every scraped article (1 when `classify` is truthy, else 0),
# collect the labelled rows, and tally both outcomes.
results = []
pos = neg = 0
for index_, article in fetched_tf_articles_df.iterrows():
  res = classify(article['text'])
  label = 1 if res > 0 else 0
  pos += label
  neg += 1 - label
  article['label'] = label
  results.append(article)
# Printed in order: input rows, positives, negatives, labelled rows.
for count in (len(fetched_tf_articles_df), pos, neg, len(results)):
  print(count)

1674
1518
156
1674


In [0]:
# Collect the labelled rows into a frame and write it to CSV.
results_df = pd.DataFrame(columns=['text', 'created', 'label'], data=results)
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy. The row index is not written either way.
results_df.to_csv(DATA_PATH+'/our-data/hand-labeled-not-related-data/results.csv', index=False)

In [0]:
# Rows whose label is 0, followed by how many there are.
neg_results = [labelled for labelled in results if labelled['label'] == 0]
len(neg_results)

156

In [0]:
# Frame of the label-0 rows, with a preview of the first five.
neg_results_df = pd.DataFrame(data=neg_results, columns=['text', 'created', 'label'])
neg_results_df.head(n=5)

Unnamed: 0,text,created,label
14,"محتجو ""السترات الصفراء"" يتحدون الحر ويواصلون ا...",2019-06-29 23:08:11,0
36,أوروبا تغلي... حرارة قياسية وحرائق ووفيات | ال...,2019-06-28 21:16:34,0
43,السلطات الإثيوبية تحتجز المتحدث باسم حزب سياسي...,2019-06-28 15:35:14,0
46,انتخاب نيامكو سابوني رئيسة للحزب الليبرالي الس...,2019-06-28 12:01:32,0
55,فلسطين والمملكة المتحدة يفتتحان مبنى مجلس قروي...,2019-06-28 00:19:30,0


In [0]:
# Write the label-0 rows to CSV. `index` is documented as a bool, so pass False
# explicitly rather than None; the row index is not written either way.
neg_results_df.to_csv(DATA_PATH+'/our-data/hand-labeled-not-related-data/neg_results.csv', index=False)

In [0]:
# Rows whose label is 1, followed by how many there are.
pos_results = [labelled for labelled in results if labelled['label'] == 1]
len(pos_results)

1518

In [0]:
# Frame of the label-1 rows, with a preview of the first five.
pos_results_df = pd.DataFrame(data=pos_results, columns=['text', 'created', 'label'])
pos_results_df.head(n=5)

Unnamed: 0,text,created,label
0,وسائل إعلام: 4 قتلى و8 جرحى جراء قصف الحوثيين ...,2019-06-30 22:39:12,1
1,أثار عاصفة سياسية.. اشتباك في جبل لبنان يودي ب...,2019-06-30 22:31:16,1
2,مقتل يمنيين في قصف حوثي على مدينة تعز عدن- ...,2019-06-30 21:06:54,1
3,غضب المتظاهرين في البصرة يطال منازل المسؤولين ...,2019-06-30 17:34:04,1
4,قتيلان في إطلاق نار على موكب وزير لبناني (فيدي...,2019-06-30 17:12:59,1


In [0]:
# Write the label-1 rows to CSV. `index` is documented as a bool, so pass False
# explicitly rather than None; the row index is not written either way.
pos_results_df.to_csv(DATA_PATH +'/our-data/hand-labeled-not-related-data/pos_results.csv', index=False)

In [0]:
# Read the saved label-1 rows back in as `test_data`.
pos_results_csv_path = DATA_PATH +'/our-data/hand-labeled-not-related-data/pos_results.csv'
test_data = pd.read_csv(filepath_or_buffer=pos_results_csv_path)

In [0]:
# Run the classifier again over `test_data`, overwrite each row's 'label' with
# the fresh verdict (1 when `classify` is truthy, else 0), and tally both outcomes.
test_results = []
test_pos = test_neg = 0
for index_, article in test_data.iterrows():
  res = classify(article['text'])
  label = 1 if res > 0 else 0
  test_pos += label
  test_neg += 1 - label
  article['label'] = label
  test_results.append(article)
# Printed in order: input rows, positives, negatives, labelled rows.
for count in (len(test_data), test_pos, test_neg, len(test_results)):
  print(count)

1518
1518
0
1518
