<a href="https://colab.research.google.com/github/mcPython95/Article_type_classification/blob/main/url_article_type_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Article Types from URLs Using Machine Learning

# Mount Google Drive

In [53]:
# Mount Google Drive at /content/drive so later cells can read the data and
# model files stored under it (Colab-only; requires an interactive authorisation
# the first time it runs in a session).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import libraries

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a global filter, so deprecation and convergence warnings
# from any library used in later cells are hidden as well.
warnings.filterwarnings('ignore')

# Install libraries

In [55]:
!pip install newspaper3k




In [56]:
!pip install sentence-transformers




# Parent directory path

In [57]:
# Directory on the mounted Drive that holds the input CSV and the pickled model
# artefacts. Later cells build file paths with plain string concatenation
# (parent_dir + '<file name>'), so the trailing slash is required.
parent_dir = '/content/drive/MyDrive/cape_start_accessment/'

# Read data

In [58]:
import pandas as pd

# Read the articles CSV that lives in the data directory
df = pd.read_csv(f"{parent_dir}unknown_articles.csv")

# Pull the URL column out as a plain Python list
urls = df['Article.URL'].to_list()

# Preview the first ten entries
urls[:10]

['http://australianaviation.com.au/2018/10/a-competitive-edge-50-years-of-the-australian-army-aviation-corps/',
 'http://australianaviation.com.au/2018/10/victoria-police-orders-four-aircraft-for-air-wing-fleet/',
 'http://australianaviation.com.au/2018/10/army-aviation-celebrates-50-years-farewells-the-kiowa/',
 'https://attain.news/community/special-sea-king-flypast-at-the-royal-junior-school',
 'https://m.ariva.de/amp/ad-hoc-airbus-board-of-directors-selects-guillaume-faury-7190521',
 'http://m.ariva.de/amp/u-s-army-pilots-fly-autonomous-sikorsky-helicopter-in-7227203',
 'https://www.arabianaerospace.aero/kuwait-h225-deliveries-begin.html',
 'https://www.atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html',
 'https://www.atlasinfo.fr/Des-shows-aeriens-en-cloture-du-6eme-Salon-international-de-l-aeronautique-et-du-spatial-de-Marrakech_a94912.html',
 'https://www.airmedandrescue.com/story/113203/leonardo-expands-europ

# Preprocessing

In [59]:
import html
import re
from string import punctuation

import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the small English spaCy pipeline once (the original cell loaded it twice).
# The dependency parser and the named-entity recogniser are disabled because
# preprocessing only uses tokenisation, stop-word/punctuation flags and lemmas.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def preprocess_text(texts, lemmatize=True, handle_html=True):
    """Clean a batch of raw strings for embedding.

    Each text is (optionally) HTML-unescaped and stripped of tags, lower-cased,
    reduced to ASCII, tokenised with the module-level spaCy pipeline ``nlp``,
    and rebuilt from the tokens that are neither stop words nor punctuation.

    Args:
        texts: Iterable of strings. ``None`` and float ``NaN`` entries (what
            pandas yields for missing cells) are treated as empty text.
        lemmatize: If True, keep each token's lemma; otherwise its surface text.
        handle_html: If True, decode HTML entities and remove ``<...>`` tags
            before any other step.

    Returns:
        A list of cleaned strings, one per input and in the same order. An
        input with nothing left after cleaning yields ``''``.
    """
    preprocessed_texts = []

    for text in texts:
        # Missing values would otherwise crash in html.unescape()/.lower().
        # `text != text` is the NaN check (NaN is the only float unequal to itself).
        if text is None or (isinstance(text, float) and text != text):
            text = ''

        # Handle HTML entities first so that escaped tags (&lt;p&gt;) are removed too
        if handle_html:
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)

        text = text.lower()

        # Delete (not transliterate) every run of non-ASCII characters, e.g.
        # curly quotes, dashes and accented letters.
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # Nothing but whitespace left: the result would be '' anyway, so skip
        # the spaCy call entirely.
        if not text.strip():
            preprocessed_texts.append('')
            continue

        doc = nlp(text)

        # Keep lemmas (or raw token text) of tokens that carry content.
        # NOTE(review): whitespace tokens such as '\n\n' are neither stop words
        # nor punctuation, so they pass through unchanged. Left as-is because
        # the classifier was presumably trained on text produced the same way
        # - confirm before filtering them.
        words = [
            token.lemma_ if lemmatize else token.text
            for token in doc
            if not (token.is_stop or token.is_punct)
        ]

        preprocessed_texts.append(' '.join(words).strip())

    return preprocessed_texts


# Feature extraction using SentenceTransformer


In [60]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'bert-base-nli-mean-tokens' sentence-embedding model.
# `st_model.encode(...)` is used in the prediction cell to turn the cleaned
# heading and article body into vectors.
# NOTE(review): presumably this must be the same model that produced the
# classifier's training features - confirm before swapping it for another one.
st_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Load model

In [61]:
import pickle

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that you created yourself or obtained from a trusted source.

# Load the saved classifier; the prediction cell calls `loaded_model.predict(...)`
with open(parent_dir + 'article_type_classifier_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Load class names; the prediction cell indexes this with the predicted label
# to obtain the label that is printed
with open(parent_dir + 'class_names.pkl', 'rb') as file:
    class_names = pickle.load(file)

# Heading and full article extraction using newspaper3k

In [65]:
import pandas as pd
from newspaper import Article

# Upper bound on how many URLs, counted from the start of `urls`, are downloaded
# by the extraction loop below.
no_of_urls_to_extract = 50

def extract_article_content(url):
    """Download one article and return its title and body text.

    The page at ``url`` is fetched and parsed with newspaper's ``Article``.
    Any exception raised along the way is reported on stdout and swallowed.

    Returns:
        ``(title, text)`` on success, ``(None, None)`` if anything failed.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = (page.title, page.text)
    except Exception as e:
        # Best effort: report the failure and let the caller skip this URL
        print(f"Error extracting {url}: {e}")
        result = (None, None)
    return result


# Lazily fetch each URL in the selected slice as a (url, heading, body) triple
fetched = (
    (link, *extract_article_content(link))
    for link in urls[:no_of_urls_to_extract]
)

# Keep only the articles for which both the heading and the body are non-empty
data = [record for record in fetched if record[1] and record[2]]

# Assemble the surviving records into a table
extracted_df = pd.DataFrame(data, columns=['URL', 'Heading', 'Full_Article'])

# Show the table as the cell output
extracted_df

Error extracting https://attain.news/community/special-sea-king-flypast-at-the-royal-junior-school: Article `download()` failed with 404 Client Error: Not Found for url: https://attain.guide/community/special-sea-king-flypast-at-the-royal-junior-school on URL https://attain.news/community/special-sea-king-flypast-at-the-royal-junior-school
Error extracting https://www.atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html: Article `download()` failed with 404 Client Error: Not Found for url: https://atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html on URL https://www.atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html
Error extracting http://evtol.news/2018/11/05/bell-furthers-multiple-evtol-efforts/: Article `download()` failed with 500 Server Error: Internal Server Error for url: http

Unnamed: 0,URL,Heading,Full_Article
0,http://australianaviation.com.au/2018/10/a-com...,A competitive edge – 50 years of the Australia...,The Army formally celebrated 50 years of the A...
1,http://australianaviation.com.au/2018/10/victo...,Victoria Police orders four aircraft for Air W...,Victoria Police Air Wing is replacing its four...
2,http://australianaviation.com.au/2018/10/army-...,"Army Aviation celebrates 50 years, farewells t...",You're out of free articles for this month.\n\...
3,https://m.ariva.de/amp/ad-hoc-airbus-board-of-...,Ad hoc: Airbus Board of Directors Selects Guil...,DGAP-Ad-hoc: Airbus SE / Key word(s): Change o...
4,http://m.ariva.de/amp/u-s-army-pilots-fly-auto...,U.S. Army Pilots Fly Autonomous Sikorsky Helic...,"Montag, 29.10.2018 21:20 von PR Newswire\n\nFO..."
5,https://www.arabianaerospace.aero/kuwait-h225-...,Kuwait H225 deliveries begin,"The ministry’s undersecretary, Lieutenant Gene..."
6,https://www.atlasinfo.fr/Des-shows-aeriens-en-...,Des shows aériens en clôture du 6ème Salon int...,"Ainsi, les milliers de personnes, toutes catég..."
7,https://www.airmedandrescue.com/story/113203/l...,Leonardo expands in Europe with new deals,Helicopter manufacturer Leonardo is looking to...
8,https://www.airmedandrescue.com/story/113208/b...,Bell expands Bell 505 pilot global training ca...,Bell Helicopter has announced the launch of Be...
9,https://www.airmedandrescue.com/story/113231/a...,Airbus expands North American presence,Airbus Helicopters has expanded its presence i...


# Predictions

In [63]:
# Classify every extracted article and print one report block per article.
print("-"*50)
print("\t Predictions :")
print("-"*50)

# With no successfully extracted articles there is nothing to encode or predict.
if not extracted_df.empty:
    headings = extracted_df['Heading'].tolist()
    articles = extracted_df['Full_Article'].tolist()

    # preprocess_text and st_model.encode both accept a list, so each column is
    # cleaned and embedded in one call instead of once per row.
    head_emd = st_model.encode(preprocess_text(headings))
    art_emd = st_model.encode(preprocess_text(articles))

    # One feature row per article: heading embedding followed by body embedding
    # (same layout as the original per-row np.hstack).
    X_new = np.hstack((head_emd, art_emd))

    # Single batched prediction; y_pred[i] belongs to row i of extracted_df
    y_pred = loaded_model.predict(X_new)

    for heading, url, label in zip(headings, extracted_df['URL'], y_pred):
        predicted_label = class_names[label]

        print("-"*50)
        print(f"Heading: {heading}")
        print(f"URL : {url}")
        print("Predicted label:", predicted_label)
        print("-"*50)

--------------------------------------------------
	 Predictions :
--------------------------------------------------
--------------------------------------------------
Heading: A competitive edge – 50 years of the Australian Army Aviation Corps
URL : http://australianaviation.com.au/2018/10/a-competitive-edge-50-years-of-the-australian-army-aviation-corps/
Predicted label: Military
--------------------------------------------------
--------------------------------------------------
Heading: Victoria Police orders four aircraft for Air Wing fleet
URL : http://australianaviation.com.au/2018/10/victoria-police-orders-four-aircraft-for-air-wing-fleet/
Predicted label: Commercial
--------------------------------------------------
--------------------------------------------------
Heading: Army Aviation celebrates 50 years, farewells the Kiowa
URL : http://australianaviation.com.au/2018/10/army-aviation-celebrates-50-years-farewells-the-kiowa/
Predicted label: Military
---------------------