<a href="https://colab.research.google.com/github/ojuskumar/WEB-SCRAPING-ARTICLE-AND-SIMULTANEOUS-NLP/blob/main/Web_scraping_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import syllables

# Step 1: Data Extraction
# Paths to the spreadsheet listing the URLs to scrape and to the spreadsheet
# that is later combined with the analysis results.
input_file = "/content/Input.xlsx"
output_structure_file = "/content/Output Data Structure.xlsx"

# Seconds to wait on each HTTP request before giving up, so that a single
# unresponsive host cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

df_input = pd.read_excel(input_file)

# Gather one record per successfully fetched article. Records are collected in
# a plain list and converted to a DataFrame once at the end: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, and it re-copies the
# whole frame on every call.
extracted_records = []

for url_id, url in zip(df_input['URL_ID'], df_input['URL']):
    # Fetch the page; a network-level failure skips this URL instead of
    # aborting the remaining downloads.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Skipping URL_ID {url_id}: request failed ({exc})")
        continue

    # Only successful responses are parsed; anything else is skipped.
    if response.status_code != 200:
        print(f"Skipping URL_ID {url_id}: HTTP status {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The article text is taken to be the text of every <p> element joined by
    # spaces; adjust the selector if a site's HTML structure differs.
    article_text = " ".join(p.text for p in soup.find_all('p'))

    extracted_records.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text})

# Passing `columns` keeps the expected layout even when nothing was fetched.
extracted_data = pd.DataFrame(extracted_records, columns=['URL_ID', 'URL', 'Article_Text'])

# Persist the raw extraction so it can be inspected independently of the analysis.
extracted_data.to_excel("extracted_data.xlsx", index=False)

# Step 2: Data Analysis
# Read the output structure file
output_df = pd.read_excel(output_structure_file)

# Syllable estimate for a single word
def syllable_count(word):
    """Return the syllable estimate for ``word``.

    Thin wrapper that delegates to ``syllables.estimate`` and passes its
    result through unchanged.
    """
    estimated = syllables.estimate(word)
    return estimated

# Gunning Fog index (a readability score), NOT the Flesch Reading Ease score
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    """Return the Fog index for a text.

    Computed as ``0.4 * (avg_sentence_length + percentage_complex_words)``.

    Parameters
    ----------
    avg_sentence_length : number
        Average number of words per sentence.
    percentage_complex_words : number
        Share of complex words, expressed as a percentage (0-100).
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

# Function to perform text analysis
def perform_text_analysis(row):
    """Compute sentiment and readability metrics for one extracted article.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``'Article_Text'``, ``'URL_ID'`` and ``'URL'``.

    Returns
    -------
    dict
        One entry per output column (scores, counts, averages) plus the
        ``URL_ID`` and ``URL`` copied from ``row``. Every ratio is reported as
        ``0.0`` when the text has no words or no sentences, so an empty
        article no longer raises ``ZeroDivisionError``.
    """
    blob = TextBlob(row['Article_Text'])
    words = blob.words
    word_count = len(words)
    sentence_count = len(blob.sentences)

    # Estimate syllables once per word and reuse the numbers for both the
    # complex-word count and the syllables-per-word average.
    syllables_by_word = [syllable_count(word) for word in words]
    # A word counts as "complex" when it has more than two syllables.
    complex_word_count = sum(1 for n_syllables in syllables_by_word if n_syllables > 2)

    # Guard every division: a page without <p> text produces zero words/sentences.
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0.0
    syllables_per_word = sum(syllables_by_word) / word_count if word_count else 0.0
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0.0
    fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)

    # TextBlob lower-cases the keys of word_counts, so pronouns must be looked
    # up in lower case; looking up 'I' always yields 0.
    word_counts = blob.word_counts
    personal_pronouns = sum(
        word_counts.get(pronoun, 0)
        for pronoun in ('i', 'me', 'my', 'mine', 'myself')
    )

    sentiment = blob.sentiment

    return {
        'POSITIVE SCORE': sentiment.polarity,  # example sentiment analysis, replace with your method
        'NEGATIVE SCORE': -sentiment.polarity,  # example sentiment analysis, replace with your method
        'POLARITY SCORE': sentiment.polarity,
        'SUBJECTIVITY SCORE': sentiment.subjectivity,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
        'URL_ID': row['URL_ID'],
        'URL': row['URL'],
    }

# Apply text analysis to each extracted article. The metric dicts are collected
# first and turned into a DataFrame in one step; this replaces the slow
# `.apply(...).apply(pd.Series)` expansion and yields an empty frame (rather
# than a copy of the raw extraction columns) when nothing was extracted.
analysis_records = [perform_text_analysis(row) for _, row in extracted_data.iterrows()]
analysis_df = pd.DataFrame(analysis_records, index=extracted_data.index)

# NOTE(review): concat(axis=1) pairs rows by index label, not by URL_ID. If any
# URL was skipped during extraction, the rows of analysis_df may no longer line
# up with the rows of the output structure file — confirm, and consider joining
# on URL_ID instead.
output_df = pd.concat([output_df, analysis_df], axis=1)

# Save the final output DataFrame
output_df.to_excel("output_results.xlsx", index=False)


  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extracted_data.append({'URL_ID': url_id, 'URL': url, 'Article_Text': article_text}, ignore_index=True)
  extracted_data = extra

In [None]:
pip install syllables

Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.16-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<7.0,>=5.1 (from syllables)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata, cmudict, syllables
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 7.0.1
    Uninstalling importlib-metadata-7.0.1:
      Successfully uninstalled importlib-metadata-7.0.1
Successfully installed cmudict-1.0.16 importlib-metadata-6.11.0 syllables-1.0.9


In [None]:
!pip install textblob
!python -m textblob.download_corpora


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.
