<a href="https://colab.research.google.com/github/manoj987654/datascience/blob/main/text_analysi_and_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas requests beautifulsoup4 openpyxl




In [2]:
import nltk
# Fetch the "punkt_tab" tokenizer data package through NLTK's downloader.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Load the input Excel file; the extraction loop further down reads its
# "URL_ID" and "URL" columns.
df_input = pd.read_excel("Input.xlsx")

# Create a directory to store extracted articles (one text file per URL_ID).
# exist_ok=True keeps a re-run from failing when the directory already exists.
output_text_dir = "extracted_articles"
os.makedirs(output_text_dir, exist_ok=True)

# Function to extract article title and text
def extract_article_text(url):
    """Fetch ``url`` and return its title and paragraph text.

    Args:
        url: Address of the page to download (10 second timeout).

    Returns:
        A ``(title, article_text)`` tuple of strings.
        * On success, ``title`` comes from the ``<title>`` tag (or the first
          ``<h1>``; ``"No Title"`` if neither exists) and ``article_text`` is
          the newline-joined text of every ``<p>`` inside ``<article>`` (or
          ``<div class="post-content">``; ``"No Article Text Found"`` if
          neither container exists).
        * On a non-200 response or any exception, ``title`` holds an
          ``"Error: ..."`` message and ``article_text`` is ``""``.
    """
    try:
        response = requests.get(url, timeout=10)

        # Anything other than HTTP 200 is reported as a fetch failure.
        if response.status_code != 200:
            return "Error: Unable to fetch page", ""

        soup = BeautifulSoup(response.text, "html.parser")

        # Title: prefer <title>, fall back to the first <h1>.
        heading = soup.find("title") or soup.find("h1")
        if heading:
            title = heading.get_text().strip()
        else:
            title = "No Title"

        # Body: prefer <article>, fall back to <div class="post-content">.
        container = soup.find("article") or soup.find("div", {"class": "post-content"})
        if not container:
            return title, "No Article Text Found"

        paragraph_texts = [p.get_text().strip() for p in container.find_all("p")]
        return title, "\n".join(paragraph_texts)
    except Exception as e:
        # Best-effort scraping: surface the failure through the title slot.
        return f"Error: {str(e)}", ""

# Loop through each URL, extract text, and save it.
# A failed fetch still produces a file (containing only the error message as
# its title line), so failures are collected and reported at the end instead
# of passing silently into the later analysis.
failed_extractions = []
for url_id, url in zip(df_input["URL_ID"], df_input["URL"]):
    title, article_text = extract_article_text(url)

    # extract_article_text signals failure with an "Error: ..." title and empty text.
    if title.startswith("Error:") and not article_text:
        failed_extractions.append((url_id, url, title))

    # Save to text file
    file_path = os.path.join(output_text_dir, f"{url_id}.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(f"{title}\n\n{article_text}")

for failed_id, failed_url, reason in failed_extractions:
    print(f"Warning: could not extract {failed_id} ({failed_url}): {reason}")

print("Extraction complete. Check the 'extracted_articles' folder for output files.")


Extraction complete. Check the 'extracted_articles' folder for output files.


In [5]:

import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from google.colab import files

# Download required NLTK resources
nltk.download("punkt")

# Upload extracted text files manually in Colab. The keys of the returned
# mapping are used further down as the file names to open and analyse.
uploaded_files = files.upload()

# Define stop words and positive/negative words (Upload these manually if available)
# NOTE(review): all three sets are left empty here, so no stop words are removed
# and every membership test against positive_words / negative_words is False —
# the positive and negative scores will be 0 until real word lists are loaded.
stop_words = set()  # Load from stopwords file if available
positive_words = set()  # Load from positive words file if available
negative_words = set()  # Load from negative words file if available

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Return the lowercase word tokens of ``text`` with stop words removed.

    Every character that is not an ASCII letter or whitespace is deleted
    first (so "cat's" becomes "cats"), the remainder is lowercased and
    tokenized with ``word_tokenize``, and tokens found in the module-level
    ``stop_words`` set are dropped.
    """
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    return [
        token
        for token in word_tokenize(letters_only.lower())
        if token not in stop_words
    ]

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable.
    A trailing "e" is treated as silent, and one syllable is subtracted, only
    when it stands alone after a consonant ("make" -> 1). It is NOT treated
    as silent when:

    * it is part of a vowel group ("agree", "committee"), because that group
      was already counted once and is pronounced; or
    * the word ends in consonant + "le" ("table", "possible"), where the
      ending forms its own syllable.

    Args:
        word: A single word; case is ignored.

    Returns:
        The estimated syllable count, never less than 1.
    """
    vowels = "aeiouy"
    word = word.lower()

    # Count vowel groups: a new syllable starts at each vowel that follows a non-vowel.
    syllable_count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_char_was_vowel:
            syllable_count += 1
        prev_char_was_vowel = is_vowel

    # The len(word) > 2 guard keeps word[-3] in range and leaves two-letter
    # words such as "be" or "we" alone.
    ends_in_silent_e = (
        len(word) > 2
        and word.endswith("e")
        and word[-2] not in vowels
        and not (word[-2] == "l" and word[-3] not in vowels)
    )
    if ends_in_silent_e:
        syllable_count -= 1

    return max(syllable_count, 1)

# Function to compute sentiment scores
def compute_sentiment(tokens):
    """Score a token list against the positive and negative word sets.

    Args:
        tokens: List of cleaned, lowercase word tokens.

    Returns:
        A tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where

        * ``positive_score`` is the number of tokens in ``positive_words``;
        * ``negative_score`` is the number of tokens in ``negative_words``,
          reported as a non-negative magnitude;
        * ``polarity_score`` is ``(pos - neg) / (pos + neg + 0.000001)``,
          which lies in [-1, 1];
        * ``subjectivity_score`` is ``(pos + neg) / (len(tokens) + 0.000001)``,
          which lies in [0, 1].
    """
    positive_score = sum(1 for word in tokens if word in positive_words)
    # Keep the negative score as a positive count. A negative value here would
    # flip the sign of both sums below (the denominator would become pos - neg,
    # which is ~0 for balanced text and makes the polarity explode).
    negative_score = sum(1 for word in tokens if word in negative_words)

    # The small constant avoids division by zero when nothing matches.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

# Function to compute readability metrics
def compute_readability(text):
    """Compute sentence-length and complex-word readability figures for ``text``.

    A word is "complex" when ``count_syllables`` reports more than two
    syllables. Words are the output of ``clean_and_tokenize`` and sentences
    come from ``sent_tokenize``.

    Returns:
        A tuple ``(avg_sentence_length, percentage_complex_words, fog_index,
        complex_word_count)``. ``percentage_complex_words`` is the ratio
        complex / total words (a fraction, not scaled to 100), and
        ``fog_index`` is ``0.4 * (avg_sentence_length +
        percentage_complex_words)``. Both ratios fall back to 0 when there
        are no sentences / no words.
    """
    sentence_list = sent_tokenize(text)
    word_list = clean_and_tokenize(text)

    complex_word_count = len([w for w in word_list if count_syllables(w) > 2])

    avg_sentence_length = 0
    if sentence_list:
        avg_sentence_length = len(word_list) / len(sentence_list)

    percentage_complex_words = 0
    if word_list:
        percentage_complex_words = complex_word_count / len(word_list)

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index, complex_word_count

# Function to compute additional linguistic metrics
def compute_additional_metrics(text):
    """Compute word count, syllables per word, pronoun count and word length.

    Args:
        text: Raw article text.

    Returns:
        A tuple ``(word_count, syllable_per_word, personal_pronouns,
        avg_word_length)``.

        * ``word_count`` is the number of tokens from ``clean_and_tokenize``.
        * ``syllable_per_word`` and ``avg_word_length`` are averages over
          those tokens (0 when there are none).
        * ``personal_pronouns`` counts whole-word, case-insensitive
          occurrences of "I", "we", "my", "ours" and "us" in the raw text.
          The exact all-caps token "US" is skipped so the country
          abbreviation is not counted as the pronoun "us".
    """
    words = clean_and_tokenize(text)
    word_count = len(words)

    if word_count:
        syllable_per_word = sum(count_syllables(word) for word in words) / word_count
        avg_word_length = sum(len(word) for word in words) / word_count
    else:
        syllable_per_word = 0
        avg_word_length = 0

    # Pronouns are matched on the raw text (not the lowercased tokens) so the
    # original casing is still available to tell "US" apart from "us"/"Us".
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != "US")

    return word_count, syllable_per_word, personal_pronouns, avg_word_length

# Column order of the result table; it must line up with the order in which
# the metric functions return their values.
columns = ["File Name", "Positive Score", "Negative Score", "Polarity Score", "Subjectivity Score",
           "Avg Sentence Length", "Percentage of Complex Words", "Fog Index", "Complex Word Count",
           "Word Count", "Syllable per Word", "Personal Pronouns", "Avg Word Length"]

# Analyse every uploaded file; a file that fails is reported and skipped.
text_analysis_results = []
for file_name in uploaded_files:
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            text = f.read()

        tokens = clean_and_tokenize(text)

        # One row per file: name, 4 sentiment values, 4 readability values,
        # 4 additional metrics. The row is appended only if all succeed.
        record = [file_name]
        record.extend(compute_sentiment(tokens))
        record.extend(compute_readability(text))
        record.extend(compute_additional_metrics(text))
        text_analysis_results.append(record)
    except Exception as e:
        print(f"Error processing {file_name}: {str(e)}")

# Assemble the result table and write it out as CSV.
df_results = pd.DataFrame(text_analysis_results, columns=columns)

output_file = "/content/Output Data Structure.csv"
df_results.to_csv(output_file, index=False)
print(f"Analysis complete. Results saved to {output_file}")

# Trigger a browser download of the CSV from Colab.
files.download(output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saving Netclan20241017.txt to Netclan20241017 (1).txt
Saving Netclan20241018.txt to Netclan20241018 (1).txt
Saving Netclan20241019.txt to Netclan20241019 (1).txt
Saving Netclan20241020.txt to Netclan20241020 (1).txt
Saving Netclan20241021.txt to Netclan20241021 (1).txt
Saving Netclan20241022.txt to Netclan20241022 (1).txt
Saving Netclan20241023.txt to Netclan20241023 (1).txt
Saving Netclan20241024.txt to Netclan20241024 (1).txt
Saving Netclan20241025.txt to Netclan20241025 (1).txt
Saving Netclan20241026.txt to Netclan20241026 (1).txt
Saving Netclan20241027.txt to Netclan20241027 (1).txt
Saving Netclan20241028.txt to Netclan20241028 (1).txt
Saving Netclan20241029.txt to Netclan20241029 (1).txt
Saving Netclan20241030.txt to Netclan20241030 (1).txt
Saving Netclan20241031.txt to Netclan20241031 (1).txt
Saving Netclan20241032.txt to Netclan20241032 (1).txt
Saving Netclan20241033.txt to Netclan20241033 (1).txt
Saving Netclan20241034.txt to Netclan20241034 (1).txt
Saving Netclan20241035.txt t

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>