# New section

In [None]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Install necessary libraries if not already installed
!pip install pandas requests beautifulsoup4

# Replace 'Input.xlsx' with the actual path to your file if needed
input_file = 'Input.xlsx'

try:
    # Load the Excel file into a pandas DataFrame
    df = pd.read_excel(input_file)

    # Create a directory to store the extracted text files
    output_dir = 'extracted_articles'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Iterate through each row (article) in the DataFrame
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for bad status codes

            soup = BeautifulSoup(response.content, 'html.parser')

            # ---  Content Extraction Logic (Adapt this part for different websites) ---
            # Example 1: Find the main article content using a common class or tag
            article_content = soup.find('article') # or soup.find('div', class_='article-body')

            # Example 2: If the above doesn't work, try a more specific approach (Inspect element to find article body)
            if article_content is None:
              article_content = soup.find('div', {'id': 'article-body'}) # Or any relevant identifier

            #Fallback if both of the above failed
            if article_content is None:
              article_content = soup.find('div', class_='main-content')

            if article_content:
              #Extract article title
              article_title = soup.find('h1')
              title_text = article_title.text.strip() if article_title else "No Title Found"

              # Extract text from the article
              text = ''
              for paragraph in article_content.find_all(['p', 'h2']):  # Include other tags as needed
                  text += paragraph.get_text(strip=True) + '\n'

              # Save the extracted text to a file
              with open(os.path.join(output_dir, f'{url_id}.txt'), 'w', encoding='utf-8') as f:
                  f.write(title_text + '\n\n' + text)

            else:
                print(f"Could not extract article content for URL_ID: {url_id}, URL:{url}")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred for URL_ID: {url_id}, URL:{url}: {e}")

except FileNotFoundError:
    print(f"Error: Input file '{input_file}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")



In [43]:


import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

# Download required NLTK data (only needed once)
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    word_tokenize("test")
except LookupError:
    nltk.download('punkt')


def analyze_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w.lower() for w in tokens if w.isalnum() and w.lower() not in stop_words]

    # Calculate variables
    positive_score = 0
    negative_score = 0
    polarity_score = 0
    subjectivity_score = 0
    avg_sentence_length = 0
    percentage_complex_words = 0
    fog_index = 0
    avg_number_of_words_per_sentence = 0
    complex_word_count = 0
    word_count = len(filtered_tokens)
    syllable_count_per_word = 0
    personal_pronouns = 0
    avg_word_length = 0

    if word_count > 0:
      # TextBlob analysis
      analysis = TextBlob(text)
      polarity_score = analysis.sentiment.polarity
      subjectivity_score = analysis.sentiment.subjectivity

      # Sentence Length and Complex Word calculations (simplified examples)
      sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) # Split into sentences
      avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences) if len(sentences) > 0 else 0

      #Counting Complex words (words with more than 2 syllables)
      for word in filtered_tokens:
        vowels = "aeiouy"
        syllable_count = 0
        for i in range(len(word)):
          if word[i] in vowels:
            syllable_count += 1
        if syllable_count > 2:
          complex_word_count += 1

      percentage_complex_words = (complex_word_count/ word_count) * 100 if word_count > 0 else 0

      fog_index = 0.4*(avg_sentence_length + percentage_complex_words) # Simplified fog index calculation

      avg_number_of_words_per_sentence = avg_sentence_length

      #Syllable Count per word (approximation)
      syllable_count_per_word = sum(len(re.findall(r'[aeiouy]+', word, re.IGNORECASE)) for word in filtered_tokens) / len(filtered_tokens) if len(filtered_tokens) > 0 else 0

      #Counting personal pronouns
      for word in tokens:
          if word.lower() in ["i", "we", "my", "ours", "us"]:
              personal_pronouns+=1

      #Average word length
      avg_word_length = sum(len(word) for word in filtered_tokens) / len(filtered_tokens) if len(filtered_tokens) > 0 else 0

      #Adjusting polarity and subjectivity score to fit into the desired scale
      positive_score = max(0, polarity_score)
      negative_score = max(0, -polarity_score)


    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_number_of_words_per_sentence, complex_word_count, word_count,
            syllable_count_per_word, personal_pronouns, avg_word_length]


# Output Data Structure
output_df = pd.DataFrame(columns=['URL_ID', 'positive_score', 'negative_score', 'polarity_score', 'subjectivity_score',
                                  'avg_sentence_length', 'percentage_complex_words', 'fog_index',
                                  'avg_number_of_words_per_sentence', 'complex_word_count', 'word_count',
                                  'syllable_per_word', 'personal_pronouns', 'avg_word_length'])

output_directory = 'extracted_articles'  # Directory with the extracted text files
for filename in os.listdir(output_directory):
    if filename.endswith(".txt"):
        url_id = filename[:-4]  # Remove the .txt extension

        with open(os.path.join(output_directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        results = analyze_text(text)
        new_row = {'URL_ID': url_id, 'positive_score': results[0], 'negative_score':results[1], 'polarity_score':results[2], 'subjectivity_score':results[3],
                  'avg_sentence_length':results[4], 'percentage_complex_words':results[5], 'fog_index':results[6],
                  'avg_number_of_words_per_sentence':results[7], 'complex_word_count':results[8], 'word_count':results[9],
                  'syllable_per_word':results[10], 'personal_pronouns':results[11], 'avg_word_length':results[12]}

        output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True)

# Save the results to an Excel file
output_df.to_excel('Output_Data.xlsx', index=False)


  output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True)


In [44]:
# prompt: Definition of each of the variables given in the “Text Analysis.docx” file.
# Look for these variables in the analysis document (Text Analysis.docx):
# 1.	POSITIVE SCORE
# 2.	NEGATIVE SCORE
# 3.	POLARITY SCORE
# 4.	SUBJECTIVITY SCORE
# 5.	AVG SENTENCE LENGTH
# 6.	PERCENTAGE OF COMPLEX WORDS
# 7.	FOG INDEX
# 8.	AVG NUMBER OF WORDS PER SENTENCE
# 9.	COMPLEX WORD COUNT
# 10.	WORD COUNT
# 11.	SYLLABLE PER WORD
# 12.	PERSONAL PRONOUNS
# 13.	AVG WORD LENGTH
# /content/Text Analysis.docx

# Definition of variables as per the provided Python code:

# 1. POSITIVE SCORE:  The positive score is derived from the polarity score.  It represents the positive sentiment expressed in the text.
#    It's calculated as the maximum of 0 and the polarity score.  A higher positive score indicates more positive sentiment.  It's adjusted from the polarity score of TextBlob to fit a scale where negative score cannot be negative.

# 2. NEGATIVE SCORE: The negative score is also derived from the polarity score. It represents the negative sentiment in the text.
#    It's calculated as the maximum of 0 and the absolute value of the negative polarity score.  A higher negative score indicates more negative sentiment.  It's adjusted from the polarity score of TextBlob to fit a scale where negative score cannot be negative.

# 3. POLARITY SCORE:  This represents the overall sentiment expressed in the text.  It is calculated using TextBlob's sentiment analysis.
#    The polarity score is a float within the range [-1.0, 1.0], where -1.0 represents extremely negative sentiment, 1.0 represents extremely positive sentiment, and 0.0 represents neutral sentiment.

# 4. SUBJECTIVITY SCORE: This score represents the subjectivity of the text, i.e., how much personal opinion or judgment is present.
#    It's calculated using TextBlob's sentiment analysis.  The subjectivity score is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.

# 5. AVG SENTENCE LENGTH: The average number of words per sentence in the text. It's calculated by summing the number of words in each sentence and dividing by the total number of sentences.

# 6. PERCENTAGE OF COMPLEX WORDS: The percentage of words in the text that are considered complex.  In this code, a complex word is defined as a word with more than two syllables. The code uses a vowel counting heuristic to approximate syllable count.

# 7. FOG INDEX:  A readability score calculated using the formula 0.4 * (average sentence length + percentage of complex words). It estimates the years of formal education needed to understand the text.  This code uses a simplified formula.

# 8. AVG NUMBER OF WORDS PER SENTENCE:  This is the same as the average sentence length (point 5).

# 9. COMPLEX WORD COUNT: The total number of words in the text that are classified as complex (more than two syllables, approximated by vowel count).

# 10. WORD COUNT: The total number of words in the text after removing stop words and punctuation.

# 11. SYLLABLE PER WORD: The average number of syllables per word in the text.  The syllable count is approximated using a vowel counting heuristic.

# 12. PERSONAL PRONOUNS: The number of personal pronouns (e.g., "I," "we," "my," "ours," "us") found in the text.

# 13. AVG WORD LENGTH: The average length (in characters) of the words in the text.

In [59]:
pip install pandas nltk textblob syllapy openpyxl




In [65]:
!wget https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt
!wget https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from syllapy import count as syllable_count

# Load NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load positive and negative word lists, specifying encoding
with open('positive-words.txt', 'r', encoding='latin-1') as f:  # Change encoding if needed
    positive_words = set(f.read().split())
with open('negative-words.txt', 'r', encoding='latin-1') as f:  # Change encoding if needed
    negative_words = set(f.read().split())

# Load input variables from Input.xlsx
input_path = "Input.xlsx"
input_df = pd.read_excel(input_path)

def analyze_text(text):
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Metrics
    positive_score = sum(1 for word in filtered_words if word in positive_words)
    negative_score = sum(1 for word in filtered_words if word in negative_words)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(filtered_words) + 0.000001)
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences) if sentences else 0
    word_count = len(filtered_words)
    syllable_count_total = sum(syllable_count(word) for word in filtered_words)
    syllable_per_word = syllable_count_total / word_count if word_count else 0
    complex_words = [word for word in filtered_words if syllable_count(word) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_word_length = sum(len(word) for word in filtered_words) / word_count if word_count else 0
    personal_pronouns = sum(1 for word in words if word.lower() in ["i", "we", "my", "ours", "us"])

    # Return metrics
    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, word_count, syllable_per_word, personal_pronouns, avg_word]

--2024-11-27 07:31:35--  https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20630 (20K) [text/plain]
Saving to: ‘positive-words.txt.1’


2024-11-27 07:31:35 (34.8 MB/s) - ‘positive-words.txt.1’ saved [20630/20630]

--2024-11-27 07:31:35--  https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sen

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from syllapy import count as syllable_count

# Load NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load positive and negative word lists, specifying encoding
with open('positive-words.txt', 'r', encoding='latin-1') as f:  # Change encoding if needed
    positive_words = set(f.read().split())
with open('negative-words.txt', 'r', encoding='latin-1') as f:  # Change encoding if needed
    negative_words = set(f.read().split())
# Load input variables from Input.xlsx
input_path = "Input.xlsx"
input_df = pd.read_excel(input_path)

def analyze_text(text):
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Metrics
    positive_score = sum(1 for word in filtered_words if word in positive_words)
    negative_score = sum(1 for word in filtered_words if word in negative_words)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(filtered_words) + 0.000001)
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences) if sentences else 0
    word_count = len(filtered_words)
    syllable_count_total = sum(syllable_count(word) for word in filtered_words)
    syllable_per_word = syllable_count_total / word_count if word_count else 0
    complex_words = [word for word in filtered_words if syllable_count(word) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_word_length = sum(len(word) for word in filtered_words) / word_count if word_count else 0
    personal_pronouns = sum(1 for word in words if word.lower() in ["i", "we", "my", "ours", "us"])

    # Return metrics
    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_sentence_length, complex_word_count, word_count,
            syllable_per_word, personal_pronouns, avg_word_length]

# Initialize output DataFrame with columns matching the desired format
output_columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                  'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

output_df = pd.DataFrame(columns=output_columns)

# Directory containing extracted articles
output_directory = 'extracted_articles'

# Process each file
for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    file_path = os.path.join(output_directory, f"{url_id}.txt")

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        results = analyze_text(text)
        new_row = {
            'URL_ID': url_id, 'URL': url, 'POSITIVE SCORE': results[0], 'NEGATIVE SCORE': results[1],
            'POLARITY SCORE': results[2], 'SUBJECTIVITY SCORE': results[3],
            'AVG SENTENCE LENGTH': results[4], 'PERCENTAGE OF COMPLEX WORDS': results[5],
            'FOG INDEX': results[6], 'AVG NUMBER OF WORDS PER SENTENCE': results[7],
            'COMPLEX WORD COUNT': results[8], 'WORD COUNT': results[9],
            'SYLLABLE PER WORD': results[10], 'PERSONAL PRONOUNS': results[11],
            'AVG WORD LENGTH': results[12]
        }
        output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True)

# Save the output to Excel
output_file = "Output_Data.xlsx"
output_df.to_excel(output_file, index=False)

print(f"Analysis complete. Results saved to {output_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True)


Analysis complete. Results saved to Output_Data.xlsx


In [68]:
# prompt: convert output_data .xlsx into csv

import pandas as pd

# Load the Excel file into a pandas DataFrame
excel_file = 'Output_Data.xlsx'
df = pd.read_excel(excel_file)

# Save the DataFrame to a CSV file
csv_file = 'output_data.csv'
df.to_csv(csv_file, index=False)