In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [4]:
%%time

# Function to extract the article title and text from a URLs given in Input.xlxs file and extracting each articles into txt file

def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find and remove unwanted elements (e.g., header, footer, etc.)
        for element in soup(["header", "footer"]):
            element.decompose()
        
        # Extract article title and text
        article_title = soup.find('title').text.strip()
        article_text = ""
        
        # Extract text from <div class="td-post-content tagdiv-type">
        article_div = soup.find('div', class_='td-post-content tagdiv-type')
        if article_div:
            article_text = article_div.get_text()
        return article_title, article_text
    
    except Exception:
        print(f"Error while extracting article from {url}: {Exception}")
        return None, None

# Function to save the article title and text to a text file
    
def save_article_to_file(url_id, article_title, article_text):
    if not os.path.exists("articles"):
        os.mkdir("articles")
    
    with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"Title: {article_title}\n\n")
        file.write(article_text)

def main():
    input_file = "input.xlsx"
    df = pd.read_excel(input_file)
    
    for index, row in df.iterrows():
        url_id = row["URL_ID"]
        url = row["URL"]
        
        # Extract article title and text
        article_title, article_text = extract_article_text(url)
        
        # Check if extraction was successful
        if article_title and article_text:
            save_article_to_file(url_id, article_title, article_text)
            print(f"Article {url_id} extracted and saved successfully.")
        else:
            print(f"Failed to extract article {url_id}.")

if __name__ == "__main__":
    main()


Article 123.0 extracted and saved successfully.
Article 321.0 extracted and saved successfully.
Failed to extract article 2345.0.
Article 4321.0 extracted and saved successfully.
Article 432.0 extracted and saved successfully.
Article 2893.8 extracted and saved successfully.
Article 3355.6 extracted and saved successfully.
Article 3817.4 extracted and saved successfully.
Failed to extract article 4279.2.
Article 4741.0 extracted and saved successfully.
Article 5202.8 extracted and saved successfully.
Article 5664.6 extracted and saved successfully.
Article 6126.4 extracted and saved successfully.
Article 6588.2 extracted and saved successfully.
Article 7050.0 extracted and saved successfully.
Article 7511.8 extracted and saved successfully.
Article 7973.6 extracted and saved successfully.
Failed to extract article 8435.4.
Article 8897.2 extracted and saved successfully.
Article 9359.0 extracted and saved successfully.
Article 9820.8 extracted and saved successfully.
Article 10282.6 ext

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Load NLTK resources (you might need to download NLTK resources)
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Function to load positive and negative dictionaries from files
def load_dictionaries(positive_dict_file, negative_dict_file):
    with open(positive_dict_file, 'r') as file:
        positive_words = set(file.read().splitlines())
    with open(negative_dict_file, 'r') as file:
        negative_words = set(file.read().splitlines())
    return positive_words, negative_words

# Function to perform sentiment analysis and calculate scores
def calculate_sentiment_scores(text, positive_words, negative_words):
    sia = SentimentIntensityAnalyzer()
    tokens = word_tokenize(text)
    
    positive_score = 0
    negative_score = 0
    
    for word in tokens:
        # Remove punctuation and convert to lowercase
        word = word.lower()
        if word.isalpha():
            # Check if the word is in the positive dictionary
            if word in positive_words:
                positive_score += 1
            # Check if the word is in the negative dictionary
            if word in negative_words:
                negative_score += 1
    
    # Calculate sentiment analysis metrics
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score

def main():
    input_data_file = "Output Data Structure.xlsx"
    positive_dict_file = "positive-words.txt"
    negative_dict_file = "negative-words.txt"
    articles_dir = "articles"
    
    # Load dictionaries
    positive_words, negative_words = load_dictionaries(positive_dict_file, negative_dict_file)

    # Read output data structure Excel file
    output_data = pd.read_excel(input_data_file)
    
    results = []
    for index, row in output_data.iterrows():
        url_id = row["URL_ID"]
        url = row["URL"]
        article_file = os.path.join(articles_dir, f"{url_id}.txt")
        
        if os.path.exists(article_file):
            # Read article text from file
            with open(article_file, 'r', encoding='utf-8') as article:
                article_text = article.read()
            
            # Perform sentiment analysis
            positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(article_text, positive_words, negative_words)
            
            results.append({
                "URL_ID": url_id,
                "URL": url,
                "Positive_Score": positive_score,
                "Negative_Score": negative_score,
                "Polarity_Score": polarity_score,
                "Subjectivity_Score": subjectivity_score
            })
    
    # Create DataFrame from results
    result_df = pd.DataFrame(results)
    
    # Save results to Excel
    result_df.to_excel("sentiment_analysis_results.xlsx", index=False)

if __name__ == "__main__":
    main()


In [9]:
sentiment_analysis = pd.read_excel("sentiment_analysis_results.xlsx")

In [10]:
sentiment_analysis

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,88,24,0.571429,0.060054
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,41,13,0.518519,0.079295
2,4321.0,https://insights.blackcoffer.com/rise-of-telem...,44,27,0.239437,0.051116
3,432.0,https://insights.blackcoffer.com/rise-of-telem...,44,27,0.239437,0.051116
4,2893.8,https://insights.blackcoffer.com/rise-of-chatb...,56,12,0.647059,0.051515
...,...,...,...,...,...,...
97,50921.0,https://insights.blackcoffer.com/coronavirus-i...,8,29,-0.567568,0.047619
98,51382.8,https://insights.blackcoffer.com/coronavirus-i...,27,66,-0.419355,0.049025
99,51844.6,https://insights.blackcoffer.com/what-are-the-...,100,33,0.503759,0.067036
100,52306.4,https://insights.blackcoffer.com/marketing-dri...,32,22,0.185185,0.033898


In [11]:
import re
from nltk.tokenize import sent_tokenize

In [13]:
# Function to calculate average sentence length
def calculate_avg_sentence_length(sentences):
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)
    return total_words / total_sentences

# Function to calculate percentage of complex words
def calculate_percentage_complex_words(text):
    words = word_tokenize(text)
    complex_words = [word for word in words if len(word) > 2]
    return len(complex_words) / len(words)

# Function to calculate fog index
def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

# Function to calculate average number of words per sentence
def calculate_avg_words_per_sentence(words, sentences):
    return len(words) / len(sentences)

# Function to calculate complex word count
def calculate_complex_word_count(text):
    words = word_tokenize(text)
    complex_words = [word for word in words if len(word) > 2]
    return len(complex_words)

# Function to calculate word count
def calculate_word_count(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    cleaned_words = [word for word in words if word not in stop_words and word.isalpha()]
    return len(cleaned_words)

# Function to count syllables in a word
def count_syllables(word):
    vowels = "aeiouAEIOU"
    count = 0
    if word[-1] in ['e', 'E'] and word[-2:] != 'le' and word[-2:] != 'LE':
        word = word[:-1]
    for index, letter in enumerate(word):
        if index == 0 and letter in vowels:
            count += 1
        elif letter in vowels and word[index-1] not in vowels:
            count += 1
    return count

# Function to calculate syllable count per word
def calculate_syllable_count_per_word(text):
    words = word_tokenize(text)
    syllable_count = sum(count_syllables(word) for word in words)
    return syllable_count / len(words)

# Function to calculate personal pronoun count
def calculate_personal_pronouns(text):
    pronouns = ["I", "we", "my", "ours", "us"]
    pattern = r'\b(?:' + '|'.join(pronouns) + r')\b'
    matches = re.findall(pattern, text)
    return len(matches)

# Function to calculate average word length
def calculate_avg_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

def main():
    output_data_file = "Output Data Structure.xlsx"
    articles_dir = "articles"
    
    # Read output data structure Excel file
    output_data = pd.read_excel(output_data_file)
    
    results_ = []
    for index, row in output_data.iterrows():
        url_id = row["URL_ID"]
        article_file = os.path.join(articles_dir, f"{url_id}.txt")
        
        if os.path.exists(article_file):
            # Read article text from file
            with open(article_file, 'r', encoding='utf-8') as article:
                article_text = article.read()
            
            # Tokenize sentences for text analysis
            sentences = sent_tokenize(article_text)
            words = word_tokenize(article_text)
            
            # Calculate text analysis metrics
            avg_sentence_length = calculate_avg_sentence_length(sentences)
            percentage_complex_words = calculate_percentage_complex_words(article_text)
            fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
            avg_words_per_sentence = calculate_avg_words_per_sentence(words, sentences)
            complex_word_count = calculate_complex_word_count(article_text)
            word_count = calculate_word_count(article_text)
            syllable_count_per_word = calculate_syllable_count_per_word(article_text)
            personal_pronoun_count = calculate_personal_pronouns(article_text)
            avg_word_length = calculate_avg_word_length(article_text)
            
            results_.append({
                "URL_ID": url_id,
                "Avg_Sentence_Length": avg_sentence_length,
                "Percentage_Complex_Words": percentage_complex_words,
                "Fog_Index": fog_index,
                "Avg_Words_Per_Sentence": avg_words_per_sentence,
                "Complex_Word_Count": complex_word_count,
                "Word_Count": word_count,
                "Syllable_Count_Per_Word": syllable_count_per_word,
                "Personal_Pronoun_Count": personal_pronoun_count,
                "Avg_Word_Length": avg_word_length
            })
    
    # Create DataFrame from results
    result_df2 = pd.DataFrame(results_)
    
    # Save results to Excel
    result_df2.to_excel("text_analysis_results.xlsx", index=False)

if __name__ == "__main__":
    main()


In [14]:
text_analysis = pd.read_excel("text_analysis_results.xlsx")
text_analysis

Unnamed: 0,URL_ID,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,123.0,23.312500,0.758177,9.628271,23.312500,1414,1040,1.597319,2,5.139410
1,321.0,27.240000,0.738620,11.191448,27.240000,503,343,1.544787,3,5.098385
2,4321.0,23.150000,0.750900,9.560360,23.150000,1043,761,1.529158,6,5.051836
3,432.0,23.150000,0.750900,9.560360,23.150000,1043,761,1.529158,6,5.051836
4,2893.8,20.307692,0.750758,8.423380,20.307692,991,715,1.519697,5,4.990152
...,...,...,...,...,...,...,...,...,...,...
97,50921.0,25.900000,0.736165,10.654466,25.900000,572,414,1.435006,1,4.841699
98,51382.8,37.940000,0.721666,15.464666,37.940000,1369,1085,1.465472,2,4.732736
99,51844.6,27.943662,0.725806,11.467787,27.943662,1440,1028,1.440524,0,4.768145
100,52306.4,27.000000,0.696171,11.078468,27.000000,1109,780,1.381042,6,4.644068


In [15]:
merged_df = pd.merge(sentiment_analysis, text_analysis, on='URL_ID')
merged_df

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,88,24,0.571429,0.060054,23.312500,0.758177,9.628271,23.312500,1414,1040,1.597319,2,5.139410
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,41,13,0.518519,0.079295,27.240000,0.738620,11.191448,27.240000,503,343,1.544787,3,5.098385
2,4321.0,https://insights.blackcoffer.com/rise-of-telem...,44,27,0.239437,0.051116,23.150000,0.750900,9.560360,23.150000,1043,761,1.529158,6,5.051836
3,432.0,https://insights.blackcoffer.com/rise-of-telem...,44,27,0.239437,0.051116,23.150000,0.750900,9.560360,23.150000,1043,761,1.529158,6,5.051836
4,2893.8,https://insights.blackcoffer.com/rise-of-chatb...,56,12,0.647059,0.051515,20.307692,0.750758,8.423380,20.307692,991,715,1.519697,5,4.990152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,50921.0,https://insights.blackcoffer.com/coronavirus-i...,8,29,-0.567568,0.047619,25.900000,0.736165,10.654466,25.900000,572,414,1.435006,1,4.841699
98,51382.8,https://insights.blackcoffer.com/coronavirus-i...,27,66,-0.419355,0.049025,37.940000,0.721666,15.464666,37.940000,1369,1085,1.465472,2,4.732736
99,51844.6,https://insights.blackcoffer.com/what-are-the-...,100,33,0.503759,0.067036,27.943662,0.725806,11.467787,27.943662,1440,1028,1.440524,0,4.768145
100,52306.4,https://insights.blackcoffer.com/marketing-dri...,32,22,0.185185,0.033898,27.000000,0.696171,11.078468,27.000000,1109,780,1.381042,6,4.644068


In [16]:
merged_df.to_excel("OutputDataStructure.xlsx")