In [218]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [51]:
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

In [52]:
df

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...
...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...
97,blackassign0098,https://insights.blackcoffer.com/contribution-...
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...


## Extracting the article Title & Text 

In [111]:
def extract_article(url):
    try:
        response = requests.get(url)
        if response.status_code == 404:
            print(f"Error 404: Page not found - {url}")
            return None, None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        article_title = soup.find('title').get_text()

        match1 = soup.find('div', class_='td-post-content tagdiv-type')
        match2 = soup.find('div', class_='td_block_wrap tdb_single_content tdi_130 td-pb-border-top td_block_template_1 td-post-content tagdiv-type')

        article_text = ''
        if match1:
            article_text += match1.get_text(strip=True) + '\n'
        if match2:
            article_text += match2.get_text(strip=True)

        return article_title, article_text
    
    except Exception as e:
        print(f"Error extracting article from {url}: {e}")
        return None, None

# Create a directory to save the extracted text files
output_dir = 'extracted_texts7'
os.makedirs(output_dir, exist_ok=True)

# Extract and save article text
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    article_title, article_text = extract_article(url)
    if article_text is not None and article_title is not None:
        with open(os.path.join(output_dir, f'{url_id}.txt'), 'w', encoding='utf-8') as f:
            f.write(article_title + '\n\n')
            f.write(article_text)
    else:
        # Create a text file with the 404 error message
        with open(os.path.join(output_dir, f'{url_id}.txt'), 'w', encoding='utf-8') as f:
            f.write("error 404" )
            

print("Extraction completed.")

Error 404: Page not found - https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error 404: Page not found - https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Extraction completed.


# Text Analysis

In [158]:
import nltk
import re
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amruta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [118]:
current_dir = os.getcwd()
print(current_dir)
folder_name = "extracted_texts7"
print(folder_name)

C:\Users\Amruta\Desktop\Blackcoffer NLP Project
extracted_texts7


In [119]:
folder_path = os.path.join(current_dir,folder_name)
print(folder_path)

C:\Users\Amruta\Desktop\Blackcoffer NLP Project\extracted_texts7


In [123]:
text_files = os.listdir(folder_path)
text_files

['blackassign0001.txt',
 'blackassign0002.txt',
 'blackassign0003.txt',
 'blackassign0004.txt',
 'blackassign0005.txt',
 'blackassign0006.txt',
 'blackassign0007.txt',
 'blackassign0008.txt',
 'blackassign0009.txt',
 'blackassign0010.txt',
 'blackassign0011.txt',
 'blackassign0012.txt',
 'blackassign0013.txt',
 'blackassign0014.txt',
 'blackassign0015.txt',
 'blackassign0016.txt',
 'blackassign0017.txt',
 'blackassign0018.txt',
 'blackassign0019.txt',
 'blackassign0020.txt',
 'blackassign0021.txt',
 'blackassign0022.txt',
 'blackassign0023.txt',
 'blackassign0024.txt',
 'blackassign0025.txt',
 'blackassign0026.txt',
 'blackassign0027.txt',
 'blackassign0028.txt',
 'blackassign0029.txt',
 'blackassign0030.txt',
 'blackassign0031.txt',
 'blackassign0032.txt',
 'blackassign0033.txt',
 'blackassign0034.txt',
 'blackassign0035.txt',
 'blackassign0036.txt',
 'blackassign0037.txt',
 'blackassign0038.txt',
 'blackassign0039.txt',
 'blackassign0040.txt',
 'blackassign0041.txt',
 'blackassign004

## 1.1Cleaning using Stop Words Lists

In [None]:
# creating dictionary with filename as keys and list of tokens as values

In [177]:
token_dict = {}

for file in text_files:
    with open(os.path.join(folder_path, file),'r',encoding='utf-8') as  f:
        
        tokens = word_tokenize(f.read())
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [word.lower() for word in tokens]
        token_dict[file] = tokens
        
token_dict

{'blackassign0001.txt': ['rising',
  'it',
  'cities',
  'and',
  'its',
  'impact',
  'on',
  'the',
  'economy',
  'environment',
  'infrastructure',
  'and',
  'city',
  'life',
  'by',
  'the',
  'year',
  'blackcoffer',
  'insights',
  'we',
  'have',
  'seen',
  'a',
  'huge',
  'development',
  'and',
  'dependence',
  'of',
  'people',
  'on',
  'technology',
  'in',
  'recent',
  'years',
  'we',
  'have',
  'also',
  'seen',
  'the',
  'development',
  'of',
  'ai',
  'and',
  'chatgpt',
  'in',
  'recent',
  'years',
  'so',
  'it',
  'is',
  'a',
  'normal',
  'thing',
  'that',
  'we',
  'will',
  'become',
  'fully',
  'dependent',
  'on',
  'technology',
  'by',
  'information',
  'technology',
  'will',
  'be',
  'a',
  'major',
  'power',
  'for',
  'all',
  'the',
  'developing',
  'nations',
  'as',
  'a',
  'member',
  'of',
  'a',
  'developing',
  'nation',
  'india',
  'is',
  'rapidly',
  'growing',
  'its',
  'it',
  'base',
  'it',
  'has',
  'also',
  'grown'

In [178]:
len(token_dict)

100

### Stopwords

In [131]:
stopword_folder = "StopWords"

folder_path2 = os.path.join(current_dir,stopword_folder)
print(folder_path2)

C:\Users\Amruta\Desktop\Blackcoffer NLP Project\StopWords


In [132]:
stopword_files = os.listdir(folder_path2)
stopword_files

['StopWords_Auditor.txt',
 'StopWords_Currencies.txt',
 'StopWords_DatesandNumbers.txt',
 'StopWords_Generic.txt',
 'StopWords_GenericLong.txt',
 'StopWords_Geographic.txt',
 'StopWords_Names.txt']

In [175]:
stop_words = set()

for file in stopword_files:
    with open(os.path.join(folder_path2, file),'r',encoding='ISO-8859-1') as  file:
        
        words = word_tokenize(file.read())
        unique_words = {word.lower() for word in words if re.match(r'^[a-zA-Z]+$', word)}
        stop_words.update(unique_words)
print(len(stop_words))
stop_words

12778


{'dewey',
 'callis',
 'mariella',
 'bobo',
 'layton',
 'messenger',
 'carley',
 'bourg',
 'velasquez',
 'pennie',
 'menchaca',
 'sheffield',
 'eller',
 'saudi',
 'wylie',
 'petterson',
 'kastner',
 'braswell',
 'far',
 'savage',
 'zelaya',
 'greco',
 'barrier',
 'neither',
 'cockerham',
 'celena',
 'breeding',
 'ned',
 'mei',
 'murrell',
 'westbrooks',
 'livia',
 'ocean',
 'kendrick',
 'hecker',
 'karina',
 'weidner',
 'paine',
 'robison',
 'mallett',
 'cisco',
 'estrada',
 'herta',
 'henson',
 'christianson',
 'jenise',
 'harlow',
 'especially',
 'safford',
 'pike',
 'bowles',
 'carrington',
 'ferrer',
 'bruns',
 'qtr',
 'jeffry',
 'kandis',
 'earls',
 'bauman',
 'marisol',
 'myrtis',
 'your',
 'wimmer',
 'edens',
 'cox',
 'caudle',
 'giordano',
 'newland',
 'marc',
 'charmain',
 'vanuatu',
 'matos',
 'emiko',
 'lucila',
 'jazmin',
 'vela',
 'petra',
 'dunne',
 'kuna',
 'cristopher',
 'delfina',
 'shavonda',
 'calhoun',
 'kyles',
 'ciara',
 'raguel',
 'leahy',
 'annually',
 'sharpe',


In [187]:
for keys, tokens in token_dict.items():
    filtered_words = token_dict[keys] 
    
    # Remove stopwords from the list of tokens
    filtered_words = [word for word in filtered_words if word not in stop_words]
    
    token_dict[keys] = filtered_words

for keys, tokens in token_dict.items():
    print(f"{keys} :",len(tokens))
token_dict

blackassign0001.txt : 428
blackassign0002.txt : 616
blackassign0003.txt : 572
blackassign0004.txt : 550
blackassign0005.txt : 307
blackassign0006.txt : 941
blackassign0007.txt : 592
blackassign0008.txt : 426
blackassign0009.txt : 532
blackassign0010.txt : 1505
blackassign0011.txt : 718
blackassign0012.txt : 773
blackassign0013.txt : 263
blackassign0014.txt : 460
blackassign0015.txt : 572
blackassign0016.txt : 572
blackassign0017.txt : 519
blackassign0018.txt : 475
blackassign0019.txt : 762
blackassign0020.txt : 198
blackassign0021.txt : 485
blackassign0022.txt : 165
blackassign0023.txt : 504
blackassign0024.txt : 239
blackassign0025.txt : 360
blackassign0026.txt : 407
blackassign0027.txt : 471
blackassign0028.txt : 469
blackassign0029.txt : 889
blackassign0030.txt : 522
blackassign0031.txt : 778
blackassign0032.txt : 589
blackassign0033.txt : 693
blackassign0034.txt : 523
blackassign0035.txt : 310
blackassign0036.txt : 1
blackassign0037.txt : 300
blackassign0038.txt : 842
blackassign00

{'blackassign0001.txt': ['rising',
  'impact',
  'economy',
  'environment',
  'infrastructure',
  'life',
  'blackcoffer',
  'insights',
  'huge',
  'development',
  'dependence',
  'people',
  'technology',
  'recent',
  'years',
  'development',
  'chatgpt',
  'recent',
  'years',
  'normal',
  'thing',
  'fully',
  'dependent',
  'technology',
  'information',
  'technology',
  'developing',
  'member',
  'developing',
  'rapidly',
  'growing',
  'base',
  'grown',
  'control',
  'centres',
  'information',
  'technology',
  'citiesnoida',
  'uttar',
  'pradesh',
  'emerging',
  'sector',
  'companies',
  'google',
  'microsoft',
  'ibm',
  'infosys',
  'set',
  'companies',
  'noida',
  'market',
  'base',
  'billions',
  'dollars',
  'great',
  'job',
  'boosting',
  'national',
  'economy',
  'establishment',
  'software',
  'companies',
  'made',
  'noida',
  'information',
  'technology',
  'haryana',
  'emerging',
  'hub',
  'companies',
  'google',
  'microsoft',
  'ibm',
  

## 1.2 Positive & Negative words from master dictionary

In [196]:
master_dictionary_folder = "MasterDictionary"

folder_path3 = os.path.join(current_dir,master_dictionary_folder)
print(folder_path3,"\n")
print("files :",os.listdir(folder_path3))

C:\Users\Amruta\Desktop\Blackcoffer NLP Project\MasterDictionary 

files : ['negative-words.txt', 'positive-words.txt']


In [198]:
positive_words = []
negative_words = []

for files in os.listdir(folder_path3):
    if files =='positive-words.txt':
        with open(os.path.join(folder_path3,files),'r',encoding='ISO-8859-1') as f:
            positive_words.extend(f.read().splitlines())
    else:
        with open(os.path.join(folder_path3,files),'r',encoding='ISO-8859-1') as f:
            negative_words.extend(f.read().splitlines())
            

In [201]:
len(positive_words),len(negative_words)

(2006, 4783)

## 1.3 Extracting Derived variables

In [211]:
positive_score_list = []
negative_score_list = []
polarity_score_list = []
subjectivity_score_list  = []

def variable_calculater(tokens):

    # Calculate positive and negative scores
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(-1 for word in tokens if word in negative_words)*-1
    
    # Calculate polarity score
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    
    # Calculate subjectivity score
    subjectivity_score = (positive_score + negative_score) / (len(filtered_words) + 0.000001)
    
    # adding scores
    positive_score_list.append(positive_score)
    negative_score_list.append(negative_score)
    polarity_score_list.append(polarity_score)
    subjectivity_score_list.append(subjectivity_score)


for tokens in token_dict.values():
    variable_calculater(tokens)

In [214]:
len(positive_score_list),len(negative_score_list)

(100, 100)

In [217]:
len(polarity_score_list),len(subjectivity_score_list)

(100, 100)