In [28]:
!pip install fuzzywuzzy



In [29]:
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm

In [30]:
# Load the news articles into a DataFrame. The path is an absolute '/content/...'
# location, so the CSV must already be present there when this cell runs.
news_data = pd.read_csv('/content/filtered_news.csv')
# Show the available column names; the pipeline below reads the 'News' column.
print(news_data.columns)
# Disabled: narrowing the frame to the 'News' column alone. The full frame is
# kept because the pipeline indexes it with news_data['News'].
#news_data = news_data['News']
# Load the features: the 'Article' column of the feature pool as a plain list.
features = pd.read_csv('/content/Feature_Pool.csv')['Article'].tolist()
# Last expression of the cell: displays the first row as a quick sanity check.
news_data[:1]

Index(['Headline', 'DateTime', 'Source', 'News', 'HeadNews_clean'], dtype='object')


Unnamed: 0,Headline,DateTime,Source,News,HeadNews_clean
0,Are GDP growth prospects bright?,2021-02-08,Tribune,"Despite many positives, LNG crisis and fiscal ...",are GDP growth prospects bright? despite many ...


In [32]:
# Feature Tagging Tool
def tag_features(article, features, similarity_threshold=80):
    """Return the features that fuzzily match somewhere inside an article.

    Each feature is compared case-insensitively against the article with
    ``fuzz.partial_ratio``; a feature is kept when its score is greater than
    or equal to ``similarity_threshold``.

    Args:
        article: Article text. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell, or None) are treated as having
            no text and yield no tags.
        features: Iterable of feature strings. Non-string entries are skipped.
        similarity_threshold: Minimum score (inclusive) for a feature to be
            attached as a tag.

    Returns:
        list: The matching features, in their original order and original
        casing.
    """
    # A missing article cannot match anything; bail out instead of crashing
    # on `.lower()`.
    if not isinstance(article, str):
        return []

    # Lower-case the article once rather than once per feature.
    article_text = article.lower()

    tags = []
    for feature in features:
        # Missing cells in the feature pool arrive as NaN/None; ignore them.
        if not isinstance(feature, str):
            continue
        similarity = fuzz.partial_ratio(feature.lower(), article_text)
        if similarity >= similarity_threshold:
            tags.append(feature)
    return tags

# News Filtering Tool
def filter_cement_news(article, tags):
    """Decide whether an article is relevant to the cement sector.

    An article is relevant when its text contains one of the cement keywords
    (case-insensitive substring match) or when it carries at least one tag.

    Args:
        article: Article text. Non-string values (NaN/None from an empty CSV
            cell) are treated as containing no keywords.
        tags: Sized collection of tags already attached to the article.

    Returns:
        bool: True if the article should be kept, False otherwise.
    """
    cement_keywords = ['cement', 'construction', 'infrastructure', 'building materials']

    if isinstance(article, str):
        # Lower-case once instead of once per keyword.
        article_text = article.lower()
        if any(keyword in article_text for keyword in cement_keywords):
            return True

    # No keyword hit (or no text at all): fall back to the tags.
    return len(tags) > 0

# Function to calculate similarity between articles using FuzzyWuzzy
def calculate_similarity(articles, threshold=80):
    """Find every pair of articles whose fuzzy similarity exceeds a threshold.

    All unordered pairs are compared with ``fuzz.ratio``, so the work grows
    quadratically with the number of articles; progress over the outer loop
    is reported through ``tqdm``.

    Args:
        articles: Article texts, either a pandas Series or any other iterable
            of strings. Items are addressed by position, not by index label.
        threshold: A pair is reported only when its score is strictly greater
            than this value.

    Returns:
        list[tuple]: ``(i, j, similarity)`` tuples with ``i < j``, where ``i``
        and ``j`` are positions in ``articles``.
    """
    # Materialise once: plain list indexing avoids a pandas `.iloc` lookup on
    # every comparison and lets non-Series inputs work too.
    texts = list(articles)
    count = len(texts)

    similar_pairs = []
    for i in tqdm(range(count)):
        left = texts[i]  # invariant for the whole inner loop
        for j in range(i + 1, count):
            similarity = fuzz.ratio(left, texts[j])
            if similarity > threshold:
                similar_pairs.append((i, j, similarity))
    return similar_pairs

# Function to remove similar articles
def remove_similar_articles(df, similar_pairs):
    """Drop the shorter article of every similar pair.

    For each ``(i, j, score)`` pair the row with the shorter 'News' text is
    marked for removal; when both texts have the same length, row ``j`` is
    removed.

    Args:
        df: DataFrame with a 'News' column of strings.
        similar_pairs: Iterable of ``(i, j, score)`` tuples where ``i`` and
            ``j`` are row positions in ``df`` (not index labels).

    Returns:
        DataFrame: A new frame with the marked rows removed. Remaining rows
        keep their order and index labels; ``df`` itself is not modified.
    """
    # NOTE(review): a row can be removed because of a partner that is itself
    # removed by another pair; this matches the previous behaviour — confirm
    # it is intended.
    news = df['News'].tolist()  # one extraction instead of a row Series per lookup

    to_remove = set()
    for i, j, _ in similar_pairs:
        if len(news[i]) >= len(news[j]):
            to_remove.add(j)
        else:
            to_remove.add(i)

    # Select survivors by position. Dropping by label would also delete any
    # other rows that happen to share an index label with a removed row.
    keep_positions = [pos for pos in range(len(df)) if pos not in to_remove]
    return df.iloc[keep_positions]

# Integrated Pipeline
def news_processing_pipeline(news_data, features):
    """Tag, filter and de-duplicate news articles.

    Steps:
        1-2. Attach to every article the features that fuzzily match its
             'News' text (``tag_features``).
        3-4. Keep only the articles judged relevant by ``filter_cement_news``.
        5.   Find pairs of near-duplicate articles (``calculate_similarity``).
        6.   Drop the shorter article of each pair
             (``remove_similar_articles``).

    Args:
        news_data: DataFrame with a 'News' column. It is not modified; the
            tagging happens on a derived frame.
        features: Iterable of feature strings used for tagging.

    Returns:
        DataFrame: The surviving articles with the columns 'News' and 'Tags'.
    """
    print("Step 1 & 2: Feature Tagging")
    tags = news_data['News'].apply(lambda text: tag_features(text, features))
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    tagged_news = news_data.assign(Tags=tags)

    print("Step 3 & 4: News Filtering")
    # An explicit boolean Series keeps the mask well-typed even for an empty
    # frame, where a row-wise `apply(axis=1)` would return a DataFrame instead.
    is_relevant = pd.Series(
        [
            filter_cement_news(text, article_tags)
            for text, article_tags in zip(tagged_news['News'], tagged_news['Tags'])
        ],
        index=tagged_news.index,
        dtype=bool,
    )
    filtered_news = tagged_news[is_relevant]

    print("Step 5: Calculating Similarity")
    similar_pairs = calculate_similarity(filtered_news['News'])

    print("Step 6: Removing Similar Articles")
    final_news = remove_similar_articles(filtered_news, similar_pairs)

    return final_news[['News', 'Tags']]

# Run the pipeline
processed_news = news_processing_pipeline(news_data, features)

# Save the processed news to a CSV file (written to the current working directory)
processed_news.to_csv('processed_news.csv', index=False)

print("News processing pipeline completed. Results saved to 'processed_news.csv'.")

# Print statistics
total_articles = len(processed_news)
articles_with_tags = len(processed_news[processed_news['Tags'].apply(len) > 0])
# Guard the ratio: if no article survives the pipeline, report 0% instead of
# raising ZeroDivisionError.
tagged_percentage = articles_with_tags / total_articles * 100 if total_articles else 0.0
print(f"\nTotal relevant articles: {total_articles}")
print(f"Articles with tags: {articles_with_tags}")
print(f"Percentage of articles with tags: {tagged_percentage:.2f}%")

# Print a sample of the processed news with tags
print(processed_news.head().to_string())

Step 1 & 2: Feature Tagging
Step 3 & 4: News Filtering
Step 5: Calculating Similarity


100%|██████████| 160/160 [03:52<00:00,  1.46s/it]

Step 6: Removing Similar Articles
News processing pipeline completed. Results saved to 'processed_news.csv'.

Total relevant articles: 154
Articles with tags: 56
Percentage of articles with tags: 36.36%
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             


