In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, precision_recall_fscore_support

import re

In [2]:
# Load the article table from disk; later cells read its 'Title' and 'Text' columns.
data = pd.read_csv('output.csv')

In [5]:
# Build the stemmed corpus: one "title + text" string per row of `data`.
ps = PorterStemmer()
STOP = set(stopwords.words('english'))
textCorpus = []
numberOfDocs = len(data)

# Maps each stem to the first surface word that produced it, so stems can be
# shown later as readable words.
stemToTag = dict()


def _stem_tokens(raw):
    """Return the stems of the non-stopword tokens in ``raw``.

    Non-letters are replaced by spaces, the result is lower-cased and split on
    whitespace; stopwords and one-character tokens are dropped and the rest
    are Porter-stemmed. As a side effect, the first word seen for each stem is
    recorded in ``stemToTag``.

    Args:
        raw: Cell value from the DataFrame. Missing values (NaN/None) are
            treated as an empty string; other non-string values are converted
            with ``str``.

    Returns:
        list[str]: Stems in their original token order.
    """
    if not isinstance(raw, str):
        # pandas represents an empty CSV cell as NaN (a float), which
        # re.sub would reject with a TypeError.
        raw = '' if pd.isna(raw) else str(raw)

    stems = []
    for w in re.sub('[^a-zA-Z]', ' ', raw).lower().split():
        if w in STOP or len(w) <= 1:
            continue
        stem = ps.stem(w)  # stem once and reuse
        stems.append(stem)
        stemToTag.setdefault(stem, w)
    return stems


for _, row in data.iterrows():
    # Text is processed before the title on purpose: stemToTag keeps the first
    # word seen for a stem, and this order matches the original mapping.
    text = ' '.join(_stem_tokens(row['Text']))
    title = ' '.join(_stem_tokens(row['Title']))

    # Exactly one entry per row, so list position i corresponds to row i of `data`.
    processedText = title + ' ' + text
    textCorpus.append(processedText)

In [6]:
def tfidf_calculator(corpusIndex):
    """Compute the tf-idf vector of one document from the global ``textCorpus``.

    A CountVectorizer (max_df=0.8, max_features=2000) and a TfidfTransformer
    are fitted on the whole ``textCorpus`` on every call, then applied to the
    document at ``corpusIndex``.

    Args:
        corpusIndex: Position of the document inside ``textCorpus``.

    Returns:
        tuple: ``(feature_names, tf_idf_vector, stemmedArticle)`` where
        ``feature_names`` is the vectorizer vocabulary, ``tf_idf_vector`` is
        the transformer output for the single document, and
        ``stemmedArticle`` is the document string itself.
    """
    count_model = CountVectorizer(max_features=2000, max_df=0.8)
    corpus_counts = count_model.fit_transform(textCorpus)

    tfidf_model = TfidfTransformer(use_idf=True, smooth_idf=True)
    tfidf_model.fit(corpus_counts)

    feature_names = count_model.get_feature_names_out()
    stemmedArticle = textCorpus[corpusIndex]

    article_counts = count_model.transform([stemmedArticle])
    tf_idf_vector = tfidf_model.transform(article_counts)
    return feature_names, tf_idf_vector, stemmedArticle

In [7]:
def top_n_tags(feature_names, sorted_items, topn=10):
    """Map the leading feature names to their tf-idf scores.

    Args:
        feature_names: Sequence indexed by feature id that gives each
            feature's name.
        sorted_items: Sliceable sequence of ``(feature id, score)`` pairs.
            It is used in the order given; this function does not sort.
        topn: Number of leading pairs to keep (default 10).

    Returns:
        dict: ``{feature name: score}`` for the first ``topn`` pairs, in the
        same order as ``sorted_items``.
    """
    return {feature_names[idx]: score for idx, score in sorted_items[:topn]}

In [8]:
# Ask for an exact article title and print up to 20 keyword tags for it.
articleToTag = input("Enter the name of the Article to Tag: ")

# Evaluate the title comparison once and reuse it for both the existence
# check and the lookup.
titleMatches = (data['Title'] == articleToTag).to_numpy()
if not titleMatches.any():
  print("Article not in Database")
else:
  # textCorpus is a plain list filled in row order, so it has to be addressed
  # by row position, not by index label (the two only coincide for a default
  # RangeIndex). argmax on a boolean array gives the first True position.
  articleIndex = int(titleMatches.argmax())
  feature_names, tf_idf_vector, stemmedArticle = tfidf_calculator(articleIndex)

  # Order the non-zero entries by score (ties broken by column id), best first.
  coordinate_vector = tf_idf_vector.tocoo()
  sorted_items = sorted(
      zip(coordinate_vector.col, coordinate_vector.data),
      key=lambda x: (x[1], x[0]),
      reverse=True,
  )

  keywords = top_n_tags(feature_names, sorted_items, 20)
  top_tags = list(keywords)

  # Turn each stem back into the first word that produced it.
  tags = [stemToTag[s].capitalize() for s in top_tags]
  print(f"Tags for the Article: {articleToTag}")
  print(tags)

Article not in Database
