In [1]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 


In [2]:
# Dataset source: https://www.kaggle.com/yufengdev/bbc-fulltext-and-category?select=bbc-text.csv
# "Articles.csv" is resolved against the current working directory.
articles = pd.read_csv(os.path.abspath("Articles.csv"))
articles

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [3]:
# Extra corpus-specific filler words (e.g. reporting verbs, number words,
# weekdays, honorifics) to ignore on top of NLTK's English stop-word list.
newStopWords = [
    "said", "also",
    "one", "two", "three", "four", "five",
    "u", "would", "second", "first",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "mr", "mrs", "ms", "name",
]
# Full stop-word list used by every cell below: NLTK's list followed by the extras.
stop = stopwords.words("english") + newStopWords
def word_count(identifier, individual=False, top_n=25):
    """Return the most frequent non-stop-words of a category or of one article.

    Tokenising approach adapted from
    https://www.kaggle.com/edhirif/word-cloud-alternative-using-nltk

    Parameters
    ----------
    identifier
        When ``individual`` is false, a value of the ``category`` column: the
        text of every article in that category is joined (space separated)
        and counted together. When ``individual`` is true, a row label of
        ``articles`` (looked up with ``.loc``): only that article is counted.
    individual : bool, default False
        Selects between the two interpretations of ``identifier`` above.
    top_n : int or None, default 25
        How many ``(word, count)`` pairs to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, int)
        Lower-cased words with their frequencies, most frequent first.
    """
    if individual:
        article_str = articles.loc[identifier, "text"]
    else:
        # Concatenate every article of the category into one long string.
        in_category = articles["category"] == identifier
        article_str = articles.loc[in_category, "text"].str.cat(sep=' ')

    # A set gives constant-time membership tests; scanning the `stop` list for
    # every token was the dominant cost on large categories.
    stop_set = set(stop)
    # Keep purely alphabetic tokens only, lower-case each one once, then drop stop words.
    lowered = (token.lower() for token in wordpunct_tokenize(article_str) if token.isalpha())
    list_of_words = [word for word in lowered if word not in stop_set]

    wordfreqdist = nltk.FreqDist(list_of_words)  # word -> number of occurrences
    return wordfreqdist.most_common(top_n)


# Using TF-IDF to Find the Keywords of Each Article
### While we will be using functions to calculate the TF-IDF values of each word in an article, it would be wise to review the logic behind the functions.
## Finding the IDF weights. 
### We will first find the most common words in the collection of articles. The more common the word, the smaller its IDF weight and consequently the smaller its TF-IDF value will be. <br>While there is a function that calculates the IDF weights for us, it would be wise to review the logic behind it. <br>The IDF [formula](http://www.tfidf.com/) is as follows: IDF(t) = log_e(Total number of documents / Number of documents containing term t). In other words, the IDF weight for each term is calculated by taking the natural log of the total number of documents divided by the number of documents in which the term is present. As a word appears in more documents, it becomes less helpful in differentiating between articles, and thus will have a smaller IDF weight.
## Finding the TF weights
### Next, we find the TF score for each word in a document. The program will find this using the following [formula](http://www.tfidf.com/): TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document). The more frequently a word appears in a document, the higher its TF score will be.
## Finding the TF-IDF weights. 
### To find the TF-IDF weights, we will be using the following formula: TF-IDF=TF * IDF.  The words with the highest TF-IDF weights will be our keywords.
#### This will be accomplished by following this [guide](https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.YTZxp9NKj0t). Unless otherwise stated, the code will be from this guide.

### We will first calculate the IDF weights for all the words in the corpus. 

In [4]:
def idf_calculator(category):
    """Fit word counts and IDF weights on every article of one category.

    The previous version also built a DataFrame of the IDF weights, sorted it
    without keeping the result, and then discarded it. That dead code called
    ``cv.get_feature_names()``, which no longer exists in scikit-learn >= 1.2,
    so the function crashed there for no benefit. It has been removed; the
    weights remain available as ``tfidf_transformer.idf_`` (one value per
    vocabulary term, in the column order of ``word_count_vector``).

    Parameters
    ----------
    category
        Value of the ``category`` column of ``articles`` to restrict the corpus to.

    Returns
    -------
    tuple
        ``(cv, word_count_vector, tfidf_transformer)``: the fitted
        ``CountVectorizer``, the sparse document-term count matrix it produced
        (one row per article of the category), and the ``TfidfTransformer``
        fitted on that matrix.

    Raises
    ------
    ValueError
        If no article belongs to ``category``.
    """
    category_texts = articles.loc[articles["category"] == category, "text"]
    if category_texts.empty:
        # Fail early with a clear message instead of scikit-learn's
        # "empty vocabulary" ValueError.
        raise ValueError("No articles found for category {!r}.".format(category))

    # Count how often each non-stop-word occurs in each article.
    cv = CountVectorizer(stop_words=stop)
    word_count_vector = cv.fit_transform(category_texts)

    # Learn one IDF weight per term from the count matrix.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    return cv, word_count_vector, tfidf_transformer


## Finding the TF and TF-IDF weights for the words in an article.
### We will now run a test to see how well our program does at finding the keywords of an article. We will use the first article in the politics category. This article discusses a kerfuffle between Michael Howard and Peter Hain, prominent members of the Conservative Party and the Labour Party respectively. 

In [5]:
def tf_idf(doc, category, num_keywords=5):
    """Print one article and let the user interactively curate its TF-IDF keywords.

    Relies on the notebook-level ``cv``, ``word_count_vector`` and
    ``tfidf_transformer`` returned by ``idf_calculator``. They must have been
    fitted on the same ``category`` that is passed here, otherwise the printed
    article and the scored row refer to different documents.

    Parameters
    ----------
    doc : int
        Position of the article within its category (row of ``word_count_vector``).
    category
        Value of the ``category`` column; used only to fetch the text to print.
    num_keywords : int, default 5
        How many top-scoring words are offered as keywords.

    Returns
    -------
    None
        Everything is reported through ``print``; the user ends the dialogue
        by answering ``no``.
    """
    # Only the requested row is transformed: scikit-learn applies the IDF scaling
    # and the normalisation per document, so the scores match transforming the
    # whole corpus and slicing afterwards.
    document_vector = tfidf_transformer.transform(word_count_vector[doc])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on old releases that lack get_feature_names_out().
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    # Flatten the 1 x vocabulary sparse row into a plain array: one score per word.
    scores = document_vector.toarray().ravel()
    df = pd.DataFrame(scores, index=feature_names, columns=["tfidf"])
    df = df.sort_values(by=["tfidf"], ascending=False)
    print ("Here is the article text: ",articles[articles["category"]==category]["text"].iloc[doc])

    # Ask the user whether they are satisfied with the keywords. A rejected
    # keyword is dropped and the next highest ranking word takes its place.
    # The removal only affects this call's local dataframe.
    response = None
    while response != "no":
        kws = list(df.index[:num_keywords])  # words with the highest TF-IDF scores
        print ("The keywords for the article are: ",kws)
        response = input("Would you like to remove any of these keywords? If you do, the word with the next highest TF-IDF score will be selected to replace it. If you are satisfied with the keywords presented, please type 'no'. \n")
        # Strip spaces and lower-case: the vocabulary is lower-case, and "No" should exit too.
        response = response.replace(" ", "").lower()
        if response in kws:
            df = df.drop(response, axis=0)
        elif response == "no":
            print ("Cool!")
        else:  # Neither 'no' nor one of the listed keywords.
            print ("Sorry, I didn't catch that.\n")

# Fit the vectorizer and IDF weights on the politics articles, then run the
# keyword dialogue for the first article of that category.
cv, word_count_vector, tfidf_transformer = idf_calculator("politics")
tf_idf(0, "politics")


Here is the article text:  howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory leader is acting like an  attack mongrel  shows labour is  rattled  by the opposition.  in an upbeat speech to his party s spring conference in brighton  he said labour s campaigning tactics proved the tories were hitting home. mr hain made the claim about tory tactics in the anti-terror bill debate.  something tells me that someone  somewhere out there is just a little bit rattled   mr howard said. mr hain  leader of the commons  told bbc radio four s today programme that mr howard s stance on the government s anti-terrorism legislation was putting the country at risk. he then accused the tory leader of behaving like an  attack mongrel  and  playing opposition for opposition sake .  mr howard told his party that labour would  do anything  say anything  claim anything to cling on to office at all costs .  so far this year they have compared me to fagin  to shylock and

## The program does a good job of finding descriptive keywords for the article. The article discusses Howard's response to Hain calling him a mongrel. Howard says that Hain is calling him a mongrel, and has called him other names in the past, because the Labour Party is "rattled by the opposition." The rest of the article provides a background behind the tension between these two parties. The keywords tell us who the article is about and reference prominent talking points from the article. However, the 5th keyword, "bit", is not very descriptive, as it does not play a large role in the article. It was likely chosen due to its large IDF value. As a result, the user has the option to remove it from the list and replace it with the next highest ranking word.

In [8]:
"""
The program asks the user for the category so that the IDF calculator knows what to filter the dataframe by. 
Different types of articles will likely have different commonly used words. This is important because it
could offset the IDF score. A word like "country" would likely be common in political articles, but not as common 
in tech-related articles. As a result, the word "country" would have a lower IDF score if we just used political 
articles than had we used all articles. The former is better because it tells us that most political articles include
the word "country", so if its common in a specific article, it won't necessarily make a good keyword. Had we used
all articles, the IDF score (and consequently TF-IDF score) for "country" would be higher, making it more likely
to be selected as a keyword for an article despite not being very descriptive.
"""
# Interactive keyword explorer: ask for a category, show its articles, ask for an
# article index, then run the TF-IDF keyword dialogue for that article.
# The loop repeats until one article has been processed or the user types 'cancel'.
valid = False
while not valid:
    print ("To view the keywords of an article, select a category, then select an article.")
    available_categories = list(articles["category"].unique())
    print (available_categories)
    category = input("Please choose a category, or type 'cancel'. ").replace(" ", "").lower()
    if category == "cancel":
        break
    if category not in available_categories:
        # Reject unknown categories up front rather than displaying an empty
        # table and failing later inside the vectorizer.
        print ("Sorry, that is not one of the available categories. Please try again.")
        continue

    filtered_articles = articles[articles["category"]==category].reset_index(drop=True)
    display(filtered_articles)
    num = input("Here are the available articles from this category, please select an article by its index. ").replace(" ", "")
    # Only the user's input is validated here. The previous bare `except:` also
    # swallowed KeyboardInterrupt and genuine errors raised inside
    # idf_calculator/tf_idf, reporting all of them as an "invalid number".
    try:
        num = int(num)
    except ValueError:
        print ("Hmmm. It seems like an invalid number was provided. Please try again.")
        continue
    if not 0 <= num < len(filtered_articles):
        # Index must be one of the row labels shown in the table above.
        print ("Hmmm. It seems like an invalid number was provided. Please try again.")
        continue

    # Fit the counts and IDF weights on the selected category only, then score the chosen article.
    cv, word_count_vector, tfidf_transformer = idf_calculator(category)
    tf_idf(num, category)
    valid = True

To view the keywords of an article, select a category, then select an article.
['tech', 'business', 'sport', 'entertainment', 'politics']
Please choose a category, or type 'cancel'. sport


Unnamed: 0,category,text
0,sport,tigers wary of farrell gamble leicester say ...
1,sport,yeading face newcastle in fa cup premiership s...
2,sport,henman hopes ended in dubai third seed tim hen...
3,sport,wilkinson fit to face edinburgh england captai...
4,sport,moya emotional after davis cup win carlos moya...
...,...,...
506,sport,newry to fight cup exit in courts newry city a...
507,sport,owen delighted with real display michael owen ...
508,sport,time to get tough on friendlies for an intern...
509,sport,davies favours gloucester future wales hooker ...


Here are the available articles from this category, please select an article by its index. 305
Here is the article text:  harinordoquy suffers france axe number eight imanol harinordoquy has been dropped from france s squad for the six nations match with ireland in dublin on 12 march.  harinordoquy was a second-half replacement in last saturday s 24-18 defeat to wales. bourgoin lock pascal pape  who has recovered from a sprained ankle  returns to the 22-man squad. wing cedric heymans and ludovic valbon come in for aurelien rougerie and jean-philippe grandclaude.  rougerie hurt his chest against wales while grandclaude was a second-half replacement against both england and wales. valbon  capped in last june s tests against the united states and canada  was a second half replacement in the win over scotland.  france coach bernard laporte said harinordoquy had been axed after a poor display last weekend.  imanol has been dropped from the squad because the least i can say is that he didn t