In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import contractions

from tqdm import tqdm
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance

# KMeans

Below is an implemented KMeans algorithm on Word2Vec embeddings of the WikiHow article data. KMeans is an unsupervised learning algorithm, meaning we do not care about the "Summary" columns during training. The pipeline is summarized as follows:
- Preprocess Text (Remove stopwords, short words, punctuation, etc.)
- Tokenize Data
- Embed data into Word2Vec
- Pass embedded vectors into KMeans

In [4]:
""" Here we set text to lower case, remove plurals, 
    expand contractions, remove punctuation, remove stopwords, and remove short words 
    (could also remove parentheticals)"""

stop = set(stopwords.words('english'))
def clean_text(text):
    ret = text.lower()
    ret = contractions.fix(text)
    ret = re.sub(r'\([^)]*\)', '', ret)
    ret = re.sub('"','', ret)
    ret = re.sub(r"'s\b","", ret)
    ret = re.sub("[^a-zA-Z]", " ", ret) 
    
    #Remove any words shorter than 2 letters
    tokens = [w for w in ret.split() if not w in stop]
    long_words=[]
    for i in tokens:
        if len(i)>=3:                 
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [7]:
#Loading Data
data = pd.read_csv('data/cleaned_data.csv')
data = data[['Summary', 'Text']]

#Create corpus for TF-IDF of summaries
corpus = data["Text"]
corpus = corpus.apply(clean_text)

#Create Sample Text Doc
test = data['Text'][0]
text = test.replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')

In [22]:
test = data['Text'][0]
label = data['Summary'][0]
text = test.replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')
lbl = label.replace('\\', '').replace('/', '').replace('.,', '. ').replace('.;,', '. ')

In [9]:
sentence = sent_tokenize(text)
clean = []
for sen in sentence:
    clean.append(clean_text(sen))

In [10]:
#Word2Vec
from gensim.models import Word2Vec
from nltk.corpus import stopwords

words = []
all_words = [i.split() for i in clean]
model = Word2Vec(all_words, min_count=1, vector_size=300)

sent_vector=[]
for i in clean:
    plus=0
    for j in i.split():
        plus+= model.wv[j]
    plus = plus/len(i.split())
    sent_vector.append(plus)

The choice of using 6 clusters comes from the average length of the summaries. We cannot tune this parameter because it directly affects our ROUGE-1 score, i.e. larger summaries increase the likelihood of overlap.

In [35]:
#Performing KMeans with 5 Clusters
n_clusters = 6
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(sent_vector)

my_list=[]
for i in range(n_clusters):
    my_dict={}
    
    for j in range(len(y_kmeans)):
        
        if y_kmeans[j]==i:
            my_dict[j] = distance.euclidean(kmeans.cluster_centers_[i],sent_vector[j])
    min_distance = min(my_dict.values())
    my_list.append(min(my_dict, key=my_dict.get))

result = ""
for i in sorted(my_list):
    result += sentence[i] + " "
print(result)

clean each night. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying.Even if the rest of your studio is a bit disorganized, an organized workspace will help you get down to business every time you want to make art. You can even use it for smaller areas. Once a month, do a purge of your studio. Artists are constantly making new things, experimenting, and making a mess. Toss it. 


# Scoring

For scoring, I first extract the keywords in the output from KMeans and the human-written summary (the label). Then I look at the overlap of those keywords to determine how many match -- this will give us an accuracy and precision score, and thus an F1 score. I will also calculate the ROUGE1 score between both the concatenated list of keywords and the summaries themselves.

In [23]:
class TF_IDF():
    def __init__(self, corpus):
        self.text = corpus
        self.stopwords = set(stopwords.words("english"))
        self.cv = CountVectorizer(max_df=0.85, stop_words=self.stopwords)
        self.wordcount = self.cv.fit_transform(corpus)
    
        self.transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        self.transformer.fit(self.wordcount)
    
    def sort_vals(self, matrix):
        tuples = zip(matrix.col, matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
    
    def extract_top_k(self, feature_names, items, k=10):
        items = items[:k]

        scores = []
        features = []

        for idx, score in items:
            scores.append(round(score, 3))
            features.append(feature_names[idx])
        
        results = {}
        for idx in range(len(features)):
            results[features[idx]] = scores[idx]
        
        return results

    def extract_keywords(self, doc, k=10):
        feature_names = self.cv.get_feature_names()
        tf_idf_vector = self.transformer.transform(self.cv.transform([doc]))

        sort_items = self.sort_vals(tf_idf_vector.tocoo())
        keywords = self.extract_top_k(feature_names, sort_items, k)

        print("\nDocument")
        print(doc)
        print("\nKeywords")
        for k in keywords:
            print(k, keywords[k])
        
        return keywords

In [36]:
tf = TF_IDF(corpus)
res_keyword = tf.extract_keywords(result, 10)
lbl_keyword = tf.extract_keywords(lbl, 10)




Document
clean each night. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying.Even if the rest of your studio is a bit disorganized, an organized workspace will help you get down to business every time you want to make art. You can even use it for smaller areas. Once a month, do a purge of your studio. Artists are constantly making new things, experimenting, and making a mess. Toss it. 

Keywords
workspace 0.42
studio 0.385
tidying 0.252
purge 0.245
disorganized 0.233
endeavor 0.223
experimenting 0.205
artists 0.179
making 0.166
toss 0.163

Document
Keep related supplies in the same area. Make an effort to clean a dedicated workspace after every session. Place loose supplies in large, clearly visible containers. Use clotheslines and clips to hang sketches, photos, and reference material. Use every inch of the room for storage, especially vertical space. Use chalkboard paint to make space f

In [37]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(result,
                      label)

In [38]:
print(result)
print("\n")
print(lbl)

clean each night. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying.Even if the rest of your studio is a bit disorganized, an organized workspace will help you get down to business every time you want to make art. You can even use it for smaller areas. Once a month, do a purge of your studio. Artists are constantly making new things, experimenting, and making a mess. Toss it. 


Keep related supplies in the same area. Make an effort to clean a dedicated workspace after every session. Place loose supplies in large, clearly visible containers. Use clotheslines and clips to hang sketches, photos, and reference material. Use every inch of the room for storage, especially vertical space. Use chalkboard paint to make space for drafting ideas right on the walls. Purchase a label maker to make your organization strategy semi-permanent. Make a habit of throwing out old, excess, or useless stuff eac

In [39]:
scores

{'rouge1': Score(precision=0.35294117647058826, recall=0.33707865168539325, fmeasure=0.3448275862068966),
 'rougeL': Score(precision=0.16470588235294117, recall=0.15730337078651685, fmeasure=0.16091954022988506)}