### 0. Import Libraries

In [1]:
import random
from collections import Counter

import nltk, re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
import pandas as pd
import numpy as np
from nltk.cluster.kmeans import KMeansClusterer
from sklearn.feature_extraction.text import TfidfVectorizer

### 1. Processing Text

In [3]:
# English stop-word list from the NLTK stopwords corpus.
# NOTE(review): not referenced by any other cell visible here -- confirm whether it is still needed.
stop_words = stopwords.words('english')
# Shared WordNet lemmatizer instance; preprocess_text uses it to lemmatize each token.
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  """Guess the WordNet part-of-speech tag for ``word``.

  The synsets WordNet returns for the word are tallied by their POS tag,
  restricted to noun ("n"), verb ("v"), adjective ("a") and adverb ("r");
  synsets carrying any other tag are ignored. The tag with the highest tally
  is returned. On a tie -- including a word with no synsets at all -- the
  earliest tag in the order n, v, a, r wins, so unknown words come back as "n".
  """
  # Pre-seeding the keys in this order fixes the tie-break used by max() below.
  tally = {"n": 0, "v": 0, "a": 0, "r": 0}
  for synset in wordnet.synsets(word):
    tag = synset.pos()
    if tag in tally:
      tally[tag] += 1
  # max() returns the first key that reaches the highest count.
  return max(tally, key=tally.get)


def preprocess_text(text):
  """Normalize raw text into a single space-separated string of lemmas.

  Every run of non-word characters is replaced by one space, the result is
  lower-cased and tokenized, and each token is lemmatized with the
  part-of-speech tag chosen by ``get_part_of_speech``.
  """
  lowered = re.sub(r'\W+', ' ', text).lower()
  lemmas = []
  for token in word_tokenize(lowered):
    pos_tag = get_part_of_speech(token)
    lemmas.append(normalizer.lemmatize(token, pos_tag))
  return " ".join(lemmas)

### 2. Import and clean the data

In [4]:
# Read the articles CSV, decoding the file as Latin-1.
news_df = pd.read_csv('train_test_set2.csv', encoding='latin-1')

# Features: the Article column after cleaning and lemmatization.
X = news_df["Article"].apply(preprocess_text)
# Labels: the NewsType column, kept in the same row order as X.
y = news_df["NewsType"]

### 3. Transform text into feature vectors

In [5]:
# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the preprocessed articles and transform them in one step.
# The clustering cell later calls .toarray() on this result.
X_tfidf = vectorizer.fit_transform(X)

### 4. Text clustering with KMeans

In [6]:
# Candidate numbers of clusters to compare.
range_n_clusters = [2, 3, 5]

# Seed for the clusterer's random choice of initial means, so that a
# Restart & Run All reproduces the same clusters and printed means.
KMEANS_SEED = 42

# Densify the TF-IDF matrix once: the conversion does not depend on
# n_clusters, so repeating it on every iteration only costs time and memory.
tfidf_dense = X_tfidf.toarray()

for n_clusters in range_n_clusters:
    # A fresh, identically seeded generator per run keeps each n_clusters
    # result independent of the order in which the runs execute.
    kclusterer = KMeansClusterer(
        num_means=n_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        rng=random.Random(KMEANS_SEED),
    )
    # Only the assignments from the last n_clusters value survive the loop.
    assigned_clusters = kclusterer.cluster(tfidf_dense, assign_clusters=True)
    print("For n_clusters =", n_clusters,
          "\nThe means in clusters are :", kclusterer.means(),
          )

For n_clusters = 2 
The means in clusters are : [array([1.23825001e-02, 5.08179922e-04, 8.61327490e-05, ...,
       0.00000000e+00, 2.78828636e-03, 8.98243016e-04]), array([0.0038227 , 0.        , 0.        , ..., 0.00247791, 0.        ,
       0.        ])]
For n_clusters = 3 
The means in clusters are : [array([0.00914082, 0.00066931, 0.00011344, ..., 0.        , 0.00367238,
       0.        ]), array([0.01043051, 0.        , 0.        , ..., 0.        , 0.        ,
       0.00110239]), array([0.00209836, 0.        , 0.        , ..., 0.00532292, 0.        ,
       0.        ])]
For n_clusters = 5 
The means in clusters are : [array([0.00450217, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.01643209, 0.00070363, 0.        , ..., 0.        , 0.0038607 ,
       0.00124372]), array([0.00396547, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.00335982, 0.        , 0.00031008, ..., 0.        , 0.        ,
       0.    