In [1]:
import string
import collections
 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

import pandas as pd

In [2]:
def process_text(text, stem=True):
    """ Tokenize text and stem words removing punctuation """
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens

In [3]:
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering
 

In [9]:
df = pd.read_csv('df_eng_songs.csv', header=0, index_col=0)
df.head(8)

Unnamed: 0,Tytuł,Tekst,Eng
0,Abba Ojcze,Ty wyzwoliłeś nas Panie z kajdan i samych sie...,"You delivered us from chains and ourselves, an..."
1,Alleluja (Niech zabrzmi Panu),"Alleluja, Alleluja, Alleluja, Alleluja. Nie...","Alleluia, Alleluia, Alleluia, Alleluia. Let th..."
2,"Alleluja, Alleluja, Amen Amen, Alleluja","Alleluja, Alleluja, Amen, Amen, Alleluja. ...","Alleluia, Alleluia, Amen, Amen, Alleluia. Let ..."
3,"Blisko, blisko, blisko jesteś","Blisko, blisko, blisko Jesteś Panie mój Blisk...","Close, close, close You are my Lord, Close to ..."
4,Bo góry mogą ustąpić,Bo góry mogą ustąpić i pagórki się zachwiać. ...,Because the mountains can give way and the hil...
5,Bo jak śmierć potężna jest miłość,Bo jak śmierć potężna jest Miłość A zazdrość ...,For how death is powerful Love and jealousy ar...
6,Boże Twa łaska,"Boże Twa łaska nad nami jest, Twoja miłość pr...","God, your grace is upon us, your love is comin..."
7,Była cicha i piękna jak wiosna,"Była cicha i piękna jak wiosna, Żyła prosto, ...","She was as quiet and beautiful as spring, she ..."


In [7]:
articles = list(df['Eng'])
clusters = cluster_texts(articles, 4)
pprint(dict(clusters))

  'stop_words.' % sorted(inconsistent))


{0: [4,
     5,
     9,
     31,
     57,
     59,
     60,
     61,
     62,
     63,
     66,
     73,
     77,
     80,
     83,
     87,
     91,
     96,
     100,
     119,
     122,
     123,
     125,
     127,
     157,
     161,
     165,
     174,
     185],
 1: [0,
     1,
     3,
     8,
     10,
     11,
     12,
     13,
     15,
     16,
     18,
     20,
     21,
     22,
     23,
     24,
     25,
     26,
     28,
     37,
     41,
     43,
     44,
     46,
     49,
     51,
     54,
     64,
     65,
     67,
     68,
     70,
     71,
     72,
     74,
     76,
     79,
     82,
     85,
     86,
     88,
     89,
     90,
     98,
     102,
     107,
     110,
     113,
     114,
     115,
     116,
     120,
     121,
     129,
     131,
     134,
     135,
     136,
     144,
     145,
     147,
     148,
     149,
     151,
     153,
     154,
     155,
     156,
     159,
     163,
     164,
     167,
     169,
     170,
     172,
     173,
     175,
     176

In [8]:
df.iloc[7]['Eng']

'She was as quiet and beautiful as spring, she lived simply, just like us. She brought God to the world, and in the world new days were shining with tears. The mother who understands everything is overwhelmed by each one of us. Mother can see good in us, she is with us at all times. Today, the world needs goodness, To anxiety to overcome and evil. You need warmth, what life will goldeniate, You need God, so people, let us not Him, just like Her. The mother who understands everything is overwhelmed by each one of us. Mother can see good in us, she is with us at all times.'