In [1]:
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def word_tokenizer(text):
    """Tokenize ``text``, drop English stop words, and stem what remains.

    Args:
        text: The string to tokenize.

    Returns:
        A list of Porter-stemmed tokens in their original order. Tokens that
        exactly match an entry of NLTK's English stop-word list are removed
        before stemming.
    """
    # Build the stop-word set once per call instead of once per token;
    # set membership also avoids a linear scan of the list for every token.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in english_stop_words
    ]

In [3]:
def cluster_sentences(sentences, nb_of_clusters=5, random_state=None):
    """Cluster sentences with k-means over their TF-IDF vectors.

    Args:
        sentences: Sequence of strings to cluster.
        nb_of_clusters: Number of clusters requested from ``KMeans``.
        random_state: Forwarded to ``KMeans``. Pass an int to get the same
            clustering on every run; the default ``None`` leaves ``KMeans``
            unseeded.

    Returns:
        A dict mapping every cluster id in ``range(nb_of_clusters)`` to the
        list of indices (into ``sentences``) assigned to that cluster. A
        cluster id that received no sentence maps to an empty list, so
        callers can index any id in that range without a ``KeyError``.
    """
    # NOTE(review): the captured notebook output shows scikit-learn's
    # stop_words consistency warning for this tokenizer/stop_words pairing.
    # The arguments are left unchanged so the extracted features stay the same.
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.001,
                                       lowercase=True)
    # Build the TF-IDF matrix for the sentences.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
    kmeans.fit(tfidf_matrix)

    # Pre-populate every cluster id so that a label k-means never assigned
    # still appears in the result.
    clusters = {cluster_id: [] for cluster_id in range(nb_of_clusters)}
    for sentence_index, label in enumerate(kmeans.labels_):
        # int(): labels_ holds numpy integers; use plain ints as dict keys.
        clusters[int(label)].append(sentence_index)
    return clusters

In [7]:
# Smoke test: cluster five short sentences into three groups and print them.
sentences = [
    "Nature is beautiful",
    "I like green apples",
    "We should protect the trees",
    "Fruit trees provide fruits",
    "Green apples are tasty",
]
nclusters = 3
clusters = cluster_sentences(sentences, nclusters)
for cluster in range(nclusters):
    print("cluster ", cluster, ":")
    # .get(): a cluster id with no members may be missing from the mapping,
    # in which case nothing is listed under it instead of raising KeyError.
    # ``sentence`` is an index into ``sentences``; ``i`` is its rank in the cluster.
    for i, sentence in enumerate(clusters.get(cluster, [])):
        print("\tsentence ", i, ": ", sentences[sentence])

  'stop_words.' % sorted(inconsistent))


cluster  0 :
	sentence  0 :  Nature is beautiful
cluster  1 :
	sentence  0 :  We should protect the trees
	sentence  1 :  Fruit trees provide fruits
cluster  2 :
	sentence  0 :  I like green apples
	sentence  1 :  Green apples are tasty


In [8]:
import pandas as pd
# Read 'aita_clean.csv' into a DataFrame.
df = pd.read_csv(filepath_or_buffer='aita_clean.csv')
# Trailing expression: shows the first rows as the cell's output.
df.head()

Unnamed: 0,id,timestamp,title,body,edited,verdict,score,num_comments,is_asshole
0,arfcfl,1550365421,AITA - Getting Hugged At The Bar,\n\nThis all happened less than an hour ago.\...,False,not the asshole,3,14,0
1,cg8mxn,1563769917,AITA if i don't want to pay my friend 5 dollar...,"So, my friend bought herself, our other friend...",False,asshole,11,42,1
2,bajsje,1554663842,AITA For going to 'expensive' restaurants in s...,It's in Seattle if that matters. It seems peop...,1554686830.0,asshole,657,397,1
3,cq6voc,1565771883,"AITA for wanting ""free gas"", due to an attenda...",I was with my dad and driving the family car w...,1565773989.0,not the asshole,49,40,0
4,bz4m2k,1560207411,AITA I asked my dad to replace the fuel that h...,Am I the asshole for asking my dad to replace ...,False,not the asshole,4,10,0


In [9]:
# Combine title and body into one text per row. Missing values are replaced
# with '' first, because concatenating a string with NaN yields NaN, not text.
text_only = df['title'].fillna('') + ' \n ' + df['body'].fillna('')

In [10]:
# Inspect the container type of the combined text (shown as the cell output).
type(text_only)

pandas.core.series.Series

In [15]:
# Convert the combined-text Series into a plain Python list, one entry per row.
text_only_list = text_only.tolist()

In [18]:
# Cluster the first 100 combined title/body texts into five groups and print them.
nclusters = 5
sublist = text_only_list[:100]
clusters = cluster_sentences(sublist, nclusters)
for cluster in range(nclusters):
    print("cluster ", cluster, ":")
    # .get(): a cluster id with no members may be missing from the mapping,
    # in which case nothing is listed under it instead of raising KeyError.
    # ``sentence`` is an index into ``sublist``; ``i`` is its rank in the cluster.
    for i, sentence in enumerate(clusters.get(cluster, [])):
        print("\tsentence ", i, ": ", sublist[sentence])

  'stop_words.' % sorted(inconsistent))


cluster  0 :
	sentence  0 :  WIBTA if i leave an internshipe after only 2 days because of their inability to commit to the working hours? 
 I just got a 1 month internshipe (unpaid) and its quite far distance from my home but I really wanted to get experience so i accepted i went on the first day and i noticed a general vibes of unprofessionalism but I though it's just in my head. So today I'm supposed to be there before 12 pm and I'm there at 11:30 am but there is literally no one at the company. I'm upset because I invested so much time, effort and money and the whole company can't commit to opening hours. So i left and decided that I'm not suited for this internship. A friend told me that by leaving after only one day I'm not committing to my word and giving up too easy.
	sentence  1 :  AITA for not spending an entire day with my mother in law, every weekend? 
 My husband and I have been married for 5 years. He has a very close relationship with his mother, as he is the only child a