In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def tfidfmatrix(data, vectorizer):
    terms = vectorizer.fit_transform(data)
    return pd.DataFrame(terms.toarray(), columns = vectorizer.get_feature_names())

tfidf_vec = TfidfVectorizer()

In [4]:
liste=["ali veli baba kim", "ali topu tut", "veli aliyi döver", "ali babayı aldı"]

df = pd.DataFrame(liste)
df

Unnamed: 0,0
0,ali veli baba kim
1,ali topu tut
2,veli aliyi döver
3,ali babayı aldı


In [7]:
tfidf_mat = tfidfmatrix(df[0], tfidf_vec)
tfidf_mat

Unnamed: 0,aldı,ali,aliyi,baba,babayı,döver,kim,topu,tut,veli
0,0.0,0.366747,0.0,0.57458,0.0,0.0,0.57458,0.0,0.0,0.453005
1,0.0,0.411378,0.0,0.0,0.0,0.0,0.0,0.644503,0.644503,0.0
2,0.0,0.0,0.617614,0.0,0.0,0.617614,0.0,0.0,0.0,0.486934
3,0.644503,0.411378,0.0,0.0,0.644503,0.0,0.0,0.0,0.0,0.0


In [16]:
top_n = 3
most_common = pd.DataFrame({n: tfidf_mat.T[col].nlargest(top_n).index.tolist() 
                        for n, col in enumerate(tfidf_mat.T)}).T

In [17]:
most_common

Unnamed: 0,0,1,2
0,baba,kim,veli
1,topu,tut,ali
2,aliyi,döver,veli
3,aldı,babayı,ali


In [15]:
df

Unnamed: 0,0
0,ali veli baba kim
1,ali topu tut
2,veli aliyi döver
3,ali babayı aldı


In [18]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1,1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m,sum_t)).reshape(-1,1)
    tf_idf = np.multiply(tf,idf)

    return tf_idf, count

tf_idf, count = c_tf_idf(df[0], m=len(df))

In [26]:
def extract_top_n_words_per_topic(tf_idf,count,df, n=20):
    words = count.get_feature_names()
    labels = list(df[0])
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label:[(words[j], tf_idf_transposed[i][j]) for j in indices [i]][::-1] for i, label in enumerate(labels) }
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby([0])
                     .count()
                     .reset_index())
    return topic_sizes

top_n_words = extract_top_n_words_per_topic(tf_idf, count, df, n = 20)
topic_sizes = extract_topic_sizes(df)
topic_sizes.head(10)

Unnamed: 0,0
0,ali babayı aldı
1,ali topu tut
2,ali veli baba kim
3,veli aliyi döver
