In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
max_words = 15  # number of top-ranked TF-IDF terms turned into count features per group
train_frames = []  # one count matrix per group; concatenated once after the loop

for group in range(1, 130):
    train_data = pd.read_csv(f'TO_GROUPS_text/text_group_{group}.csv', encoding='utf-8')
    train_data = train_data.fillna('')  # a missing text becomes an empty document
    documents = train_data.text

    # Fit TF-IDF on this group's documents only and rank its terms by mean weight.
    vect = TfidfVectorizer()
    tfidf_matrix = vect.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; use it only as a
    # fallback for versions that predate get_feature_names_out().
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
    voc = np.array(df.mean(axis=0).sort_values(ascending=False)[:max_words].index)

    # intersect_w[i, j] = occurrences of the j-th ranked term in document i.
    # Columns stay zero when the group has fewer than max_words distinct terms.
    # NOTE(review): documents are split on whitespace as-is, while the vectorizer's
    # default analyzer is expected to lower-case and drop punctuation; terms may be
    # under-counted unless the text is already normalised -- confirm.
    intersect_w = np.zeros((documents.shape[0], max_words))
    for i, title in enumerate(documents):
        words = title.strip().split()
        for j, word in enumerate(voc):
            intersect_w[i, j] = words.count(word)

    train_frames.append(pd.DataFrame(intersect_w, columns=np.arange(max_words)))
    print(f"\r{group} groups are processed...", end='', flush=True)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every
# iteration; a single concat builds the same table (per-group row labels are kept).
tf_feat = pd.concat(train_frames)
print('\n', tf_feat.shape)

In [53]:
# Write the train features to CSV. The path is handed straight to pandas so that it
# opens the file with the newline handling it needs; a text handle opened without
# newline='' yields doubled line endings on Windows.
# NOTE(review): the file name embeds `group`, which is just the loop variable left
# behind by the most recently executed feature-building loop (129 right after the
# train loop) -- confirm that this is the intended name.
tf_feat.to_csv(f'tfidf_texts_train_raw_{group}.csv', index=False, encoding='utf-8')

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# train term-count features.
tf_feat.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,48.530612,67.035714,28.678571,24.02551,21.897959,20.688776,29.632653,15.321429,14.622449,17.066327,14.464286,9.535714,5.622449,8.086735,15.112245
std,125.234402,185.970617,77.644712,41.24852,61.076915,61.57547,109.844682,41.564331,35.040494,45.109788,37.2237,15.278399,21.042577,21.914464,36.002103
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,7.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.75
50%,4.0,14.0,9.0,7.0,0.0,1.0,1.5,0.0,1.0,4.0,2.0,4.0,0.0,0.0,4.0
75%,29.0,46.25,25.5,23.0,11.0,11.0,15.5,11.5,10.25,12.0,7.25,10.25,3.0,4.0,12.25
max,945.0,1904.0,765.0,232.0,531.0,575.0,1308.0,436.0,235.0,481.0,304.0,99.0,258.0,140.0,327.0


In [55]:
max_words = 15  # number of top-ranked TF-IDF terms turned into count features per group
test_frames = []  # one count matrix per group; concatenated once after the loop

for group in range(130, 310):
    test_data = pd.read_csv(f'TO_GROUPS_text/text_group_{group}.csv', encoding='utf-8')
    test_data = test_data.fillna('')  # a missing text becomes an empty document
    documents = test_data.text

    # Fit TF-IDF on this group's documents only and rank its terms by mean weight.
    vect = TfidfVectorizer()
    tfidf_matrix = vect.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; use it only as a
    # fallback for versions that predate get_feature_names_out().
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
    voc = np.array(df.mean(axis=0).sort_values(ascending=False)[:max_words].index)

    # intersect_w[i, j] = occurrences of the j-th ranked term in document i.
    # Columns stay zero when the group has fewer than max_words distinct terms.
    # NOTE(review): documents are split on whitespace as-is, while the vectorizer's
    # default analyzer is expected to lower-case and drop punctuation; terms may be
    # under-counted unless the text is already normalised -- confirm.
    intersect_w = np.zeros((documents.shape[0], max_words))
    for i, title in enumerate(documents):
        words = title.strip().split()
        for j, word in enumerate(voc):
            intersect_w[i, j] = words.count(word)

    test_frames.append(pd.DataFrame(intersect_w, columns=np.arange(max_words)))
    print(f"\r{group} groups are processed...", end='', flush=True)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every
# iteration; a single concat builds the same table (per-group row labels are kept).
tf_feat_test = pd.concat(test_frames)
# Report the shape of the test features built here (the original printed the
# train frame's shape by mistake).
print('\n', tf_feat_test.shape)

(16627, 15)


In [56]:
# Write the test features to CSV. The path is handed straight to pandas so that it
# opens the file with the newline handling it needs; a text handle opened without
# newline='' yields doubled line endings on Windows.
# NOTE(review): the file name says "train" although this frame holds the test
# features, and it embeds `group`, the loop variable left behind by the most
# recently executed loop (309 right after the test loop). The name is kept
# unchanged in case other code reads this path -- confirm and rename if safe.
tf_feat_test.to_csv(f'tfidf_texts_train_raw_{group}.csv', index=False, encoding='utf-8')

In [57]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# test term-count features.
tf_feat_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0
mean,0.430625,0.326758,0.252962,0.203224,0.16786,0.142479,0.115655,0.096349,0.093583,0.081614,0.072773,0.066157,0.065857,0.063451,0.057978
std,0.670238,0.664425,0.506308,0.464829,0.409976,0.435098,0.343049,0.338376,0.380758,0.330517,0.328664,0.261994,0.323959,0.300973,0.257231
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.0,16.0,14.0,13.0,7.0,15.0,7.0,14.0,15.0,13.0,13.0,4.0,15.0,11.0,6.0
