In [13]:
import pandas as pd
import numpy as np
import nltk
import hazm
from hazm import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.decomposition import FactorAnalysis, PCA, KernelPCA
import pickle
import umap

In [None]:
data = pd.read_csv('data/per.csv')
data.head()

In [None]:
with open('data/stopwords.txt') as stopwords_file:
    stopwords = stopwords_file.readlines()
stopwords = [line.replace('\n', '') for line in stopwords]
stopwords

In [None]:
nltk.download('stopwords')

In [None]:
nltk_stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(nltk_stopwords)
len(stopwords)

In [None]:
stemmer = hazm.Stemmer()

In [None]:
stemmer.stem('کتاب‌ها')

In [None]:
dataset = pd.DataFrame(columns=('title_body', 'category'))
for index, row in data.iterrows():
    title_body = row['Title'] + ' ' + row['Body']
    title_body_tokenized = word_tokenize(title_body)
    title_body_tokenized_filtered = [w for w in title_body_tokenized if not w in stopwords]
    title_body_tokenized_filtered_stemmed = [stemmer.stem(w) for w in title_body_tokenized_filtered]
    dataset.loc[index] = {
        'title_body': ' '.join(title_body_tokenized_filtered_stemmed),
        'category': row['Category2'].replace('\n', '')
    }

In [None]:
dataset.head()

In [None]:
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset['title_body'])

In [None]:
X = vectorizer.transform(dataset['title_body'])

In [None]:
X

In [None]:
np.random.seed(1232)

In [None]:
sample_index = np.random.randint(X.shape[0], size=1000)

In [None]:
X = X[sample_index, :]

In [None]:
X = X[:, np.random.randint(X.shape[1], size=1000)]

In [None]:
X.shape

In [None]:
le = LabelEncoder()
y = le.fit_transform(dataset['category'])

In [None]:
le.classes_

In [None]:
y = y[sample_index]

In [None]:
len(y)

In [None]:
print(np.unique(dataset['category']))

In [None]:
np.shape(X)

In [None]:
np.shape(y)

In [None]:
with open('x.txt', 'wb') as f:
    pickle.dump(X, f)

In [None]:
with open('y.txt', 'wb') as f:
    pickle.dump(y, f)

In [2]:
with open('x.txt', 'rb') as f:
    x = pickle.load(f)

In [3]:
with open('y.txt', 'rb') as f:
    y = pickle.load(f)

In [4]:
fa = FactorAnalysis(n_components=10)
fa.fit(x.toarray())
x1 = fa.transform(x.toarray())

In [5]:
np.shape(x1)

(1000, 10)

In [10]:
pca = PCA(n_components=10)
x2 = pca.fit_transform(x.toarray())

In [12]:
kpca = KernelPCA(n_components=10)
x3 = kpca.fit_transform(x.toarray())

In [14]:
umap = umap.UMAP(n_components=10)
x4 = umap.fit_transform(x.toarray())

In [15]:
x = np.concatenate([x1, x2, x3, x4], axis=1)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [18]:
svmc = svm.SVC()
svmc.fit(x_train, y_train)

In [19]:
svmc.score(x_test, y_test)

0.092