In [1]:
import numpy as np
import pandas as pd

In [2]:
f = open('persica.csv', "r", encoding='utf-8')
lines = f.readlines()

title = []
text=[]
category2=[]

for i,line in enumerate(lines):
    if i%7 == 1:
        title.append(line.strip())
    elif i%7 == 2:
        text.append(line.strip())
    elif i%7 == 6:
        category2.append(line.strip())

df = pd.DataFrame()
df['title'] = title
df['text'] = text
df['input_text'] = df['title'] + df['text']
df['category'] = category2

In [3]:
df['category'].unique()

array(['آموزشي', 'اجتماعي', 'تاريخي', 'اقتصادي', 'بهداشتي', 'علمي',
       'سياسي', 'فرهنگي', 'فقه و حقوق', 'مذهبي', 'ورزشي'], dtype=object)

In [4]:
df.head()

Unnamed: 0,title,text,input_text,category
0,وزير علوم درجمع استادان نمونه: سن بازنشستگي اس...,وزير علوم در جمع استادان نمونه كشور گفت: از اس...,وزير علوم درجمع استادان نمونه: سن بازنشستگي اس...,آموزشي
1,گردهمايي دانش‌آموختگان موسسه آموزش عالي سوره ب...,به گزارش سرويس صنفي آموزشي خبرگزاري دانشجويان ...,گردهمايي دانش‌آموختگان موسسه آموزش عالي سوره ب...,آموزشي
2,نتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور ا...,نتايج آزمون دوره‌هاي فراگير مقاطع كارشناسي و ك...,نتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور ا...,آموزشي
3,همايش يكروزه آسيب شناسي مفهوم روابط عمومي در ب...,",",همايش يكروزه آسيب شناسي مفهوم روابط عمومي در ب...,اجتماعي
4,وضعيت اقتصادي و ميزان تحصيلات والدين از مهمتري...,محمدتقي علوي يزدي، مجري اين طرح پژوهشي در اين‌...,وضعيت اقتصادي و ميزان تحصيلات والدين از مهمتري...,آموزشي


In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df['input_text'], df['category'], test_size=0.2, random_state=1)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import Normalizer,MinMaxScaler


count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

lsa = TruncatedSVD(100)
x_train_lsa = lsa.fit_transform(x_train_tfidf)
scaler1 = Normalizer()
x_train_lsa = scaler1.fit_transform(x_train_lsa)
scaler2 = MinMaxScaler()
x_train_lsa = scaler2.fit_transform(x_train_lsa)

print(x_train_counts.shape)
print(x_train_lsa.shape)

(8799, 60631)
(8799, 100)


## Naive Bayes

In [7]:
clf_Naive_Bayes = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('svd', TruncatedSVD(100)),
                            ('scaler1', Normalizer()),
                            ('scaler2', MinMaxScaler()),
                            ('clf', MultinomialNB()),])
clf_Naive_Bayes.fit(x_train, y_train)
y_predicted = clf_Naive_Bayes.predict(x_test)
precision_recall_fscore_support(y_predicted, y_test, average='macro')

(0.7713532398333658, 0.77768886966399, 0.7711927078241542, None)

In [8]:
from sklearn.model_selection import GridSearchCV

parameters = {'svd__n_components': [100,200,300],}
gs_clf = GridSearchCV(clf_Naive_Bayes, parameters)
gs_clf = gs_clf.fit(x_train, y_train)
print(gs_clf.best_score_)
gs_clf.best_params_

0.8058853170706495


{'svd__n_components': 300}

## Perceptron

In [9]:
from sklearn.linear_model import Perceptron

In [10]:
clf_Perceptron = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('svd', TruncatedSVD(100)),
                            ('scaler', Normalizer()),
                            ('clf', Perceptron()),])
clf_Perceptron.fit(x_train, y_train)
y_predicted = clf_Perceptron.predict(x_test)
precision_recall_fscore_support(y_predicted, y_test, average='macro')

(0.7605282352126398, 0.7831978926029959, 0.7565712095455562, None)

In [11]:
parameters = {'svd__n_components': [100,200,300],}
gs_clf = GridSearchCV(clf_Perceptron, parameters)
gs_clf = gs_clf.fit(x_train, y_train)
print(gs_clf.best_score_)
gs_clf.best_params_

0.7852047909452685


{'svd__n_components': 300}

## SVM

In [12]:
from sklearn.svm import SVC

In [13]:
clf_svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('svd', TruncatedSVD(100)),
                    ('scaler', Normalizer()),
                    ('clf', SVC(kernel='rbf',random_state=42)),])
clf_svm.fit(x_train, y_train)
y_predicted = clf_svm.predict(x_test)
precision_recall_fscore_support(y_predicted, y_test, average='macro')

(0.8230442980895458, 0.8202878986757439, 0.8199557325034337, None)

In [14]:
parameters = {'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'svd__n_components': [200,300],}
gs_clf = GridSearchCV(clf_svm, parameters)
gs_clf = gs_clf.fit(x_train, y_train)
print(gs_clf.best_score_)
gs_clf.best_params_

0.8366852938136337


{'clf__kernel': 'rbf', 'svd__n_components': 300}