In [16]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [17]:
df=pd.read_csv('D:/all_topics.csv',encoding='utf-8')
df.head()

Unnamed: 0,text,label
0,سحبت شركة غوغل في خطوة مفاجئة خدمة يوتيوب الشه...,Tech
1,عاد لاعب باريس سان جيرمان البرازيلي نيمار دا ...,Sports
2,أخبارنا المغربية ـ متابعة في أول رد للبرلمان...,Politics
3,شيرين تتهرب من المؤتمر الصحفي لمهرجان تطوان و ...,Culture
4,منظمو أولمبياد طوكيو 2020 يطالبون بالتبرع باله...,Sports


In [18]:
#preprocessing steps (Normalization of data)
def normalization(txt):
    #Tokenization
    tokenText = txt.split(' ')
    
    #StopWord Removing
    SWList = []
    stopWordList = open(r"D:/New/stopwords.txt" , encoding='utf-8').read().splitlines()
    for token in tokenText:
        if token not in stopWordList:
            SWList.append(token)
    SWText = ' '.join(SWList)
    
    # Punctuations Removing
    import unicodedata as ud
    puncText = ''.join(c for c in SWText if not ud.category(c).startswith('P'))
    
    # ISRI Stemmer (Root-based stemmer)
    import nltk
    st = nltk.ISRIStemmer()
    tkText = puncText.split(' ')
    result = ' '.join([st.stem(w) for w in tkText])
    
    return result

In [19]:
df['text'].apply(normalization)
df.head()

Unnamed: 0,text,label
0,سحبت شركة غوغل في خطوة مفاجئة خدمة يوتيوب الشه...,Tech
1,عاد لاعب باريس سان جيرمان البرازيلي نيمار دا ...,Sports
2,أخبارنا المغربية ـ متابعة في أول رد للبرلمان...,Politics
3,شيرين تتهرب من المؤتمر الصحفي لمهرجان تطوان و ...,Culture
4,منظمو أولمبياد طوكيو 2020 يطالبون بالتبرع باله...,Sports


In [None]:
#Feature Extraction

In [20]:
# Features Extraction by TF-IDF
tfidf_vec=TfidfVectorizer()
tfidf_features=tfidf_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(tfidf_features,df['label'],test_size=0.2)

In [21]:
print(x_train.shape)
print(x_test.shape)

(62742, 466853)
(15686, 466853)


In [22]:
# classifier
tfidf_model=MultinomialNB()
tfidf_model.fit(x_train,y_train)

MultinomialNB()

In [23]:
#Testing data
tfidf_predected=tfidf_model.predict(x_test)

accuracy_score(y_test, tfidf_predected)

0.8760678311870458

In [24]:
cm = confusion_matrix(y_test,tfidf_predected)
print(cm)

[[ 528   15   17  587    1  140   41]
 [   2 1163   12  581    1   23   33]
 [   0    6 2533    8    3    1   43]
 [   0   43    5 2737    0    7    3]
 [   0    1   43   94 1406   17   11]
 [   0    1    2   49    2 3028    3]
 [   1    5  103   33    0    7 2347]]


In [26]:
# Features Extraction by Binary Encoding
binary_vec=CountVectorizer(binary=True)
binary_features=binary_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(binary_features,df['label'],test_size=0.2)

In [27]:
# classefier
binary_model=MultinomialNB()
binary_model.fit(x_train,y_train)

MultinomialNB()

In [28]:
# testing data
binary_predicted=binary_model.predict(x_test)
accuracy_score(y_test, binary_predicted)

0.9257937013897743

In [29]:
print(confusion_matrix(binary_predicted,y_test))

[[1140   12    0   18   26   16   11]
 [  41 1496   18  126    3   15   20]
 [   8    6 2521    3   25    3   90]
 [ 158  266    3 2602   34   69   10]
 [   9    1    4    3 1385    1    3]
 [  25   11    3    3    9 3095    5]
 [  18   23   50    1    8    6 2283]]


In [30]:
# Features Extraction by Counting
count_vec=CountVectorizer()
count_features=count_vec.fit_transform(df['text'])

#split data into train and test
x_train,x_test,y_train,y_test=train_test_split(count_features,df['label'],test_size=0.2)

In [31]:
# classefier
count_model=MultinomialNB()
count_model.fit(x_train,y_train)

MultinomialNB()

In [32]:
count_predicted=count_model.predict(x_test)
accuracy_score(y_test, count_predicted)

0.9264949636618641

In [33]:
print(confusion_matrix(count_predicted,y_test))

[[1115   13    6   19   22   16    9]
 [  44 1555   15  162    4   12   36]
 [   8    5 2500    0   18    5   93]
 [ 125  299    8 2576   23   38    9]
 [   5    0    5    4 1447    1    2]
 [  24   11    5    5    8 3113    2]
 [  21   10   49    3    3    6 2227]]


In [34]:
#The Best Way of Feature Extraction  For Topic Classification is Counting

In [13]:
def remove_stopwords(text):
    file=open('D:/New/stopwords.txt','r',encoding='utf-8')
    ss=file.read()
    stop_words=ss.split()
    results=[]
    for word in text.split():
        if word not in stop_words:
            results.append(word)
    return " ".join(results)