# Nuz Klassifir

### Download NLTK packages

In [126]:
import nltk
#nltk.download()

### Training Data Collection and Preprocessing

In [127]:
import os, re
direk_folders = ['culture', 'diverse', 'economy', 'internationalNews', 'localNews', 'politic', 'society', 'sport', 'technology' ]

In [128]:
train = []
max_samples = 10000

for folder_name in direk_folders:
    direk = 'train/{}/'.format(folder_name)
    current_files_names = os.listdir(direk)
    
    i = 0
    for file_name in current_files_names:
        
        # stop at max number of samples to avoid bias for some classes
        i += 1
        if i == max_samples:
            break
            
        # read file
        file_reader = open(direk + file_name, "r", encoding="utf-8")
        file = file_reader.read()

        # lowercase
        file = file.lower()
        file = file.replace('\r', ' ').replace('\n', ' ')

        # extract title text
        title = re.search(r'<title>(.*?)</(.?)title>', file)
        cleanr = re.compile('<.*?>')
        clean_title = re.sub(cleanr, '', title.group(0))
    
        # extract abstract text
        abstr = re.search(r'<abstract>(.*?)</(.?)abstract>', file)
        clean_abstr = re.sub(cleanr, '', abstr.group(0))

        # extract text of the main content
        text = re.search(r'<text>(.*?)</(.?)text>', file)
        clean_text = re.sub(cleanr, '', text.group(0))
     
        clean_input = clean_title + clean_abstr + clean_text
        train.append([clean_input, folder_name])

### URL, special characters, and digits removal

In [129]:
for row in train:
    row[0] = re.sub(r'http\S+', '', row[0])
    row[0] = re.sub("(\\d|\\W)+"," ", row[0])

### Stemming

In [130]:
from nltk.stem.porter import *
from nltk import word_tokenize
stemmer = PorterStemmer()
#nltk.download('punkt')

In [131]:
for row in train:
    #tokenize
    splitted = word_tokenize(row[0])

    for i in range(0, len(splitted)):
        splitted[i] = stemmer.stem(splitted[i])
    
    row[0] = splitted

### Stop-word removal

In [132]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [133]:
stop_words = set(stopwords.words('arabic')) 
for row in train:
    x = [w for w in row[0] if not w in stop_words]
    row[0] = " ".join(x)

### Feature extraction using term-frequency inverse-document-frequency

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer
from random import shuffle

split_index = int(len(train) * 0.8)

#shuffle(train)

train_sent = [line[0] for line in train]
train_labels = [line[1] for line in train]

#train_sent = [line[0] for line in train[:split_index]]
#tmp_test_sent = [line[0] for line in train[split_index:]]
#train_labels = [line[1] for line in train[:split_index]]
#tmp_test_labels = [line[1] for line in train[split_index:]]

tdif_vec = TfidfVectorizer(norm = None)
tfidf = tdif_vec.fit_transform(train_sent)

8128


### Feed the training documents and their labels to a NB classifier

In [135]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1)
clf.fit(tfidf, train_labels)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

### Model accuracy

In [137]:
#from sklearn import metrics

#tmp_transformed_test = tdif_vec.transform(tmp_test_sent)
#tmp_predictions = clf.predict(tmp_transformed_test)

#print(metrics.f1_score(tmp_test_labels, tmp_predictions, average='micro'))

0.7771765863256271


### Test Data Collection and Preprocessing

In [144]:
import json
json_file = open('test\posts.json', encoding='utf8')
json_reader = json.load(json_file)

user_interest = []

test = []
for user in json_reader.values():
    for article in user:
        #use the title and description as input
        dirty_input = article[0] + article[1]
        
        # lowercase and remove line breaks
        clean_input = dirty_input.lower()
        clean_input = dirty_input.replace('\r', ' ').replace('\n', ' ')

        test.append(clean_input)

# number of data provided for each user is 50, nubmer of users is 2000
#test = test[:100]

### URL, special characters and digits removal

In [145]:
for i in range(0, len(test)):
    test[i] = re.sub(r'http\S+', '', test[i])

    # remove special characters and digits
    test[i] = re.sub("(\\d|\\W)+"," ", test[i])

### Stemming

In [146]:
for i in range(0, len(test)):
    #tokenize
    splitted = word_tokenize(test[i])

    #stem
    for j in range(0, len(splitted)):
        splitted[j] = stemmer.stem(splitted[j])
    
    test[i] = splitted

### Stop-word removal

In [147]:
for i in range(0, len(test)):
    x = [w for w in test[i] if not w in stop_words]
    test[i] = " ".join(x)

### Predict the labels of the testing data

In [148]:
transformed_test = tdif_vec.transform(test)
predictions = clf.predict(transformed_test)

['internationalNews' 'technology' 'internationalNews' ... 'technology'
 'diverse' 'internationalNews']


### Calculate each user interests

In [149]:
articles_per_user = 50
interests = [{}]
user_index = 0
for i in range(0, len(predictions)):
    if predictions[i] not in interests[user_index].keys():
        interests[user_index][predictions[i]] = 0
    interests[user_index][predictions[i]] = interests[user_index][predictions[i]] + 1
    
    if i != 0 and i % articles_per_user == 0:
        user_index += 1
        interests.append({})
        
user_index = 0
for user in interests:
    user_interests = ('User {}:').format(user_index)
    for key, value in user.items():
        user_interests = ('{} {} {}%,').format(user_interests, key, (value*100.00/articles_per_user))
    user_index += 1
    print(user_interests)

User 0: internationalNews 10.0%, technology 22.0%, localNews 10.0%, diverse 18.0%, sport 24.0%, culture 8.0%, politic 4.0%, economy 6.0%,
User 1: culture 20.0%, technology 18.0%, economy 6.0%, localNews 6.0%, sport 18.0%, diverse 20.0%, politic 4.0%, society 4.0%, internationalNews 4.0%,
User 2: localNews 6.0%, sport 34.0%, internationalNews 12.0%, economy 8.0%, diverse 16.0%, culture 8.0%, society 2.0%, technology 12.0%, politic 2.0%,
User 3: culture 16.0%, diverse 16.0%, technology 14.0%, sport 36.0%, politic 4.0%, economy 6.0%, internationalNews 6.0%, localNews 2.0%,
User 4: sport 24.0%, localNews 8.0%, economy 12.0%, internationalNews 12.0%, technology 16.0%, culture 10.0%, diverse 14.0%, society 4.0%,
User 5: internationalNews 20.0%, culture 14.0%, localNews 6.0%, technology 22.0%, diverse 10.0%, sport 20.0%, society 6.0%, politic 2.0%,
User 6: diverse 12.0%, localNews 8.0%, technology 16.0%, culture 16.0%, sport 30.0%, economy 2.0%, internationalNews 14.0%, politic 2.0%,
User 7: 

User 1072: sport 32.0%, economy 6.0%, culture 10.0%, technology 22.0%, internationalNews 18.0%, diverse 10.0%, society 2.0%,
User 1073: technology 6.0%, sport 36.0%, internationalNews 16.0%, diverse 20.0%, economy 10.0%, culture 6.0%, society 2.0%, politic 2.0%, localNews 2.0%,
User 1074: sport 22.0%, technology 24.0%, culture 8.0%, diverse 12.0%, internationalNews 18.0%, economy 10.0%, society 2.0%, politic 4.0%,
User 1075: internationalNews 14.0%, localNews 10.0%, culture 16.0%, politic 2.0%, sport 24.0%, diverse 10.0%, economy 10.0%, technology 14.0%,
User 1076: internationalNews 14.0%, society 2.0%, technology 8.0%, diverse 14.0%, sport 26.0%, localNews 10.0%, culture 10.0%, economy 12.0%, politic 4.0%,
User 1077: sport 34.0%, diverse 32.0%, internationalNews 8.0%, culture 10.0%, technology 4.0%, localNews 6.0%, politic 2.0%, economy 4.0%,
User 1078: culture 20.0%, sport 28.0%, technology 16.0%, diverse 16.0%, localNews 4.0%, politic 6.0%, internationalNews 4.0%, economy 6.0%,
User

User 1572: technology 24.0%, economy 12.0%, diverse 20.0%, sport 24.0%, localNews 2.0%, internationalNews 14.0%, politic 2.0%, culture 2.0%,
User 1573: diverse 6.0%, culture 14.0%, internationalNews 14.0%, sport 36.0%, technology 14.0%, economy 16.0%,
User 1574: culture 16.0%, sport 22.0%, diverse 18.0%, internationalNews 10.0%, economy 10.0%, technology 20.0%, localNews 2.0%, society 2.0%,
User 1575: politic 2.0%, culture 16.0%, sport 30.0%, economy 12.0%, internationalNews 8.0%, technology 12.0%, diverse 16.0%, localNews 4.0%,
User 1576: diverse 24.0%, internationalNews 20.0%, technology 18.0%, sport 24.0%, economy 6.0%, politic 2.0%, culture 4.0%, localNews 2.0%,
User 1577: diverse 14.0%, technology 34.0%, internationalNews 12.0%, sport 30.0%, localNews 2.0%, economy 6.0%, culture 2.0%,
User 1578: internationalNews 12.0%, technology 18.0%, sport 30.0%, economy 8.0%, localNews 4.0%, politic 4.0%, culture 12.0%, diverse 10.0%, society 2.0%,
User 1579: culture 16.0%, technology 26.0%, 