# Nuz Klassifir

### Download NLTK packages

In [70]:
import nltk
#nltk.download()

### Training Data Collection and Preprocessing

In [71]:
import os, re
direk_folders = ['culture', 'diverse', 'economy', 'internationalNews', 'localNews', 'politic', 'society', 'sport', 'technology' ]

In [72]:
train = []
max_samples = 100

for folder_name in direk_folders:
    direk = 'train/{}/'.format(folder_name)
    current_files_names = os.listdir(direk)
    
    i = 0
    for file_name in current_files_names:
        
        # stop at max number of samples to avoid bias for some classes
        i += 1
        if i == max_samples:
            break
            
        # read file
        file_reader = open(direk + file_name, "r", encoding="utf-8")
        file = file_reader.read()

        # lowercase
        file = file.lower()
        file = file.replace('\r', ' ').replace('\n', ' ')

        # extract title text
        title = re.search(r'<title>(.*?)</(.?)title>', file)
        cleanr = re.compile('<.*?>')
        clean_title = re.sub(cleanr, '', title.group(0))
    
        # extract abstract text
        abstr = re.search(r'<abstract>(.*?)</(.?)abstract>', file)
        clean_abstr = re.sub(cleanr, '', abstr.group(0))

        # extract text of the main content
        text = re.search(r'<text>(.*?)</(.?)text>', file)
        clean_text = re.sub(cleanr, '', text.group(0))

        # remove special characters and digits
        clean_title = re.sub("(\\d|\\W)+"," ", clean_title)
        clean_abstr = re.sub("(\\d|\\W)+"," ", clean_abstr)
        clean_text = re.sub("(\\d|\\W)+"," ", clean_text)
        
        clean_input = clean_title + clean_abstr + clean_text
        train.append([clean_input, folder_name])

### Stemming

In [73]:
from nltk.stem.porter import *
from nltk import word_tokenize
stemmer = PorterStemmer()
#nltk.download('punkt')

In [74]:
for row in train:
    #tokenize
    splitted = word_tokenize(row[0])

    for i in range(0, len(splitted)):
        splitted[i] = stemmer.stem(splitted[i])
    
    row[0] = splitted

### Stop-word removal

In [75]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [76]:
stop_words = set(stopwords.words('arabic')) 
for row in train:
    x = [w for w in row[0] if not w in stop_words]
    row[0] = x

### URL removal

In [77]:
for row in train:
    splitted = row[0]
    row[0] = " ".join(splitted)
    row[0] = re.sub(r'http\S+', '', row[0])

### Model

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_sent = [line[0] for line in train]
train_labels = [line[1] for line in train]

tdif_vec = TfidfVectorizer(norm = None)
tfidf = tdif_vec.fit_transform(train_sent)

### Feed the training documents and their labels to a NB classifier

In [79]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1)
clf.fit(tfidf, train_labels)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

### Test Data Collection and Preprocessing

In [81]:
import json
json_file = open('test\posts.json', encoding='utf8')
json_reader = json.load(json_file)

user_interest = []

test = []
for user in json_reader.values():
    for article in user:
        #use the title and description as input
        dirty_input = article[0] + article[1]
        
        # lowercase and remove line breaks
        dirty_input = dirty_input.lower()
        dirty_input = dirty_input.replace('\r', ' ').replace('\n', ' ')
        
        test.append(clean_input)

# number of data provided for each user is 50, nubmer of users is 2000
test = test[:100]

### Stemming

In [82]:
for i in range(0, len(test)):
    #tokenize
    splitted = word_tokenize(test[i])

    #stem
    for j in range(0, len(splitted)):
        splitted[j] = stemmer.stem(splitted[j])
    
    test[i] = splitted

### Stop-word removal

In [83]:
for i in range(0, len(test)):
    x = [w for w in test[i] if not w in stop_words]
    test[i] = x

### URL removal

In [84]:
for i in range(0, len(test)):
    splitted = test[i]
    cell = " ".join(splitted)
    test[i] = re.sub(r'http\S+', '', cell)

### Predict the labels of the testing data

In [85]:
transformed_test = tdif_vec.transform(test)
predictions = clf.predict(transformed_test)
print(predictions)
print(test[7])

['technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technology' 'technology' 'technology' 'technology' 'technology'
 'technolo

### Calculate each user interests

In [86]:
articles_per_user = 50
interests = [{}]
user_index = 0
for i in range(0, len(predictions)):
    if predictions[i] not in interests[user_index].keys():
        interests[user_index][predictions[i]] = 0
    interests[user_index][predictions[i]] = interests[user_index][predictions[i]] + 1
    
    if i != 0 and i % articles_per_user == 0:
        user_index += 1
        interests.append({})
        
user_index = 0
for user in interests:
    user_interests = ('User {}:').format(user_index)
    for key, value in user.items():
        user_interests = ('{} {} {}%,').format(user_interests, key, (value*100.00/articles_per_user))
    user_index += 1
    print(user_interests)

User 0: technology 102.0%,
User 1: technology 98.0%,
