In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

In [2]:
# IMPORTS
import csv
import re
import zipfile
import pickle
import os
from io import BytesIO
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp
plt.style.use('ggplot')

from joblib import Parallel, delayed

import spacy
#python -m spacy download el_core_news_sm -- rin in cmd
nlp = spacy.load("el_core_news_sm")
nlp.max_length = 3000000

import nltk
#nltk.download("punkt")
import string
from nltk import WhitespaceTokenizer

from gensim.models import KeyedVectors, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [46]:
# Read training data
train_domains = list()
y_train = list()
with open("train.txt", "r") as f:
    for line in f:
        l = line.split(",")
        train_domains.append(l[0])
        y_train.append(l[1][:-1])

# Read test data
test_domains = list()
with open("test.txt", "r") as f:
    for line in f:
        l = line.split(",")
        test_domains.append(l[0])

## Text Preprocessing 

In [4]:
def regex(word):
    '''function that removes punctuation and digits from a sequence of words and replaces them with space'''
    #website = [re.sub('^www\.', '', urlparse(i).netloc) for i in wordseq] #keep website
    word = re.sub(r'\S*https?:\S*',' ', word)  #removes website
    word = re.sub(r'\W',' ', word)  #removes punctuation
    word = re.sub(r'\d',' ', word)  #removes digits
    word = re.sub(r"[^A-Ωα-ωάΆώίέήό]", " ", word)
    word = re.sub(r"\s{2,}", " ", word)
    
    return word

def text_cleanup(tokens):
    ''' function that applies all cleaning up methods in text'''
    #GREEK STOPWORDS - list
    greek_stopwords = pd.read_csv('http://archive.aueb.gr:8085/files/stopwords.txt', 
                                  header=None, sep='\t', names=['stop'])
    greek_stopwords = greek_stopwords.stop.values.tolist()

    #replace \n, *#* characters with space and lower all letters
    tokens = tokens.replace("\n"," ").replace("*#*"," ").lower()
    
    #remove punctuation and digits
    tokens = regex(tokens)
    
    #list of words, seperated by whitespaces:
    whitespace_tk = WhitespaceTokenizer()
    tokens = whitespace_tk.tokenize(tokens)
    
    #remove stopwords
    tokens = [word for word in tokens if word not in greek_stopwords and len(word) > 2 and len(word) < 10] 

    #add lemmatizer
    nlp = spacy.load("el_core_news_sm")
    nlp.max_length = len(' '.join(tokens)) + 100
    doc = nlp(' '.join(tokens))
    tokens = [token.lemma_ if token.lemma_ != "-PRON-" else token for token in doc]
    
    tokens = [word for word in tokens if len(word) > 2 and len(word) < 10]
    return tokens

In [5]:
# TEXT DATA
# Read textual content of webpages of domain names
text = dict()
with zipfile.ZipFile("domains.zip", "r") as zfile:
    for filename in zfile.namelist():
        if re.search(r"\.zip$", filename) is not None:
            zfiledata = BytesIO(zfile.read(filename))
            with zipfile.ZipFile(zfiledata) as zfile2:
                text[filename[:-4]] = ""
                for name2 in zfile2.namelist():
                    file = zfile2.read(name2)
                    text[filename[:-4]] += file.decode("utf16") + " "

#### Text train and test 

In [6]:
#katharismos
def get_text(domains):
    data = list()
    empty = 0
    for domain in domains:
        try:
            data.append(text_cleanup(text[domain]))
        except KeyError:
            empty += 1
            data.append(['empty_str'])
    print('No text:', empty)
    return data

In [7]:
results_train = Parallel(n_jobs=6)(delayed(get_text)(train_domains[site: site+120]) for site in range(0,len(train_domains),120))
results_test = Parallel(n_jobs=6)(delayed(get_text)(test_domains[site: site+120]) for site in range(0,len(test_domains),120))

No text: 10
No text: 12
No text: 18
No text: 19
No text: 15
No text: 20
No text: 20
No text: 17
No text: 23


In [8]:
train_data = [item for sublist in results_train for item in sublist]
test_data = [item for sublist in results_test for item in sublist]

#create dataframes
train = pd.DataFrame({'domain': train_domains, 'text':train_data})
test = pd.DataFrame({'domain': test_domains, 'text':test_data})
all_text = pd.concat([train,test], axis=0).reset_index()

In [9]:
all_text.head()

Unnamed: 0,index,domain,text
0,0,kollintzas.gr,"[καταβολή, διδάκτρων, συνάντηση, δήλωση, δωρεά..."
1,1,naxos.gr,"[συνάντηση, μουσικός, παίζω, τσαμπος, τουμπάκι..."
2,2,auth.gr,"[πρόσκληση, εκδήλωση, τμήμα, πρόσκληση, εκδήλω..."
3,3,kappoutel.gr,"[προ, όντα, έχω, πρόσφατα, προσφορά, προ, όντα..."
4,4,ilioupoli.gr,"[ηλιουπολη, φίλες, φίλος, χριστος, γεννα, μήνυ..."


In [10]:
# Extract vocabulary 
# Map words to integers from 1,...,size of vocabulary
vocab = dict()
for doc in train_data:
    for term in doc:
        if term not in vocab:
            vocab[term] = len(vocab)+1

for doc in test_data:
    for term in doc:
        if term not in vocab:
            vocab[term] = len(vocab)+1

print('Index of term "εισαγωγή":', vocab['εισαγωγή'])
print("Size of the vocabulary:", len(vocab))

Index of term "εισαγωγή": 288
Size of the vocabulary: 187571


#### Doc2Vec 

In [None]:
documents = [TaggedDocument(all_text.text[i], [all_text.domain[i]]) for i in range(0,len(all_text))]
model = Doc2Vec(documents, vector_size=32, window=4, dm=1, hs=0, min_count=2, workers=8, seed=123) 

In [None]:
# Create train matrix and test matrices
X_train_d2v = np.zeros((len(train_domains), 32))
for i,domain in enumerate(train_domains):
    X_train_d2v[i,:] = model.docvecs[domain]
    
# Split training matrix (and labels) into a training and a validation matrix (and labels)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

X_test_d2v = np.zeros((len(test_domains), 32))
for i,domain in enumerate(test_domains):
    X_test_d2v[i,:] = model.docvecs[domain]

In [None]:
#SAVE RESULTS
with open('X_train_text_d2v', 'wb') as f:
    pickle.dump(X_train_d2v, f)
    
with open('X_test_text_d2v', 'wb') as f:
    pickle.dump(X_test_d2v, f)

### Label Encoding

In [14]:
# Compute average length of documents
mean_size_train = np.mean([len(doc) for doc in train_data])
mean_size_test = np.mean([len(doc) for doc in test_data])
print('Average size of training data', mean_size_train)
print('Average size of test data', mean_size_test)

# Compute longest documents
max_size_train = np.max([len(doc) for doc in train_data])
max_size_test = np.max([len(doc) for doc in test_data])
max_size = max(max_size_train, max_size_test)

print('Length of longest document:', max_size)

Average size of training data 4541.723370429253
Average size of test data 4127.020109689214
Length of longest document: 359293


In [15]:
# Set maximum length equal to 1000
max_size = 1000
# Create training matrix - label encoding
X_train = np.zeros((len(train_data), max_size))
for i,doc in enumerate(train_data):
    for j,word in enumerate(doc):
        if j == max_size:
            break
        X_train[i,j] = vocab[word]

# Create test matrix
X_test = np.zeros((len(test_data), max_size))
for i,doc in enumerate(test_data):
    for j,word in enumerate(doc):
        if j == max_size:
            break
        X_test[i,j] = vocab[word]

print("Train matrix dimensionality: ", X_train.shape)
#print("Validation matrix dimensionality: ", X_val.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (1258, 1000)
Test matrix dimensionality:  (547, 1000)


In [16]:
#Save cleaned data to pickle file
with open('X_train_text', 'wb') as f:
    pickle.dump(X_train, f)

with open('X_test_text', 'wb') as f:
    pickle.dump(X_test, f)
    
with open('vocab', 'wb') as f:
    pickle.dump(vocab, f)

### Tune Classifier with Doc2Vec

In [47]:
# Split training matrix (and labels) into a training and a validation matrix (and labels)
X_train, X_val, y_train, y_val = train_test_split(X_train_d2v, y_train, test_size=0.2, random_state=123)

In [48]:
#GRID SEARCH
clf = LogisticRegression(max_iter=10000)
parameters = [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
              {'penalty':['none', 'l2']},
              {'C':[0.001, 0.01, 0.1, 1, 10, 100]}]

grid_search = GridSearchCV(estimator = clf,  
                           param_grid = parameters,
                           scoring = 'f1_weighted',
                           cv = 5,
                           verbose=0)

grid_search.fit(X_train, y_train)
y_pred = grid_search.predict_proba(X_val)

In [49]:
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(np.array(list(map(int, y_val))), np.argmax(y_pred, axis=1)))
confusion_matrix(np.array(list(map(int, y_val))), np.argmax(y_pred, axis=1))

              precision    recall  f1-score   support

           0       0.83      0.70      0.76        64
           1       0.83      0.68      0.75        28
           2       0.68      0.57      0.62        37
           3       0.61      0.86      0.72        79
           4       0.67      0.55      0.60        11
           5       0.50      0.60      0.55         5
           6       0.57      0.31      0.40        13
           7       0.33      0.50      0.40         4
           8       1.00      0.20      0.33         5
           9       0.25      0.17      0.20         6

    accuracy                           0.67       252
   macro avg       0.63      0.51      0.53       252
weighted avg       0.69      0.67      0.67       252



array([[45,  1,  2, 14,  0,  0,  1,  0,  0,  1],
       [ 1, 19,  0,  8,  0,  0,  0,  0,  0,  0],
       [ 5,  0, 21,  7,  1,  0,  0,  2,  0,  1],
       [ 1,  2,  5, 68,  0,  1,  1,  1,  0,  0],
       [ 0,  0,  1,  3,  6,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  1,  1,  3,  0,  0,  0,  0],
       [ 1,  0,  0,  4,  1,  2,  4,  0,  0,  1],
       [ 0,  0,  0,  2,  0,  0,  0,  2,  0,  0],
       [ 0,  0,  1,  2,  0,  0,  0,  1,  1,  0],
       [ 1,  1,  1,  2,  0,  0,  0,  0,  0,  1]])

### Final Classifier, Score: 1.10

In [20]:
all_train = np.vstack((X_train, X_val))
all_y = np.concatenate((y_train,y_val))

In [21]:
# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(**grid_search.best_params_, max_iter=10000)
clf.fit(all_train, all_y)
y_pred = clf.predict_proba(X_test_d2v)

In [22]:
y_pred.argmax(1)

array([6, 2, 1, 3, 6, 3, 0, 6, 3, 1, 1, 3, 0, 0, 2, 0, 3, 0, 4, 3, 0, 0,
       9, 3, 4, 3, 1, 1, 3, 3, 1, 2, 0, 2, 3, 3, 3, 3, 0, 2, 2, 6, 3, 0,
       3, 0, 2, 1, 0, 0, 3, 0, 1, 3, 0, 3, 3, 1, 0, 2, 1, 0, 4, 2, 0, 9,
       3, 3, 3, 0, 3, 0, 1, 2, 3, 1, 4, 2, 3, 7, 2, 0, 2, 4, 9, 0, 3, 0,
       6, 0, 0, 0, 0, 5, 0, 0, 4, 4, 0, 3, 2, 3, 4, 3, 3, 2, 3, 5, 5, 5,
       2, 4, 2, 3, 0, 0, 0, 0, 0, 2, 3, 0, 3, 7, 0, 1, 3, 9, 2, 2, 2, 3,
       0, 2, 8, 3, 2, 3, 1, 3, 6, 3, 3, 0, 4, 2, 3, 3, 1, 1, 1, 1, 3, 3,
       3, 5, 3, 0, 3, 4, 2, 0, 3, 3, 3, 3, 3, 2, 1, 3, 0, 7, 3, 0, 0, 5,
       0, 3, 0, 3, 0, 3, 0, 2, 7, 0, 3, 0, 3, 5, 3, 6, 9, 1, 2, 3, 4, 0,
       2, 2, 6, 0, 5, 3, 3, 3, 3, 2, 3, 0, 0, 7, 3, 2, 2, 4, 3, 0, 3, 7,
       3, 0, 3, 0, 2, 0, 0, 3, 0, 1, 3, 0, 0, 0, 3, 2, 3, 7, 0, 3, 3, 0,
       1, 1, 0, 0, 3, 3, 2, 0, 0, 0, 2, 0, 3, 0, 4, 2, 2, 3, 0, 3, 0, 3,
       3, 3, 1, 3, 3, 2, 3, 3, 2, 0, 1, 3, 0, 1, 1, 3, 1, 3, 0, 3, 3, 0,
       1, 3, 2, 0, 2, 2, 4, 3, 5, 0, 3, 3, 0, 3, 0,

In [23]:
# Write predictions to a file
with open('sample_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(10):
        lst.append('class_'+str(i))
    lst.insert(0, "domain_name")
    writer.writerow(lst)
    for i,test_host in enumerate(test_domains):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)