#Functions to read devset and evalset and labels conversion

In [19]:
import csv
import numpy as np

def read_devset():
    X = []
    y = []
    with open("datasets/development.csv", encoding="utf8") as dev_set:
        reader = csv.reader(dev_set)
        next(reader)
        for row in reader:
            X.append(row[0])
            y.append(row[1])
    
    return X, labels_to_ints(y)

def read_evalset():
    X = []
    with open("datasets/evaluation.csv", encoding="utf8") as eval_set:
        reader = csv.reader(eval_set)
        next(reader)
        for row in reader:
            X.append(row[0])
    
    return X
def labels_to_ints(labels):
    return np.array(list(map(lambda x : 1 if x == "pos" else 0, labels)))

def ints_to_labels(labels):
    return list(map(lambda x : "pos" if x == 1 else "neg", labels))


In [20]:
X, y = read_devset()
X_ev = read_evalset()
len(X) + len(X_ev)

41077

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True)

In [22]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

def preprocess_word(text):
    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', text) # remove URLs
    text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", "email", text) # remove email
    text = re.sub('@[^\s]+', 'at', text) # remove eventually @<word>
    text = re.sub(r'#([^\s]+)', r'\1', text) # remove eventually hashtag 
    text = re.sub(r'[#@]', " ", text) # remove eventually isolated @#
    text = re.sub("'", "' ", text)  #insert a space in every word with the <'> in order to be catched by tokenizer and filtered
    text = re.sub(r"([.]*)([0-9]+)([.]*)", r"\0 \1", text) # i.e ciao123come20va -> ciao come va
    text = re.sub("([*.\-/+])", " ", text) # remove special characters from words of kind "..<word>" "***" "-<word>" "..<word>"
    return text

class LemmaTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __call__(self, text):
        lemmas = []
        regex = re.compile("[0-9]+")
        
        text = preprocess_word(text)
        for t in word_tokenize(text):
            t = t.strip()
            lemma = self.lemmatizer.lemmatize(t, )
            if lemma not in string.punctuation and 3 < len(lemma) < 16 and not regex.match(lemma):
                lemmas.append(lemma)
        return lemmas


In [23]:
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords as sw
lemmaTokenizer = LemmaTokenizer()
stopwords = sw.words('italian') + ['quantum', "url", "at", "email", 'avra', 'avro', 'fara', 'faro', 'perche', 'sara', 'saro', 'stara', 'staro']
params = {
    "input": ["content"],
    "tokenizer": [lemmaTokenizer],
    "stop_words": [stopwords],
    "binary": [True],
    "min_df": [1, 2, 3],
    "ngram_range": [(1,1), (1,2)]
}
configs = ParameterGrid(params)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizers = []
X_train_sets = []
X_test_sets = []
for conf in configs:
    vectorizer = TfidfVectorizer(**conf)
    X_train_sets.append(vectorizer.fit_transform(X_train, y_train))
    X_test_sets.append(vectorizer.transform(X_test))
    vectorizers.append(vectorizer)

In [25]:
for conf, vec in zip(configs, vectorizers):
    print(f"{conf['ngram_range']}: {len(vec.vocabulary_)}")
    

(1, 1): 44725
(1, 2): 712294
(1, 1): 23620
(1, 2): 149596
(1, 1): 17900
(1, 2): 81779


In [26]:
for vec, train, test in zip(vectorizers,X_train_sets, X_test_sets):
    print("Vocabulary size: {}".format(len(vec.vocabulary_)))
    print("X_train:\n{}".format(repr(train)))
    print("X_test: \n{}".format(repr(test)))
    

Vocabulary size: 44725
X_train:
<21565x44725 sparse matrix of type '<class 'numpy.float64'>'
	with 1155377 stored elements in Compressed Sparse Row format>
X_test: 
<7189x44725 sparse matrix of type '<class 'numpy.float64'>'
	with 377769 stored elements in Compressed Sparse Row format>
Vocabulary size: 712294
X_train:
<21565x712294 sparse matrix of type '<class 'numpy.float64'>'
	with 2421049 stored elements in Compressed Sparse Row format>
X_test: 
<7189x712294 sparse matrix of type '<class 'numpy.float64'>'
	with 620489 stored elements in Compressed Sparse Row format>
Vocabulary size: 23620
X_train:
<21565x23620 sparse matrix of type '<class 'numpy.float64'>'
	with 1134272 stored elements in Compressed Sparse Row format>
X_test: 
<7189x23620 sparse matrix of type '<class 'numpy.float64'>'
	with 374029 stored elements in Compressed Sparse Row format>
Vocabulary size: 149596
X_train:
<21565x149596 sparse matrix of type '<class 'numpy.float64'>'
	with 1858351 stored elements in Compress

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grids = []
param_grid = {'C': [0.1, 1, 10], 'max_iter': [200], 'multi_class': ['ovr'], 'n_jobs': [4]}
for train, vec in zip(X_train_sets, vectorizers):
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1')
    grid.fit(train, y_train)
    grids.append(grid)
    

In [28]:
scores = list(map(lambda x: x.best_score_, grids))
print(f"Best f1 score: {np.max(scores)}")
best_params = grids[int(np.argmax(scores))].best_params_
print(f"Best parameters: {best_params}")
best_estimator = grids[int(np.argmax(scores))].best_estimator_
print(f"Best estimator: {best_estimator}")
best_vectorizer_params = vectorizers[int(np.argmax(scores))].get_params()

Best f1 score: 0.9729723907261889
Best parameters: {'C': 10000, 'class_weight': 'balanced', 'max_iter': 200, 'multi_class': 'ovr', 'n_jobs': 4}
Best estimator: LogisticRegression(C=10000, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=200, multi_class='ovr', n_jobs=4, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


FINAL TRAIN

In [29]:
best_vectorizer_params['strip_accents'] = 'unicode' #Added now because of too much computational time need to find best params
vectorizer = TfidfVectorizer(**best_vectorizer_params)
X_tfidf_train = vectorizer.fit_transform(X, y)
X_tfidf_test = vectorizer.transform(X_ev)

clf = LogisticRegression(**best_params)
clf.fit(X_tfidf_train, y)
y_pred = clf.predict(X_tfidf_test)


In [30]:
with open("datasets/sample_submission5.csv", "w", encoding="utf8") as sub:
    sub.write("Id,Predicted\n")
    for i, label in enumerate(ints_to_labels(y_pred)):
        sub.write(f"{i},{label}\n")
        


In [31]:
coefs = np.array(clf.coef_)
coefs = coefs.reshape((coefs.shape[1],))

features = vectorizer.get_feature_names()
sorted_feat = [features[i] for i in coefs.argsort()[::-1]]
i = 0
for feat, coef in zip(sorted_feat, np.sort(coefs)[::-1]):
    if coef > 0:
        continue
    print(f"{feat} -> {coef}")
    i += 1
    if i == 50:
        break
        

vicino albergo -> -4.7261723251191894e-07
accessibile senza -> -2.199351485168593e-06
ombrellone sdraie -> -3.390571892761249e-06
entrando sembra -> -3.999289014970257e-06
solo piatto -> -4.205552414508802e-06
ovunque corridoi -> -5.569947424646523e-06
supermercato minuti -> -6.149896657216097e-06
volta primo -> -6.310632066563461e-06
molto arrivo -> -7.105661484683297e-06
aver messo -> -7.671802631618635e-06
poterci subito -> -7.835981614082526e-06
vacanza magnifico -> -8.319558067071546e-06
aria pieni -> -8.409866374325661e-06
godere terrazza -> -9.88068343798491e-06
vacanza altro -> -1.2156419119962742e-05
centrale realta -> -1.2196043207366587e-05
direzione mare -> -1.231798143004878e-05
camera camerieri -> -1.2324860406328007e-05
rilassante molto -> -1.2799416310149921e-05
quantita offerta -> -1.2960210613778973e-05
inviare alcuni -> -1.3381840153757816e-05
meraviglioso situato -> -1.5598784920740115e-05
foglietto -> -1.5717029181131387e-05
qualita connessione -> -1.60382440684584