#Functions to read devset and evalset and labels conversion

In [71]:
import csv
import numpy as np

def read_devset():
    X = []
    y = []
    with open("datasets/development.csv", encoding="utf8") as dev_set:
        reader = csv.reader(dev_set)
        next(reader)
        for row in reader:
            X.append(row[0])
            y.append(row[1])
    
    return X, labels_to_ints(y)

def read_evalset():
    X = []
    with open("datasets/evaluation.csv", encoding="utf8") as eval_set:
        reader = csv.reader(eval_set)
        next(reader)
        for row in reader:
            X.append(row[0])
    
    return X
def labels_to_ints(labels):
    return np.array(list(map(lambda x : 1 if x == "pos" else 0, labels)))

def ints_to_labels(labels):
    return list(map(lambda x : "pos" if x == 1 else "neg", labels))


In [72]:
X, y = read_devset()
X_ev = read_evalset()
len(X) + len(X_ev)

41077

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True)

In [74]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

def preprocess_word(text):
    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', text) # remove URLs
    text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", "email", text) # remove email
    text = re.sub('@[^\s]+', 'at', text) # remove eventually @<word>
    text = re.sub(r'#([^\s]+)', r'\1', text) # remove eventually hashtag 
    text = re.sub(r'[#@]', " ", text) # remove eventually isolated @#
    text = re.sub("'", "' ", text)  #insert a space in every word with the <'> in order to be catched by tokenizer and filtered
    text = re.sub(r"([.]*)([0-9]+)([.]*)", r"\0 \1", text) # i.e ciao123come20va -> ciao come va
    text = re.sub("([*.\-/])", " ", text) # remove special characters from words of kind "..<word>" "***" "-<word>" "..<word>"
    return text

class LemmaTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __call__(self, text):
        lemmas = []
        regex = re.compile("[0-9]+")
        
        text = preprocess_word(text)
        for t in word_tokenize(text):
            t = t.strip()
            lemma = self.lemmatizer.lemmatize(t)
            if lemma not in string.punctuation and 3 < len(lemma) < 16 and not regex.match(lemma):
                lemmas.append(lemma)
        return lemmas


In [75]:
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords as sw
lemmaTokenizer = LemmaTokenizer()
stopwords = sw.words('italian') + ['quantum', "url", "at", "email"]
params = {
    "input": ["content"],
    "tokenizer": [lemmaTokenizer],
    "stop_words": [stopwords],
    "binary": [True],
    "min_df": [1, 2, 3],
    "ngram_range": [(1,1), (1,2), (2,2)]
}
configs = ParameterGrid(params)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizers = []
X_train_sets = []
X_test_sets = []
for conf in configs:
    vectorizer = CountVectorizer(**conf)
    X_train_sets.append(vectorizer.fit_transform(X_train, y_train))
    X_test_sets.append(vectorizer.transform(X_test))
    vectorizers.append(vectorizer)

In [77]:
for conf, vec in zip(configs, vectorizers):
    print(f"{conf['ngram_range']}: {len(vec.vocabulary_)}")
    

(1, 1): 44660
(1, 2): 712046
(2, 2): 667386
(1, 1): 23586
(1, 2): 149575
(2, 2): 125989
(1, 1): 17927
(1, 2): 81682
(2, 2): 63755


In [78]:
for vec, train, test in zip(vectorizers,X_train_sets, X_test_sets):
    print("Vocabulary size: {}".format(len(vec.vocabulary_)))
    print("X_train:\n{}".format(repr(train)))
    print("X_test: \n{}".format(repr(test)))
    

Vocabulary size: 44660
X_train:
<21565x44660 sparse matrix of type '<class 'numpy.int64'>'
	with 1154799 stored elements in Compressed Sparse Row format>
X_test: 
<7189x44660 sparse matrix of type '<class 'numpy.int64'>'
	with 378470 stored elements in Compressed Sparse Row format>
Vocabulary size: 712046
X_train:
<21565x712046 sparse matrix of type '<class 'numpy.int64'>'
	with 2420455 stored elements in Compressed Sparse Row format>
X_test: 
<7189x712046 sparse matrix of type '<class 'numpy.int64'>'
	with 620932 stored elements in Compressed Sparse Row format>
Vocabulary size: 667386
X_train:
<21565x667386 sparse matrix of type '<class 'numpy.int64'>'
	with 1265656 stored elements in Compressed Sparse Row format>
X_test: 
<7189x667386 sparse matrix of type '<class 'numpy.int64'>'
	with 242462 stored elements in Compressed Sparse Row format>
Vocabulary size: 23586
X_train:
<21565x23586 sparse matrix of type '<class 'numpy.int64'>'
	with 1133725 stored elements in Compressed Sparse Row

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grids = []
param_grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [200]}
for train, vec in zip(X_train_sets, vectorizers):
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1')
    grid.fit(train, y_train)
    grids.append(grid)
    

In [80]:
scores = list(map(lambda x: x.best_score_, grids))
print(f"Best f1 score: {np.max(scores)}")
best_params = grids[int(np.argmax(scores))].best_params_
print(f"Best parameters: {best_params}")
best_estimator = grids[int(np.argmax(scores))].best_estimator_
print(f"Best estimator: {best_estimator}")
best_vectorizer_params = vectorizers[int(np.argmax(scores))].get_params()

Best f1 score: 0.9688227146887367
Best parameters: {'C': 1, 'max_iter': 200}
Best estimator: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


FINAL TRAIN

In [81]:
best_vectorizer_params['strip_accents'] = 'unicode' #Added now because of too much computational time need to find best params
vectorizer = CountVectorizer(**best_vectorizer_params)
X_count_train = vectorizer.fit_transform(X, y)
X_count_test = vectorizer.transform(X_ev)

clf = LogisticRegression(**best_params)
clf.fit(X_count_train, y)
y_pred = clf.predict(X_count_test)


  'stop_words.' % sorted(inconsistent))


In [82]:
with open("datasets/sample_submission3.csv", "w", encoding="utf8") as sub:
    sub.write("Id,Predicted\n")
    for i, label in enumerate(ints_to_labels(y_pred)):
        sub.write(f"{i},{label}\n")
        


In [83]:
coefs = np.array(clf.coef_)
coefs = coefs.reshape((coefs.shape[1],))

features = vectorizer.get_feature_names()
sorted_feat = [features[i] for i in coefs.argsort()[::-1]]
i = 0
for feat, coef in zip(sorted_feat, np.sort(coefs)[::-1]):
    if coef > 0:
        continue
    print(f"{feat} -> {coef}")
    i += 1
    if i == 50:
        break
        

pervasi -> -1.6235575495479116e-09
ancora bagno -> -2.3364043695831975e-09
venezia rivelata -> -3.209096200218581e-09
pancake colazione -> -4.8135505428284905e-09
prelevamenti -> -6.79723861400133e-09
bagno marito -> -8.767778018830704e-09
illuminavano -> -1.1657707795309708e-08
primo utilizzo -> -1.6828240357287504e-08
mano doccia -> -2.1915432061536262e-08
camere identiche -> -2.2428297596750818e-08
avere fetta -> -2.4709969323760597e-08
gita venezia -> -2.755426372583288e-08
acqua vedi -> -2.802656808810258e-08
minuti entrare -> -3.427620905539197e-08
veramente sappiamo -> -3.4501935301296504e-08
ascensori funzionano -> -3.5345051964663546e-08
concierge leonardo -> -3.7659248997556644e-08
mezzo comunque -> -3.782004725916649e-08
favore dato -> -3.928050787968651e-08
almeno sino -> -4.057865604695144e-08
anniversario molto -> -4.2064781598391375e-08
tariffe economiche -> -4.439855293251169e-08
hotel compagnia -> -4.7059030258576766e-08
vicino vetrata -> -4.7868046674947587e-08
deside