#Functions to read devset and evalset and labels conversion

In [1]:
import csv
import numpy as np

def read_devset():
    X = []
    y = []
    with open("datasets/development.csv", encoding="utf8") as dev_set:
        reader = csv.reader(dev_set)
        next(reader)
        for row in reader:
            X.append(row[0])
            y.append(row[1])
    
    return X, labels_to_ints(y)

def read_evalset():
    X = []
    with open("datasets/evaluation.csv", encoding="utf8") as eval_set:
        reader = csv.reader(eval_set)
        next(reader)
        for row in reader:
            X.append(row[0])
    
    return X
def labels_to_ints(labels):
    return np.array(list(map(lambda x : 1 if x == "pos" else 0, labels)))

def ints_to_labels(labels):
    return list(map(lambda x : "pos" if x == 1 else "neg", labels))


In [2]:
X, y = read_devset()
X_ev = read_evalset()
len(X) + len(X_ev)

41077

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True)

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

def preprocess_word(text):
    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', text) # remove URLs
    text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", "email", text) # remove email
    text = re.sub('@[^\s]+', 'at', text) # remove eventually @<word>
    text = re.sub(r'#([^\s]+)', r'\1', text) # remove eventually hashtag 
    text = re.sub(r'[#@]', " ", text) # remove eventually isolated @#
    text = re.sub("'", "' ", text)  #insert a space in every word with the <'> in order to be catched by tokenizer and filtered
    text = re.sub(r"([.]*)([0-9]+)([.]*)", r"\0 \1", text) # i.e ciao123come20va -> ciao come va
    text = re.sub("([*.\-/+])", " ", text) # remove special characters from words of kind "..<word>" "***" "-<word>" "..<word>"
    return text

class LemmaTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __call__(self, text):
        lemmas = []
        regex = re.compile("[0-9]+")
        
        text = preprocess_word(text)
        for t in word_tokenize(text):
            t = t.strip()
            lemma = self.lemmatizer.lemmatize(t)
            if lemma not in string.punctuation and 3 < len(lemma) < 16 and not regex.match(lemma):
                lemmas.append(lemma)
        return lemmas


In [5]:
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords as sw
lemmaTokenizer = LemmaTokenizer()
stopwords = sw.words('italian') + ['quantum', "url", "at", "email"]
params = {
    "input": ["content"],
    "tokenizer": [lemmaTokenizer],
    "stop_words": [stopwords],
    "binary": [True],
    "min_df": [1, 2, 3],
    "ngram_range": [(1,1), (1,2), (2,2)]
}
configs = ParameterGrid(params)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizers = []
X_train_sets = []
X_test_sets = []
for conf in configs:
    vectorizer = TfidfVectorizer(**conf)
    X_train_sets.append(vectorizer.fit_transform(X_train, y_train))
    X_test_sets.append(vectorizer.transform(X_test))
    vectorizers.append(vectorizer)

In [7]:
for conf, vec in zip(configs, vectorizers):
    print(f"{conf['ngram_range']}: {len(vec.vocabulary_)}")
    

(1, 1): 44739
(1, 2): 711840
(2, 2): 667101
(1, 1): 23612
(1, 2): 149949
(2, 2): 126337
(1, 1): 17947
(1, 2): 81908
(2, 2): 63961


In [8]:
for vec, train, test in zip(vectorizers,X_train_sets, X_test_sets):
    print("Vocabulary size: {}".format(len(vec.vocabulary_)))
    print("X_train:\n{}".format(repr(train)))
    print("X_test: \n{}".format(repr(test)))
    

Vocabulary size: 44739
X_train:
<21565x44739 sparse matrix of type '<class 'numpy.float64'>'
	with 1156264 stored elements in Compressed Sparse Row format>
X_test: 
<7189x44739 sparse matrix of type '<class 'numpy.float64'>'
	with 377118 stored elements in Compressed Sparse Row format>
Vocabulary size: 711840
X_train:
<21565x711840 sparse matrix of type '<class 'numpy.float64'>'
	with 2423383 stored elements in Compressed Sparse Row format>
X_test: 
<7189x711840 sparse matrix of type '<class 'numpy.float64'>'
	with 618107 stored elements in Compressed Sparse Row format>
Vocabulary size: 667101
X_train:
<21565x667101 sparse matrix of type '<class 'numpy.float64'>'
	with 1267119 stored elements in Compressed Sparse Row format>
X_test: 
<7189x667101 sparse matrix of type '<class 'numpy.float64'>'
	with 240989 stored elements in Compressed Sparse Row format>
Vocabulary size: 23612
X_train:
<21565x23612 sparse matrix of type '<class 'numpy.float64'>'
	with 1135137 stored elements in Compres

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grids = []
param_grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [200]}
for train, vec in zip(X_train_sets, vectorizers):
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1')
    grid.fit(train, y_train)
    grids.append(grid)
    

In [10]:
scores = list(map(lambda x: x.best_score_, grids))
print(f"Best f1 score: {np.max(scores)}")
best_params = grids[int(np.argmax(scores))].best_params_
print(f"Best parameters: {best_params}")
best_estimator = grids[int(np.argmax(scores))].best_estimator_
print(f"Best estimator: {best_estimator}")
best_vectorizer_params = vectorizers[int(np.argmax(scores))].get_params()

Best f1 score: 0.9721007920437318
Best parameters: {'C': 10, 'max_iter': 200}
Best estimator: LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


FINAL TRAIN

In [11]:
best_vectorizer_params['strip_accents'] = 'unicode' #Added now because of too much computational time need to find best params
vectorizer = TfidfVectorizer(**best_vectorizer_params)
X_count_train = vectorizer.fit_transform(X, y)
X_count_test = vectorizer.transform(X_ev)

clf = LogisticRegression(**best_params)
clf.fit(X_count_train, y)
y_pred = clf.predict(X_count_test)


  'stop_words.' % sorted(inconsistent))


In [12]:
with open("datasets/sample_submission4.csv", "w", encoding="utf8") as sub:
    sub.write("Id,Predicted\n")
    for i, label in enumerate(ints_to_labels(y_pred)):
        sub.write(f"{i},{label}\n")
        


In [13]:
coefs = np.array(clf.coef_)
coefs = coefs.reshape((coefs.shape[1],))

features = vectorizer.get_feature_names()
sorted_feat = [features[i] for i in coefs.argsort()[::-1]]
i = 0
for feat, coef in zip(sorted_feat, np.sort(coefs)[::-1]):
    if coef > 0:
        continue
    print(f"{feat} -> {coef}")
    i += 1
    if i == 50:
        break
        

preso doccia -> -6.2700007994435735e-06
sveglia mattina -> -2.5728052909494025e-05
futuro sicuramente -> -2.6986929333871216e-05
casa sicuramente -> -3.7173314086693324e-05
soggiorno parte -> -3.7552740708472045e-05
superior prezzo -> -4.604393596280508e-05
stanza trovava -> -4.712287184590667e-05
euro tariffa -> -5.003282150786284e-05
citta italiana -> -6.088619205873695e-05
firenze hotel -> -6.258133841101989e-05
indicato bambino -> -6.99975026400312e-05
sempre alloggiato -> -7.709341079784525e-05
camera entro -> -7.946094534968798e-05
essi andati -> -8.080741093211187e-05
chiesto spostare -> -8.189645793903221e-05
spaziose pulizia -> -8.313094674195736e-05
tornare visitare -> -8.71080545846913e-05
trovato senza -> -8.799947027786401e-05
negozi abbigliamento -> -8.813827939710142e-05
varie docce -> -9.15095453673666e-05
stata spenta -> -9.23625295261589e-05
sottolineare presenza -> -9.53982905016611e-05
fino quasi -> -0.00010952620649942399
pulita arrivo -> -0.0001259577731009518
sta