# Functions to read the development and evaluation sets, and to convert labels

In [1]:
import csv
import numpy as np

def read_devset():
    """Load the development set from ``datasets/development.csv``.

    The first row is treated as a header and skipped. For every remaining
    row the first column is collected as the text and the second column as
    its label.

    Returns:
        A ``(X, y)`` pair where ``X`` is the list of first-column strings and
        ``y`` is the second column passed through ``labels_to_ints``.
    """
    X = []
    y = []
    # newline="" is what the csv module asks for: it lets the parser see line
    # breaks embedded in quoted fields exactly as they were written.
    with open("datasets/development.csv", encoding="utf8", newline="") as dev_set:
        reader = csv.reader(dev_set)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            X.append(row[0])
            y.append(row[1])

    return X, labels_to_ints(y)

def read_evalset():
    """Load the evaluation set from ``datasets/evaluation.csv``.

    The first row is treated as a header and skipped. Only the first column
    of every remaining row is kept.

    Returns:
        The list of first-column strings, in file order.
    """
    X = []
    # newline="" is what the csv module asks for: it lets the parser see line
    # breaks embedded in quoted fields exactly as they were written.
    with open("datasets/evaluation.csv", encoding="utf8", newline="") as eval_set:
        reader = csv.reader(eval_set)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            X.append(row[0])

    return X
def labels_to_ints(labels):
    """Encode string labels as a NumPy array of 0/1 integers.

    ``"pos"`` becomes 1; every other value becomes 0.
    """
    encoded = [int(label == "pos") for label in labels]
    return np.array(encoded)

def ints_to_labels(labels):
    """Decode numeric predictions back into label strings.

    A value equal to 1 becomes ``"pos"``; anything else becomes ``"neg"``.
    """
    return ["pos" if value == 1 else "neg" for value in labels]


In [2]:
# Load the labelled development set and the unlabelled evaluation set.
X, y = read_devset()
X_ev = read_evalset()
# Total number of documents across both sets (displayed as the cell output).
len(X) + len(X_ev)

41077

In [3]:
from sklearn.model_selection import train_test_split
# Hold out part of the development set. The fixed seed makes the shuffle, and
# therefore the resulting split, identical on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42
)

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

# Patterns are compiled once at import time and written as raw strings so the
# backslashes reach the regex engine without invalid-escape warnings.
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")
_MENTION_RE = re.compile(r'@[^\s]+')
_HASHTAG_RE = re.compile(r'#([^\s]+)')
_STRAY_MARK_RE = re.compile(r'[#@]')
_DIGITS_RE = re.compile(r"[0-9]+")
_SPECIAL_RE = re.compile(r"[*.\-/]")


def preprocess_word(text):
    """Normalise a raw document before it is tokenized.

    The steps run in this order, and the order matters (e-mail addresses
    must be handled before the generic ``@word`` rule):

    1. lower-case the text;
    2. replace URLs with the placeholder ``url``;
    3. replace e-mail addresses with the placeholder ``email``;
    4. replace ``@word`` mentions with the placeholder ``at``;
    5. drop the ``#`` of hashtags, keeping the word;
    6. turn any remaining isolated ``#`` / ``@`` into a space;
    7. add a space after every apostrophe so the tokenizer can split on it;
    8. replace digit runs with a space (``ciao123come20va`` ->
       ``ciao come va``);
    9. replace ``*``, ``.``, ``-`` and ``/`` with a space.

    Args:
        text: The document as a string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = _URL_RE.sub('url', text)
    # Not anchored with ^...$: an address has to be recognised anywhere in
    # the document, not only when it is the whole document.
    text = _EMAIL_RE.sub("email", text)
    text = _MENTION_RE.sub('at', text)
    text = _HASHTAG_RE.sub(r'\1', text)
    text = _STRAY_MARK_RE.sub(" ", text)
    text = text.replace("'", "' ")
    # A plain space is used as replacement. In a replacement template "\0"
    # is an octal escape (a NUL character), not "the whole match".
    text = _DIGITS_RE.sub(" ", text)
    text = _SPECIAL_RE.sub(" ", text)
    return text

class LemmaTokenizer(object):
    """Callable that cleans, tokenizes and lemmatizes a document.

    Calling an instance runs ``preprocess_word`` on the text, splits it with
    ``word_tokenize`` and lemmatizes every token. A lemma is kept only if it
    is 4 to 15 characters long, does not start with a digit and is not made
    up entirely of punctuation.

    NOTE(review): WordNet is an English lexical resource, while the stop-word
    list used alongside this tokenizer is Italian -- confirm the lemmatizer
    does useful work on this corpus.
    """

    # Compiled once for the class instead of on every call. It is used with
    # ``match``, so it flags lemmas that *start* with a digit.
    _LEADING_DIGITS = re.compile("[0-9]+")
    # A set gives a per-character membership test. ``lemma in
    # string.punctuation`` would be a substring test, letting tokens such as
    # "----" through.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, text):
        """Return the list of retained lemmas for ``text``."""
        lemmas = []
        for token in word_tokenize(preprocess_word(text)):
            lemma = self.lemmatizer.lemmatize(token.strip())
            if not 3 < len(lemma) < 16:
                continue
            if self._LEADING_DIGITS.match(lemma):
                continue
            if all(char in self._PUNCTUATION for char in lemma):
                continue
            lemmas.append(lemma)
        return lemmas


In [5]:
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords as sw
# A single tokenizer instance is shared by every vectorizer configuration.
lemmaTokenizer = LemmaTokenizer()

# Italian stop words plus 'quantum' and the "url" / "at" / "email"
# placeholders that preprocess_word substitutes into the text.
stopwords = [*sw.words('italian'), 'quantum', "url", "at", "email"]

# Only min_df and ngram_range actually vary; every other entry is a fixed,
# single-value choice.
min_df_choices = [1, 2, 3]
ngram_range_choices = [(1, 1), (1, 2)]
params = {
    "input": ["content"],
    "tokenizer": [lemmaTokenizer],
    "stop_words": [stopwords],
    "binary": [True],
    "min_df": min_df_choices,
    "ngram_range": ngram_range_choices,
}
configs = ParameterGrid(params)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# One fitted TF-IDF vectorizer per configuration; the transformed train and
# test matrices are stored at the same index as their vectorizer.
vectorizers, X_train_sets, X_test_sets = [], [], []
for conf in configs:
    vectorizer = TfidfVectorizer(**conf)
    # Fit on the training split only, then reuse the fitted vectorizer to
    # transform the held-out split.
    train_matrix = vectorizer.fit_transform(X_train, y_train)
    test_matrix = vectorizer.transform(X_test)
    X_train_sets.append(train_matrix)
    X_test_sets.append(test_matrix)
    vectorizers.append(vectorizer)

In [None]:
# Report the vocabulary size of every configuration. Both varying grid
# parameters are shown so that each output line identifies one configuration.
for conf, vec in zip(configs, vectorizers):
    print(
        f"ngram_range={conf['ngram_range']}, min_df={conf['min_df']}: "
        f"{len(vec.vocabulary_)}"
    )
    

In [None]:
# For every configuration, show the vocabulary size and the repr of the
# transformed train / test matrices.
for vec, train, test in zip(vectorizers, X_train_sets, X_test_sets):
    vocabulary_size = len(vec.vocabulary_)
    train_summary = repr(train)
    test_summary = repr(test)
    print("Vocabulary size: {}".format(vocabulary_size))
    print("X_train:\n{}".format(train_summary))
    print("X_test: \n{}".format(test_summary))
    

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Logistic-regression hyper-parameters explored for every feature matrix.
param_grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [200]}

# One 5-fold, F1-scored grid search per vectorizer configuration; grids[i]
# corresponds to X_train_sets[i] / vectorizers[i].
grids = []
for train, vec in zip(X_train_sets, vectorizers):
    search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1')
    grids.append(search.fit(train, y_train))  # fit() returns the search itself
    

In [None]:
# Best cross-validated score reached by each grid search.
scores = [search.best_score_ for search in grids]
print(f"Best f1 score: {np.max(scores)}")

# Index of the winning configuration, shared by `grids` and `vectorizers`.
best_index = int(np.argmax(scores))
best_grid = grids[best_index]

best_params = best_grid.best_params_
print(f"Best parameters: {best_params}")
best_estimator = best_grid.best_estimator_
print(f"Best estimator: {best_estimator}")
best_vectorizer_params = vectorizers[best_index].get_params()

FINAL TRAIN

In [None]:
from sklearn.feature_extraction.text import strip_accents_unicode

# Accent stripping is enabled only for the final model: adding it to the
# parameter search would have taken too much computation time.
best_vectorizer_params['strip_accents'] = 'unicode'

# The vectorizer strips accents from the text before tokenizing, but compares
# the resulting tokens with the stop-word list as given. Strip the stop words
# the same way so accented entries can still match.
final_stop_words = best_vectorizer_params.get('stop_words')
if final_stop_words is not None and not isinstance(final_stop_words, str):
    best_vectorizer_params['stop_words'] = [
        strip_accents_unicode(word) for word in final_stop_words
    ]

# Refit the vectorizer and the classifier on the whole development set, then
# predict the evaluation set.
vectorizer = TfidfVectorizer(**best_vectorizer_params)
X_count_train = vectorizer.fit_transform(X, y)
X_count_test = vectorizer.transform(X_ev)

clf = LogisticRegression(**best_params)
clf.fit(X_count_train, y)
y_pred = clf.predict(X_count_test)


In [None]:
# Write the predictions as "Id,Predicted" rows; Id is the position of the
# document in the evaluation set.
with open("datasets/sample_submission5.csv", "w", encoding="utf8") as sub:
    sub.write("Id,Predicted\n")
    sub.writelines(
        f"{i},{label}\n" for i, label in enumerate(ints_to_labels(y_pred))
    )
        


In [None]:
from itertools import islice

# Flatten clf.coef_ to one weight per feature (it is reshaped using its
# second dimension, i.e. a single row is expected).
coefs = np.array(clf.coef_)
coefs = coefs.reshape((coefs.shape[1],))

# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer its replacement and fall back only on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# Feature indices ordered from the largest weight to the smallest.
descending = coefs.argsort()[::-1]
sorted_feat = [features[idx] for idx in descending]

# Print the first 50 features whose weight is not positive.
# NOTE(review): because the list is in descending order, these are the
# non-positive weights closest to zero, not the most negative ones --
# confirm this is the intended view.
not_positive = (
    (feat, coef)
    for feat, coef in zip(sorted_feat, coefs[descending])
    if not coef > 0
)
for feat, coef in islice(not_positive, 50):
    print(f"{feat} -> {coef}")
        