#Functions to read devset and evalset and labels conversion

In [1]:
import csv
import numpy as np

def read_devset():
    X = []
    y = []
    with open("datasets/development.csv", encoding="utf8") as dev_set:
        reader = csv.reader(dev_set)
        next(reader)
        for row in reader:
            X.append(row[0])
            y.append(row[1])
    
    return X, labels_to_ints(y)

def read_evalset():
    X = []
    with open("datasets/evaluation.csv", encoding="utf8") as eval_set:
        reader = csv.reader(eval_set)
        next(reader)
        for row in reader:
            X.append(row[0])
    
    return X
def labels_to_ints(labels):
    return np.array(list(map(lambda x : 1 if x == "pos" else 0, labels)))

def ints_to_labels(labels):
    return list(map(lambda x : "pos" if x == 1 else "neg", labels))


In [2]:
X, y = read_devset()
X_ev = read_evalset()
len(X) + len(X_ev)

41077

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True)

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

def preprocess_word(text):
    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', text) # remove URLs
    text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", "email", text)
    text = re.sub('@[^\s]+', 'at_user', text) # remove usernames
    text = re.sub(r'#([^\s]+)', r'\1', text) # remove the # in #hashtag
    text = re.sub("'", "' ", text)
    return text

class LemmaTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    def __call__(self, text):
        lemmas = []
        regex = re.compile("[0-9]+")
        
        text = preprocess_word(text)
        for t in word_tokenize(text):
            t = t.strip()
            lemma = self.lemmatizer.lemmatize(t)
            if lemma not in string.punctuation and 3 < len(lemma) < 16 and not regex.match(lemma) and "@" not in lemma and "#" not in lemma:
                lemmas.append(lemma)
        return lemmas


In [5]:
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords as sw
lemmaTokenizer = LemmaTokenizer()
stopwords = sw.words('italian') + ['quantum', "url", "at_user", "email"]
params = {
    "input": ["content"],
    "tokenizer": [lemmaTokenizer],
    "stop_words": [stopwords],
    "binary": [True],
    "min_df": [3, 5],
    "ngram_range": [(1,1), (1,2), (2,2)]
}
configs = ParameterGrid(params)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizers = []
X_train_sets = []
X_test_sets = []
for conf in configs:
    vectorizer = CountVectorizer(**conf)
    X_train_sets.append(vectorizer.fit_transform(X_train, y_train))
    X_test_sets.append(vectorizer.transform(X_test))
    vectorizers.append(vectorizer)
    print(vectorizer)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=3,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['ad', 'al', 'allo', 'ai', 'agli', 'all', 'agl',
                            'alla', 'alle', 'con', 'col', 'coi', 'da', 'dal',
                            'dallo', 'dai', 'dagli', 'dall', 'dagl', 'dalla',
                            'dalle', 'di', 'del', 'dello', 'dei', 'degli',
                            'dell', 'degl', 'della', 'delle', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<__main__.LemmaTokenizer object at 0x000002CE368BC148>,
                vocabulary=None)
CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
             

In [9]:
for conf, vec in zip(configs, vectorizers):
    print(f"{conf['ngram_range']}: {len(vec.vocabulary_)}")
    

(1, 1): 18017
(1, 2): 80855
(2, 2): 62838
(1, 1): 13019
(1, 2): 42972
(2, 2): 29953


In [14]:
for vec, train, test in zip(vectorizers,X_train_sets, X_test_sets):
    print("Vocabulary size: {}".format(len(vec.vocabulary_)))
    print("X_train:\n{}".format(repr(train)))
    print("X_test: \n{}".format(repr(test)))
    

Vocabulary size: 18017
X_train:
<21565x18017 sparse matrix of type '<class 'numpy.int64'>'
	with 1110443 stored elements in Compressed Sparse Row format>
X_test: 
<7189x18017 sparse matrix of type '<class 'numpy.int64'>'
	with 373223 stored elements in Compressed Sparse Row format>
Vocabulary size: 80855
X_train:
<21565x80855 sparse matrix of type '<class 'numpy.int64'>'
	with 1700099 stored elements in Compressed Sparse Row format>
X_test: 
<7189x80855 sparse matrix of type '<class 'numpy.int64'>'
	with 548442 stored elements in Compressed Sparse Row format>
Vocabulary size: 62838
X_train:
<21565x62838 sparse matrix of type '<class 'numpy.int64'>'
	with 589656 stored elements in Compressed Sparse Row format>
X_test: 
<7189x62838 sparse matrix of type '<class 'numpy.int64'>'
	with 175219 stored elements in Compressed Sparse Row format>
Vocabulary size: 13019
X_train:
<21565x13019 sparse matrix of type '<class 'numpy.int64'>'
	with 1093535 stored elements in Compressed Sparse Row format

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grids = []
param_grid = {'C': [0.001, 0.01, 0.1, 1], 'max_iter': [1000]}
for train, vec in zip(X_train_sets, vectorizers):
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1')
    grid.fit(train, y_train)
    grids.append(grid)
    

'print("Best cross-validation score: {:.2f}".format(grid.best_score_))\nprint("Best parameters: ", grid.best_params_)\nprint("Best estimator: ", grid.best_estimator_)'

In [20]:
scores = list(map(lambda x: x.best_score_, grids))
print(f"Best f1 score: {np.max(scores)}")
best_params = grids[int(np.argmax(scores))].best_params_
print(f"Best parameters: {best_params}")
best_estimator = grids[int(np.argmax(scores))].best_estimator_
print(f"Best estimator: {best_estimator}")
best_vectorizer_params = vectorizers[int(np.argmax(scores))].get_params()

Best f1 score: 0.9675635362400158
Best parameters: {'C': 1, 'max_iter': 1000}
Best estimator: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


FINAL TRAIN

In [21]:
vectorizer = CountVectorizer(**best_vectorizer_params)
X_count_train = vectorizer.fit_transform(X, y)
X_count_test = vectorizer.transform(X_ev)

clf = LogisticRegression(**best_params)
clf.fit(X_count_train, y)
y_pred = clf.predict(X_count_test)


In [22]:
with open("datasets/sample_submission2.csv", "w", encoding="utf8") as sub:
    sub.write("Id,Predicted\n")
    for i, label in enumerate(ints_to_labels(y_pred)):
        sub.write(f"{i},{label}\n")