Functions to read the development and evaluation sets, and to convert labels

In [1]:
import csv
import numpy as np
def read_devset(path="datasets/development.csv"):
    """Load the labelled development set from a CSV file.

    The first CSV row is treated as a header and skipped. For every following
    non-empty row, column 0 is taken as the text and column 1 as its label.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the list of texts and ``y`` is the
        label column converted by ``labels_to_ints``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty row has fewer than two columns.
    """
    X = []
    y = []
    # newline="" is what the csv module asks for: without it, line breaks
    # embedded in quoted fields are not interpreted correctly.
    with open(path, encoding="utf8", newline="") as dev_set:
        reader = csv.reader(dev_set)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            X.append(row[0])
            y.append(row[1])

    return X, labels_to_ints(y)

def read_evalset(path="datasets/evaluation.csv"):
    """Load the unlabelled evaluation texts from a CSV file.

    The first CSV row is treated as a header and skipped. For every following
    non-empty row, column 0 is taken as the text.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The list of texts, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    X = []
    # newline="" is what the csv module asks for: without it, line breaks
    # embedded in quoted fields are not interpreted correctly.
    with open(path, encoding="utf8", newline="") as eval_set:
        reader = csv.reader(eval_set)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            X.append(row[0])

    return X
def labels_to_ints(labels):
    """Encode string labels as a NumPy array: ``"pos"`` -> 1, anything else -> 0."""
    encoded = [1 if label == "pos" else 0 for label in labels]
    return np.array(encoded)

def ints_to_labels(labels):
    """Decode numeric labels into a list of strings: 1 -> ``"pos"``, anything else -> ``"neg"``."""
    return ["pos" if value == 1 else "neg" for value in labels]



In [2]:
# Load the development texts with their encoded labels, and the evaluation texts.
X, y = read_devset()
X_ev = read_evalset()
# Total number of documents across both sets (shown as the cell output).
len(X) + len(X_ev)

In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

# Patterns are compiled once. The text is lower-cased before matching, so the
# e-mail pattern only needs lower-case character classes.
_URL_RE = re.compile(r"(www\.[^\s]+)|(https?://[^\s]+)")
_EMAIL_RE = re.compile(r"[a-z0-9_.+-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)+")
_MENTION_RE = re.compile(r"@[^\s]+")
_HASHTAG_RE = re.compile(r"#([^\s]+)")


def preprocess_word(text):
    """Normalise a raw text before tokenization.

    Despite its name the function is applied to a whole text (it is what
    ``LemmaTokenizer`` calls on each document), so every pattern is matched
    anywhere in the string.

    Steps, in order:
      1. lower-case the text;
      2. replace URLs (``www.…`` or ``http(s)://…``) with ``url``;
      3. replace e-mail addresses with ``email``;
      4. replace ``@name`` mentions with ``at_user``;
      5. drop the leading ``#`` of hashtags, keeping the word;
      6. insert a space after every apostrophe so the elided word and the
         following word end up separated by whitespace.

    Args:
        text: The raw text.

    Returns:
        The normalised text.
    """
    text = text.lower()
    text = _URL_RE.sub("url", text)
    # E-mails must be handled before mentions, and the pattern must not be
    # anchored to the whole string: otherwise "name@host.tld" inside a
    # sentence is caught by the mention rule and becomes "nameat_user".
    text = _EMAIL_RE.sub("email", text)
    text = _MENTION_RE.sub("at_user", text)
    text = _HASHTAG_RE.sub(r"\1", text)
    text = text.replace("'", "' ")
    return text

class LemmaTokenizer(object):
    """Callable that turns a raw text into a filtered list of lemmas.

    Instances are meant to be passed as a tokenizer callable: calling one
    with a text runs ``preprocess_word``, ``word_tokenize`` and the WordNet
    lemmatizer, then drops unwanted lemmas.

    NOTE(review): ``WordNetLemmatizer`` is backed by the English WordNet while
    ``tfidf`` in this file uses Italian stop words - confirm this is intended.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, text):
        """Return the lemmas of ``text`` that pass the filter.

        A lemma is kept only when all of the following hold:
          * it is not a substring of ``string.punctuation``;
          * its length is between 4 and 15 characters inclusive;
          * it does not start with an ASCII digit;
          * it contains neither ``@`` nor ``#``.
        """
        # re.match anchors at position 0, so this detects a leading digit run.
        starts_with_digits = re.compile("[0-9]+").match
        lemmatize = self.lemmatizer.lemmatize

        normalised = preprocess_word(text)
        candidates = (lemmatize(token.strip()) for token in word_tokenize(normalised))
        return [
            lemma
            for lemma in candidates
            if lemma not in string.punctuation
            and 3 < len(lemma) < 16
            and not starts_with_digits(lemma)
            and "@" not in lemma
            and "#" not in lemma
        ]
    

In [9]:
from nltk.corpus import stopwords as sw
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vectorizer():
    """Return a new, unfitted ``TfidfVectorizer`` with this file's settings.

    The vectorizer tokenizes with ``LemmaTokenizer`` and ignores the NLTK
    Italian stop words plus a few extra tokens, including the placeholders
    produced by ``preprocess_word`` (``url``, ``at_user``, ``email``).
    """
    stopwords = sw.words('italian') + ['quantum', "url", "at_user", "email"]
    return TfidfVectorizer(input="content", tokenizer=LemmaTokenizer(), stop_words=stopwords)


def tfidf(X, vectorizer=None):
    """Turn raw texts into a TF-IDF document-term matrix.

    Args:
        X: Iterable of raw text documents.
        vectorizer: Optional, already fitted vectorizer. When given, ``X`` is
            only transformed with it, so the result has exactly the columns
            (vocabulary) the vectorizer was fitted on. Use this for any data
            that will be fed to a model trained on another matrix, e.g.::

                vec = build_vectorizer()
                train_matrix = vec.fit_transform(train_texts)
                eval_matrix = tfidf(eval_texts, vectorizer=vec)

            When omitted, a fresh vectorizer is built and fitted on ``X``
            itself (the original behaviour). Matrices produced by two such
            calls have unrelated columns and must not be mixed.

    Returns:
        Whatever the vectorizer's ``fit_transform`` / ``transform`` returns:
        the TF-IDF weighted document-term matrix for ``X``.
    """
    if vectorizer is None:
        # Converts the token lists into numeric TF-IDF weights, collected in
        # a matrix so they can be analysed.
        return build_vectorizer().fit_transform(X)
    return vectorizer.transform(X)

In [10]:
# TF-IDF matrix of the development texts (vectorizer fitted on X inside tfidf()).
tfidf_X = tfidf(X)
# Display the matrix summary as the cell output.
tfidf_X



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# Split the development matrix into a training part and a held-out part.
# No test_size or random_state is passed, so the library defaults apply and
# the split is not pinned to a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(tfidf_X, y)
# Baseline random forest; no random_state is set here either.
clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", criterion="gini")
clf.fit(X_train, y_train)
# F1 score on the held-out part (shown as the cell output).
f1_score(y_test, clf.predict(X_test))

In [6]:
from sklearn.model_selection import KFold, ParameterGrid
# Hyper-parameter grid explored below: 3 x 1 x 1 x 3 = 9 configurations.
params = {
    "n_estimators": [10, 50, 100],
    "max_features": ["sqrt"],
    "criterion": ["gini"],
    "min_impurity_decrease": [0, .01, .05]
}

# 3-fold cross-validation over the rows of X_train.
kf = KFold(3)

# f1s[i] is the aggregated F1 of the i-th configuration, in ParameterGrid order.
f1s = []
for config in ParameterGrid(params):
    clf_f1s = []
    counts = []
    for train_indices, valid_indices in kf.split(X_train):
        X_t = X_train[train_indices]
        y_t = y_train[train_indices]
        X_v = X_train[valid_indices]
        y_v = y_train[valid_indices]
        # Weight of this fold = size of its TRAINING part.
        # NOTE(review): weighting by len(valid_indices) may have been intended - confirm.
        counts.append(len(train_indices))
        clf = RandomForestClassifier(**config)
        clf.fit(X_t, y_t)
        # NOTE(review): despite the name, `acc` holds an F1 score, not an accuracy.
        acc = f1_score(y_v, clf.predict(X_v))
        clf_f1s.append(acc)
    # Average of the per-fold F1 scores, weighted by `counts`.
    f1s.append(np.average(clf_f1s, weights=counts))

In [12]:
# The evaluation texts must be projected into the SAME feature space the
# classifier is trained on. Calling tfidf(X_ev) would fit a second vectorizer
# on the evaluation texts, giving a different vocabulary (different columns,
# usually a different column count), so the predictions would be meaningless
# or predict() would fail. Instead, fit one vectorizer on the development
# texts and only transform the evaluation texts with it.
# The settings mirror the ones used inside tfidf().
vectorizer = TfidfVectorizer(
    input="content",
    tokenizer=LemmaTokenizer(),
    stop_words=sw.words('italian') + ['quantum', "url", "at_user", "email"],
)
# Same inputs and settings as the earlier tfidf(X) call; refitting here keeps
# a handle on the fitted vectorizer.
tfidf_X = vectorizer.fit_transform(X)
tfidf_ev = vectorizer.transform(X_ev)

# Configuration with the highest cross-validated F1 (f1s follows ParameterGrid order).
best_config = list(ParameterGrid(params))[int(np.argmax(f1s))]
clf = RandomForestClassifier(**best_config)
# Retrain on the whole development set, then label the evaluation set.
clf.fit(tfidf_X, y)
y_pred = clf.predict(tfidf_ev)