In [None]:
# Install every third-party package this notebook imports; eli5 is needed by the
# model-explanation cell near the end.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install eli5 gensim matplotlib nltk numpy pandas scikit-learn seaborn spacy stop_words

In [None]:
import sys

import nltk

# Italian spaCy model, run through the kernel's own interpreter so that it lands in
# the same environment. It is only needed if the (currently commented-out)
# lemmatization step inside preprocess() is enabled.
!{sys.executable} -m spacy download it_core_news_sm

# Tokenizer data required by nltk.tokenize.word_tokenize; without it the first call
# raises LookupError on a fresh environment. Recent NLTK releases look for
# 'punkt_tab', older ones for 'punkt', so both are requested.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Data exploration

In [None]:
import pandas as pd

# Load the development set into a DataFrame and display it.
# Alternative, tab-separated input file:
#   hs_dev = pd.read_csv("haspeede2_dev_taskAB.tsv", sep="\t")
# NOTE(review): this file is read with the default comma separator despite its
# .tsv extension -- confirm that matches the file's actual format.
DEV_SET_PATH = "haspeede2_dev_taskAB_anon_revised.tsv"
hs_dev = pd.read_csv(DEV_SET_PATH)
hs_dev

* Get label distributions

In [None]:
import seaborn as sns

# Absolute counts and relative frequencies of the hate-speech (hs) label
hs_raw = hs_dev.hs.value_counts()
hs_norm = hs_dev.hs.value_counts(normalize=True)
print(f'HS CLASS DISTRIBUTION \n(raw) \n{hs_raw} \n(%) \n{hs_norm}')

# Same statistics for the stereotype label
ster_raw = hs_dev.stereotype.value_counts()
ster_norm = hs_dev.stereotype.value_counts(normalize=True)
print(f'\nSTEREOTYPE CLASS DISTRIBUTION \n(raw) \n{ster_raw} \n(%) \n{ster_norm}')

# Stereotype label counts within each hs class
hs_ster = hs_dev.groupby('hs')['stereotype'].value_counts()
print(f'\nCO-OCCURRENCE STATISTICS: \n{hs_ster} \n')

# Bar plot of the relative stereotype frequencies. The Axes object is labeled
# instead of being passed to print(), which would only emit its text repr.
# To plot the hs label instead: sns.barplot(x=hs_norm.index, y=hs_norm)
ax = sns.barplot(x=ster_norm.index, y=ster_norm)
ax.set(title='Stereotype class distribution', xlabel='stereotype', ylabel='proportion');

* Get most frequent terms and n-grams

In [None]:
from nltk import bigrams, trigrams, FreqDist
from stop_words import get_stop_words

# Italian stopwords plus the placeholder tokens used for anonymized URLs/handles.
# Kept as a list because other cells reference STOPWORDS directly.
STOPWORDS = get_stop_words('it') + ["url", "user", "@user"]
# Set copy used only here, for constant-time membership tests.
_STOPWORD_SET = set(STOPWORDS)


def _tokens_without_stopwords(tweets):
    """Return every whitespace-separated token of `tweets` that is not a stopword.

    The stopword test is case-insensitive; tokens keep their original casing.
    `str.split()` is called without an argument so that runs of whitespace do
    not produce empty-string tokens (which would otherwise be counted as terms).

    Args:
        tweets: An iterable of strings.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    return [
        token
        for tweet in tweets
        for token in tweet.split()
        if token.lower() not in _STOPWORD_SET
    ]


text = _tokens_without_stopwords(hs_dev['full_text'])
hs_text = _tokens_without_stopwords(hs_dev[hs_dev.hs == 1]['full_text'])
ster_text = _tokens_without_stopwords(hs_dev[hs_dev.stereotype == 1]['full_text'])

# Alternatives: replace the FreqDist below with one of these to inspect n-grams.
# #find most common bigrams
# bgs = bigrams(ster_text)
# fdist = FreqDist(bgs)
# fdist.most_common(20)
#
# #find most common trigrams
# tgs = trigrams(text)
# fdist = FreqDist(tgs)
# fdist.most_common(20)

#find most common terms
fdist = FreqDist(ster_text)
# Must stay the last expression of the cell so that its value is displayed.
fdist.most_common(20)

# Cleaning and preprocessing

* tokenization
* stopwords+punctuation removal
* lowercasing
* lemmatization
* ... and many other things

In [None]:
import re, spacy, string
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer


# Built once when the cell runs instead of on every preprocess() call.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_ITALIAN_STEMMER = SnowballStemmer('italian')


def preprocess(sentence):
    """Normalize one raw text into a space-joined string of Italian stems.

    Steps, in order: remove digit runs, collapse whitespace, tokenize with
    NLTK's ``word_tokenize``, lowercase, discard stopwords (the module-level
    ``STOPWORDS``) and punctuation-only tokens, then stem every remaining
    token with the Italian Snowball stemmer.

    Args:
        sentence: The raw text (str) to clean.

    Returns:
        str: The surviving stems joined by single spaces (possibly empty).
    """
    #remove digits
    text = _DIGITS_RE.sub('', sentence)
    #remove extra whitespaces
    text = _WHITESPACE_RE.sub(' ', text)
    #tokenization
    # NOTE(review): word_tokenize defaults to language='english'; consider
    # language='italian' for sentence splitting of Italian text.
    tokens = word_tokenize(text)
    #stopwords/punctuation removal + lowercasing
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in STOPWORDS:
            continue
        # A token counts as punctuation when *every* character is punctuation.
        # A plain `token in string.punctuation` substring test would let
        # multi-character tokens such as "..." or "``" through.
        if all(char in _PUNCTUATION_CHARS for char in token):
            continue
        kept.append(lowered)
    #lemmatization
    #sent = " ".join(text)
    #nlp = spacy.load("it_core_news_sm")
    #text = [token.lemma_ for token in nlp(sent)]
    #stemming
    stems = [_ITALIAN_STEMMER.stem(token) for token in kept]
    #other possible operations: handle emojis/emoticons, hashtags, URLs/email addresses, twitter handles
    return " ".join(stems)
  

# Store the cleaned version of every text in a new column, then display the frame.
hs_dev['preprocessed'] = hs_dev['full_text'].apply(preprocess)
hs_dev

# Train-Test/Validation split
(In this case we already have a held-out test set, but you might want to use a validation set as well for hyper-parameter tuning)


In [None]:
from sklearn.model_selection import train_test_split

# Pick ONE training configuration with this flag instead of editing the code:
#   True  -> train on 90% of the data and keep the rest for hyper-parameter tuning
#   False -> train on the whole set (no tuning)
USE_VALIDATION_SPLIT = False

# Unshuffled 90/10 split: with shuffle=False the rows are cut sequentially,
# so the validation part is the tail of the data.
X_train_split, X_val_hs, y_train_split, y_val_hs = train_test_split(
    hs_dev["preprocessed"], hs_dev["hs"], test_size=0.1, shuffle=False
)

if USE_VALIDATION_SPLIT:
    X_train_hs, y_train_hs = X_train_split, y_train_split
else:
    # Whole set used for training. X_val_hs / y_val_hs are then a subset of the
    # training data, so scores computed on them are not a fair estimate.
    X_train_hs = hs_dev["preprocessed"]
    y_train_hs = hs_dev["hs"]

X_train_hs



# Feature extraction and modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Fixed seed so that repeated runs fit identical models
# (LinearSVC uses a random number generator while fitting).
SEED = 42

#create feature vectors (each vocabulary is capped at 5000 entries)
# raw term counts
count = CountVectorizer(analyzer='word', max_features=5000)
count_train = count.fit_transform(X_train_hs)
# TF-IDF weighted unigrams
tfidf = TfidfVectorizer(analyzer='word', max_features=5000)
tfidf_train = tfidf.fit_transform(X_train_hs)
# TF-IDF weighted uni-, bi- and trigrams
trg = TfidfVectorizer(analyzer='word', ngram_range= (1,3), max_features=5000)
trg_train = trg.fit_transform(X_train_hs)

#fit the classifier on the training data
svm = LinearSVC(random_state=SEED) # as classifier, we just use a linear SVM with default parameters
svm_tfidf = LinearSVC(random_state=SEED)
svm_trg = LinearSVC(random_state=SEED)
svm.fit(count_train, y_train_hs)
svm_tfidf.fit(tfidf_train, y_train_hs)
svm_trg.fit(trg_train, y_train_hs);

# Alternative code using make_pipeline (kept as comments so that the cell does
# not echo a string literal as its output):
# count = CountVectorizer(analyzer='word', max_features=5000)
# tfidf = TfidfVectorizer(analyzer='word', max_features=5000)
# tfidf_trg = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=5000)
#
# svm = LinearSVC(random_state=SEED)
# svm_tfidf = LinearSVC(random_state=SEED)
# svm_trg = LinearSVC(random_state=SEED)
#
# pipe_count = make_pipeline(count, svm)
# pipe_tfidf = make_pipeline(tfidf, svm_tfidf)
# pipe_trigrams = make_pipeline(tfidf_trg, svm_trg)
#
# pipe_count.fit(X_train_hs, y_train_hs)
# pipe_tfidf.fit(X_train_hs, y_train_hs)
# pipe_trigrams.fit(X_train_hs, y_train_hs)

# Evaluation and error analysis

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Held-out test data. Other input files that can be swapped in:
#hs_test = pd.read_csv("haspeede2_reference_taskAB-tweets.tsv", sep="\t", names=['id','text','hs','stereotype'])
#hs_test = pd.read_csv("haspeede2_reference_taskAB-news_anon_revised.tsv")
hs_test = pd.read_csv("haspeede2_reference_taskAB-tweets_anon_revised.tsv")

# Clean the test texts exactly like the training texts
X_test_hs = hs_test['full_text'].apply(preprocess)
y_test_hs = hs_test['hs']

# Vectorize with the already-fitted vectorizers (transform only, no refitting)
count_test = count.transform(X_test_hs)
tfidf_test = tfidf.transform(X_test_hs)
trg_test = trg.transform(X_test_hs)

# Predict with the classifier trained on the matching representation
y_pred_count = svm.predict(count_test)
y_pred_tfidf = svm_tfidf.predict(tfidf_test)
y_pred_trg = svm_trg.predict(trg_test)

# With the make_pipeline variant, predict from the preprocessed text instead:
# y_pred_count = pipe_count.predict(X_test_hs)
# y_pred_tfidf = pipe_tfidf.predict(X_test_hs)
# y_pred_trg = pipe_trigrams.predict(X_test_hs)

# One (header, predictions) pair per feature representation
reports = (
    ('COUNT VECTORS: \n\nCONFUSION MATRIX ', y_pred_count),
    ('TF-IDF VECTORS: \n\nCONFUSION MATRIX ', y_pred_tfidf),
    ('TF-IDF VECTORS + TRIGRAMS: \n\nCONFUSION MATRIX ', y_pred_trg),
)
for header, y_pred in reports:
    print(header)
    print(confusion_matrix(y_test_hs, y_pred), '\n')
    print('EVALUATION METRICS \n', classification_report(y_test_hs, y_pred))


Try using visualization libraries to explain the system's predictions, such as [eli5](https://github.com/TeamHG-Memex/eli5) or 
[LIME](https://marcotcr.github.io/lime/). Both packages provide nice tutorials. Here is an example using eli5 (source available [here](https://github.com/TeamHG-Memex/eli5/blob/master/notebooks/Debugging%20scikit-learn%20text%20classification%20pipeline.ipynb)).

In [None]:
import eli5

# Render the feature weights of the SVM trained on TF-IDF features, limited via top=10.
# The fitted vectorizer is passed as `vec`, presumably so that eli5 can label
# features with vocabulary terms -- see the eli5 tutorial linked in the text above.
# Must stay the last expression of the cell so that the result is displayed.
eli5.show_weights(svm_tfidf, vec=tfidf, top=10)

Try using the best-performing model with some brand new data

In [None]:
# Classify a made-up sentence with the TF-IDF model.
# The result gets its own name so that the test-set predictions stored in
# y_pred_tfidf stay available for the error analysis.
sent = [preprocess("...")] #put here some made-up sentence, just to see how the model goes
new = tfidf.transform(sent)
y_pred_new = svm_tfidf.predict(new)
print(sent, y_pred_new)


To perform <b>error analysis</b>:
* select a sample of mislabeled data
* compare results with gold annotation
* get insights on possible causes of misclassification (also defining patterns, if any)