In [None]:
import pandas as pd
import numpy as np
import nltk

In [None]:
# Load the combined dev/eval table and keep only the rows flagged as "dev".
# .copy() makes `dev` an independent frame, so the in-place edits performed in
# the following cells do not operate on a slice of the raw table.
dev = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/dev_and_eval.csv")
dev = dev.loc[dev["set"]=="dev", :].copy()

In [None]:
# (rows, columns) before duplicated ids are removed.
dev.shape

In [None]:
# keep=False discards every row whose "ids" value occurs more than once
# (no representative of a duplicated id is kept).
dev = dev.drop_duplicates(subset="ids", keep=False)

In [None]:
# (rows, columns) after duplicated ids were removed.
dev.shape

In [None]:
# These three columns are not used by any later cell.
dev = dev.drop(columns=["ids", "flag", "set"])

In [None]:
# Quick look at the remaining columns.
dev.head(2)

# Feature engineering

## Orientation
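For each user, count the total number of tweets and the number of positive tweets. These counts are used to compute the _orientation_ feature, where $max_{tweets}$ is the largest number of tweets written by a single user. We define it as:
$$o(user, max_{tweets}) = \frac{n_{user, pos}}{max_{tweets}} - \frac{n_{user, neg}}{max_{tweets}}$$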

In [None]:
from collections import defaultdict

def orientation_data(df):
    """Count, per user, the summed sentiment and the number of tweets in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a "user" column and a "sentiment" column. Sentiment
        values are summed as-is, so the first counter equals the number of
        positive tweets only if positives are encoded as 1 and negatives as 0
        (assumed here -- TODO confirm against the dataset).

    Returns
    -------
    user_sentiment_cnt : collections.defaultdict
        Maps user -> [sum of sentiment, number of tweets].
    max_tweets : int
        Largest number of tweets written by a single user; 0 when ``df`` has
        no rows.
    """
    user_sentiment_cnt = defaultdict(lambda: [0, 0])

    # Walk the two needed columns directly: iterrows() would build a full
    # Series object for every row, which is needlessly slow on large frames.
    for user, sentiment in zip(df["user"], df["sentiment"]):
        counts = user_sentiment_cnt[user]
        counts[0] += sentiment
        counts[1] += 1

    # default=0 keeps an empty frame from raising ValueError inside max().
    max_tweets = max((counts[1] for counts in user_sentiment_cnt.values()), default=0)

    return user_sentiment_cnt, max_tweets

def orientation(data, u, max_tweets):
    """Return the orientation score of user ``u``.

    ``data`` maps a user to ``[positive_count, total_count]``. The score is the
    share of positive tweets minus the share of the remaining tweets, both
    relative to ``max_tweets``. Users missing from ``data`` score 0.
    """
    if u not in data:
        return 0

    counts = data[u]
    positives = counts[0]
    others = counts[1] - counts[0]
    return positives / max_tweets - others / max_tweets

## Features about text

In [None]:
import re
import string
from html import unescape

# All patterns are written as raw strings so that regex escapes such as \w,
# \/ or \! are not parsed as (invalid) Python string escapes. The compiled
# patterns are unchanged.

# "@" / "#" followed by one word character: one hit per mention / hashtag.
tags = re.compile(r"@\w")
hashtags = re.compile(r"#\w")
# The scheme part is optional, so dotted tokens without "http" also match.
urls = re.compile(r"(http|https)?:?\/?\/?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])")
esclamation = re.compile(r"\!")
question = re.compile(r"\?")
# Emoticons such as :) ;) :-D :P (optional apostrophe and nose).
happy = re.compile(r"([\:\;]'?-?[\)DPp])")
# Emoticons such as :( :/ :c and the mirrored D: form.
# NOTE(review): the second alternative also matches the ":/" inside "http://",
# so URLs are counted (and rewritten) as sad emoticons -- confirm this is intended.
sad = re.compile(r"D-?'?\:|[\:\;]['-]?-?[(\\\/cC]")

def text_features(row):
    """Add count features to ``row`` and normalise emoticons in its text.

    ``row`` must expose a "text" entry (it is a DataFrame row when called
    through ``DataFrame.apply(axis=1)``). HTML entities are unescaped first;
    all counts except "#sad_emot" are taken before emoticons are replaced, and
    "#sad_emot" is counted after the happy-emoticon replacement. The same
    ``row`` object is updated and returned.
    """
    text = unescape(row["text"])  # decode HTML entities

    # Character- and token-level counts.
    row["#upper"] = sum(1 for ch in text if ch.isupper())
    row["#words"] = len(text.split(" "))
    row["#punct"] = sum(1 for ch in text if ch in string.punctuation)
    row["#!"] = len(esclamation.findall(text))
    row["#?"] = len(question.findall(text))

    # Twitter-specific entities.
    row["#user_tags"] = len(tags.findall(text))
    row["#hashtags"] = len(hashtags.findall(text))
    row["#urls"] = len(urls.findall(text))

    # subn() replaces the emoticons and reports how many were found in one pass.
    text, happy_hits = happy.subn("_HAPPY_EMOT_", text)
    row["#happy_emot"] = happy_hits
    text, sad_hits = sad.subn("_SAD_EMOT_", text)
    row["#sad_emot"] = sad_hits

    row["text"] = text
    return row

In [None]:
# Compute the text features row by row (axis=1 passes each row to the function).
dev = dev.apply(text_features, axis=1)

In [None]:
# Append the user name and the first space-separated token of "date" to the
# tweet text, so both become terms seen by the text vectorizer.
# Column-wise string concatenation replaces the slower row-by-row apply.
# NOTE: this overwrites "text", so re-running the cell appends the suffix again.
dev["text"] = dev["text"] + " " + dev["user"] + " " + dev["date"].str.split(" ").str[0]

In [None]:
# Persist the engineered frame; the Tuning section reloads it from this file.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column of the CSV.
dev.to_csv("/content/drive/MyDrive/Colab Notebooks/DSL/dev_mod.csv")
dev.head(2)

# Tuning

In [None]:
# dev_mod.csv was written together with its index; read that first column back
# as the index instead of letting it become a stray "Unnamed: 0" data column.
dev = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/dev_mod.csv", index_col=0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state fixes the split.
# X_tr / X_te keep every column of `dev` (including "sentiment", which
# orientation_data reads later); y_tr / y_te hold the labels.
X_tr, X_te, y_tr, y_te = train_test_split(dev, dev["sentiment"], test_size=0.25, random_state=20)

## Tf-idf

In [None]:
# Stop words removed by text_prep before POS tagging and lemmatization.
# Negations such as "no", "not" and "nor" are not part of this list, so they
# stay in the text.
custom_sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
    'he', 'she', "she's", 'it', "it's", 'they', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
    'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'all', 'any', 'both', 'each', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just', 'now', 'd', 'll', 'm', 'o',
    're','ve','y','ma']

In [None]:
from nltk.tokenize import TweetTokenizer

# Shared tokenizer: used inside text_prep and passed as the vectorizer's "tokenizer".
tk = TweetTokenizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, VERB, NOUN and ADV
    respectively; any other tag yields None.
    """
    prefix_to_constant = (("J", "ADJ"), ("V", "VERB"), ("N", "NOUN"), ("R", "ADV"))
    for prefix, constant in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up lazily: wordnet is only touched for a recognised tag.
            return getattr(nltk.corpus.wordnet, constant)
    return None
    
def text_prep(t):
    """Normalise a tweet into a space-joined string of lemmas.

    Steps: lowercase, tokenize with ``tk``, drop tokens of 20 or more
    characters and tokens listed in ``custom_sw``, POS-tag the remainder, then
    lemmatize each token (with its WordNet POS when ``get_wordnet_pos``
    provides one, otherwise with the lemmatizer's default).
    """
    tokens = [
        tok
        for tok in tk.tokenize(t.lower())
        if len(tok) < 20 and tok not in custom_sw
    ]
    tagged = nltk.pos_tag(tokens)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for word, treebank_tag in tagged:
        wn_pos = get_wordnet_pos(treebank_tag)
        if wn_pos is None:
            lemmas.append(lemmatizer.lemmatize(word))
        else:
            lemmas.append(lemmatizer.lemmatize(word, pos=wn_pos))

    return " ".join(lemmas)

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Vectorizer settings to compare. Only "binary" and "use_idf" actually vary;
# the other entries are fixed single-value lists.
# Note: per the scikit-learn docs a callable "preprocessor" overrides the
# built-in lowercase step; text_prep lowercases the text itself.
params = {
    "binary": [True, False],
    "use_idf": [True, False],
    "ngram_range": [(1,2)],
    "lowercase": [True],
    "preprocessor": [text_prep],
    "tokenizer": [tk.tokenize],
    "max_features": [5000]
}

for param in ParameterGrid(params):
    # Text -> tf-idf matrix, fitted on the training split only.
    tfidf = TfidfVectorizer(**param)
    X = tfidf.fit_transform(X_tr["text"])
    x = tfidf.transform(X_te["text"])

    # Reduce the sparse matrix to 500 dense components.
    tsvd = TruncatedSVD(n_components=500, random_state=20)
    X = tsvd.fit_transform(X)
    x = tsvd.transform(x)

    # n_jobs=-1 (all cores) matches the other forests in this notebook.
    rfc = RandomForestClassifier(random_state=20, n_jobs=-1)
    rfc.fit(X, y_tr)
    # Each setting prints two scores; the label says which model produced it.
    print("rfc", param, f1_score(y_te, rfc.predict(x)))

    # The linear SVC is trained on standardized components.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    x = scaler.transform(x)

    lsvc = LinearSVC(random_state=20)
    lsvc.fit(X, y_tr)
    print("lsvc", param, f1_score(y_te, lsvc.predict(x)))

    del rfc, lsvc, scaler
    
# tfidf_rfc {'binary': True, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2), 'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}
# tfidf_lsvc {'binary': False, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2), 'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}

## Random Forest

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-user sentiment counts are taken from the training split only.
# NOTE(review): every training tweet's own label is part of its user's counts,
# so "orientation" on X_tr is derived from the target itself -- confirm this
# leakage is acceptable before trusting scores that use the feature.
data, maxtw = orientation_data(X_tr)

# assign() builds new frames instead of writing a column into the objects
# returned by train_test_split, which can raise SettingWithCopyWarning on some
# pandas versions.
X_tr = X_tr.assign(orientation=X_tr["user"].apply(lambda u: orientation(data, u, maxtw)))
X_te = X_te.assign(orientation=X_te["user"].apply(lambda u: orientation(data, u, maxtw)))

# One definition of the numeric feature set, shared by train and test.
# NOTE(review): "weekday", "hour" and "timestamp" are not created in the
# visible cells -- presumably they already exist in the input CSV; verify.
num_cols = ["orientation", "weekday", "hour", "timestamp", "#upper", "#words", "#punct", "#!", "#?", "#user_tags",
            "#hashtags", "#urls", "#happy_emot", "#sad_emot"]
Xnum = X_tr[num_cols]
xnum = X_te[num_cols]

# with_mean=False: scale each column to unit variance without centering it.
scaler = StandardScaler(with_mean=False)
Xnum = scaler.fit_transform(Xnum)
xnum = scaler.transform(xnum)

In [None]:
from matplotlib import pyplot as plt

# Fit a forest on the numeric features only, to inspect how much each one matters.
rfc = RandomForestClassifier(random_state=20, n_jobs=-1)

rfc.fit(Xnum, y_tr)

# Forest-level importances, plus the spread of the per-tree importances
# (drawn as error bars below).
importances = rfc.feature_importances_
stds = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)

# The labels must be listed in the same order as the columns used to build Xnum.
forest_importances = pd.Series(importances, index=["orientation", "weekday", "hour", "timestamp", "#upper", "#words", "#punct", "#!", "#?", "#user_tags",
               "#hashtags", "#urls", "#happy_emot", "#sad_emot"])

# Bar chart with error bars, saved as a PDF in the working directory.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=stds, ax=ax)
ax.set_title("Feature importances in RFC")
ax.set_ylabel("Importance")
plt.savefig("importance.pdf", bbox_inches="tight")

In [None]:
# Reduced numeric feature set for the Random Forest: orientation and the
# "!", "?", tag, hashtag, url and emoticon counts are left out.
Xnum = X_tr[["weekday", "hour", "timestamp", "#upper", "#words", "#punct"]]
xnum = X_te[["weekday", "hour", "timestamp", "#upper", "#words", "#punct"]]

# Unit-variance scaling without centering, fitted on the training split only.
scaler = StandardScaler(with_mean=False)
Xnum = scaler.fit_transform(Xnum)
xnum = scaler.transform(xnum)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Vectorizer configuration used for the Random Forest (binary term presence, no idf).
best_rfc = {'binary': True, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2),
            'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}

# Fit the vectorizer on the training text only, then apply it to the test text.
tfidf = TfidfVectorizer(**best_rfc)
X = tfidf.fit_transform(X_tr["text"])
x = tfidf.transform(X_te["text"])

# Project the sparse term matrix onto 500 dense components.
tsvd = TruncatedSVD(n_components=500, random_state=20)
X = tsvd.fit_transform(X)
x = tsvd.transform(x)

In [None]:
# Final design matrices: SVD text components followed by the scaled numeric columns.
# NOTE: X and x are overwritten, so re-running this cell appends the numeric columns again.
X = np.hstack([X, Xnum])
x = np.hstack([x, xnum])

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Forest hyper-parameter grid: 3 x 2 x 2 = 12 combinations; the seed and the
# parallelism setting are fixed.
params = {
    "n_estimators": [200, 300, 400],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2"],
    "random_state": [20],
    "n_jobs": [-1]
}

# Fit each combination on the training matrix and print its F1 score on the
# held-out split; the fitted model is deleted after every round.
for param in ParameterGrid(params):
    rfc = RandomForestClassifier(**param)
    rfc.fit(X, y_tr)
    print(param, f1_score(y_te, rfc.predict(x)))
    del rfc
    
# {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 400, 'n_jobs': -1, 'random_state': 20}

## Linear SVC

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-user sentiment counts are taken from the training split only.
# NOTE(review): every training tweet's own label is part of its user's counts,
# so "orientation" on X_tr is derived from the target itself -- confirm this
# leakage is acceptable before trusting scores that use the feature.
data, maxtw = orientation_data(X_tr)

# assign() builds new frames instead of writing a column into the objects
# returned by train_test_split, which can raise SettingWithCopyWarning on some
# pandas versions.
X_tr = X_tr.assign(orientation=X_tr["user"].apply(lambda u: orientation(data, u, maxtw)))
X_te = X_te.assign(orientation=X_te["user"].apply(lambda u: orientation(data, u, maxtw)))

# Numeric columns that are standardized (zero mean, unit variance).
svc_num_cols = ["weekday", "hour", "timestamp", "#upper", "#words", "#punct"]
Xnum = X_tr[svc_num_cols]
xnum = X_te[svc_num_cols]

scaler = StandardScaler()
Xnum = scaler.fit_transform(Xnum)
xnum = scaler.transform(xnum)

# Put orientation in front as the first column, then rescale the whole block
# without centering. The six columns above were already standardized, so this
# second pass mainly brings orientation to unit variance.
Xnum = np.hstack([X_tr["orientation"].values.reshape(-1, 1), Xnum])
xnum = np.hstack([X_te["orientation"].values.reshape(-1, 1), xnum])
scaler = StandardScaler(with_mean=False)
Xnum = scaler.fit_transform(Xnum)
xnum = scaler.transform(xnum)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Vectorizer configuration used for the linear SVC (term counts, no idf).
best_lsvc = {'binary': False, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2),
            'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}

# Fit the vectorizer on the training text only, then apply it to the test text.
tfidf = TfidfVectorizer(**best_lsvc)
X = tfidf.fit_transform(X_tr["text"])
x = tfidf.transform(X_te["text"])

# Project the sparse term matrix onto 500 dense components.
tsvd = TruncatedSVD(n_components=500, random_state=20)
X = tsvd.fit_transform(X)
x = tsvd.transform(x)

# Standardize the components (fitted on train, applied to test).
scaler = StandardScaler()
X = scaler.fit_transform(X)
x = scaler.transform(x)

# Final design matrices: text components followed by the numeric block.
X = np.hstack([X, Xnum])
x = np.hstack([x, xnum])

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# LinearSVC grid: C from 0.10 to 1.00 in steps of 0.02, with and without
# balanced class weights; the remaining settings are fixed.
params = {
    "C": [i/100 for i in range(10, 101, 2)],
    "class_weight": ["balanced", None],
    "dual": [False],
    "max_iter": [5000],
    "random_state": [20]
}

# Fit each combination on the training matrix and print its F1 score on the
# held-out split; the fitted model is deleted after every round.
for param in ParameterGrid(params):
    lsvc = LinearSVC(**param)
    lsvc.fit(X, y_tr)
    print(param, f1_score(y_te, lsvc.predict(x)))
    del lsvc
    
# {"C": 0.32, "class_weight": None, "dual": False, "max_iter": 5000, "random_state": 20}