In [None]:
import pandas as pd
import numpy as np
import nltk

In [None]:
# Load the combined development + evaluation CSV and keep only the rows whose
# "set" column equals "test".
eva = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/dev_and_eval.csv")
eva = eva.loc[eva["set"]=="test"]

In [None]:
# Remove the columns that are not carried into the evaluation frame, then
# preview the first two remaining rows.
eva = eva.drop(columns=["ids", "flag", "set", "sentiment", "sentiment_lbl"])
eva.head(2)

In [None]:
# Load the pre-processed development and evaluation frames.
# Note: this rebinds `eva`, replacing the frame built from dev_and_eval.csv in
# the cells above. eval_mod.csv is the same path that a later cell writes with
# `eva.to_csv(...)`, so it must already exist when this cell runs.
dev = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/dev_mod.csv")
eva = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/eval_mod.csv")

In [None]:
# Reference labels; passed as y_true to f1_score against the predictions made
# on `eva` further down.
true_labels = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DSL/true_labels.csv")

# Data Preparation

In [None]:
from collections import defaultdict

def orientation_data(df):
    """Aggregate per-user sentiment totals and tweet counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a "user" column and a numeric "sentiment" column
        (presumably 0/1 labels -- confirm against the dataset).

    Returns
    -------
    user_sentiment_cnt : collections.defaultdict
        Maps each user to ``[sum of sentiment, number of rows]``. Item access
        for an unseen user yields ``[0, 0]``.
    max_tweets : int
        Largest row count of any single user, or 0 when ``df`` has no rows.
    """
    user_sentiment_cnt = defaultdict(lambda: [0, 0])

    # Walk only the two needed columns; ``iterrows`` would build a full Series
    # for every row, which is far slower on large frames.
    for user, sentiment in zip(df["user"], df["sentiment"]):
        entry = user_sentiment_cnt[user]
        entry[0] += sentiment
        entry[1] += 1

    # ``default=0`` keeps an empty frame from raising ValueError inside max().
    max_tweets = max((cnt for _, cnt in user_sentiment_cnt.values()), default=0)

    return user_sentiment_cnt, max_tweets

def orientation(data, u, max_tweets):
    """Return a signed sentiment score for user ``u``.

    ``data[u]`` is expected to hold ``[sentiment_sum, n_rows]``. The score is
    ``sentiment_sum / max_tweets`` minus ``(n_rows - sentiment_sum) / max_tweets``.
    Users that are not present in ``data`` get ``0``.
    """
    if u not in data:
        return 0

    stats = data[u]
    sentiment_sum = stats[0]
    remainder = stats[1] - stats[0]  # rows not accounted for by the sentiment sum
    return sentiment_sum / max_tweets - remainder / max_tweets

In [None]:
import re
import string
from html import unescape

# Pre-compiled patterns used by text_features. Every literal is a raw string so
# regex escapes such as \w or \/ are not parsed as (invalid) Python string
# escapes, which emit a SyntaxWarning on recent Python versions. The compiled
# patterns are unchanged.
tags = re.compile(r"@\w")  # "@" followed by a word character
hashtags = re.compile(r"#\w")  # "#" followed by a word character
urls = re.compile(r"(http|https)?:?\/?\/?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])")
esclamation = re.compile(r"\!")
question = re.compile(r"\?")
happy = re.compile(r"([\:\;]'?-?[\)DPp])")  # e.g. :) ;-) :D :P
sad = re.compile(r"D-?'?\:|[\:\;]['-]?-?[(\\\/cC]")  # e.g. :( :'( :/ D:

def text_features(row):
    """Add count-based text features to one record and normalise emoticons.

    ``row`` is modified in place and also returned. Its "text" entry is first
    HTML-unescaped; all counts except "#sad_emot" are taken on that unescaped
    text. Happy emoticons are then replaced by ``_HAPPY_EMOT_``, and sad
    emoticons are counted and replaced by ``_SAD_EMOT_`` on the result.

    Keys written: "#upper", "#words", "#punct", "#!", "#?", "#user_tags",
    "#hashtags", "#urls", "#happy_emot", "#sad_emot" (plus the updated "text").

    NOTE(review): the ``sad`` pattern accepts ":" followed by "/", so it looks
    like the "://" of a URL is counted and rewritten as a sad emoticon --
    confirm whether that is intended.
    """
    text = unescape(row["text"])  # decode HTML entities
    row["text"] = text

    # Character- and word-level counts.
    row["#upper"] = sum(1 for ch in text if ch.isupper())
    row["#words"] = len(text.split(" "))
    row["#punct"] = sum(1 for ch in text if ch in string.punctuation)
    row["#!"] = len(esclamation.findall(text))
    row["#?"] = len(question.findall(text))

    # Twitter-specific markers.
    row["#user_tags"] = len(tags.findall(text))
    row["#hashtags"] = len(hashtags.findall(text))
    row["#urls"] = len(urls.findall(text))

    # subn returns the rewritten text together with the number of replacements,
    # which equals the number of non-overlapping matches.
    text, happy_count = happy.subn("_HAPPY_EMOT_", text)
    row["#happy_emot"] = happy_count
    row["text"] = text

    text, sad_count = sad.subn("_SAD_EMOT_", text)
    row["#sad_emot"] = sad_count
    row["text"] = text

    return row

In [None]:
# Run text_features on every row (axis=1) and keep the enriched frame.
eva = eva.apply(text_features, axis=1)

In [None]:
# Append the user name and the first space-separated token of "date" to each
# text (presumably the weekday abbreviation -- confirm against the date format).
# Every row must hold strings in "text", "user" and "date" for this to work.
eva["text"] = eva.apply(lambda r: r["text"] + " " + r["user"] + " " + r["date"].split(" ")[0], axis=1)

# Persist the enriched frame. index=False is not passed, so the index is
# written as an extra leading column.
eva.to_csv("/content/drive/MyDrive/Colab Notebooks/DSL/eval_mod.csv")

# Test our models

In [None]:
# Stop words removed from the token stream by text_prep.
# NOTE(review): appears to be a trimmed variant of NLTK's English stop-word
# list (negations are not included here) -- confirm the intent.
custom_sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
    'he', 'she', "she's", 'it', "it's", 'they', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
    'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'all', 'any', 'both', 'each', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just', 'now', 'd', 'll', 'm', 'o',
    're','ve','y','ma']

In [None]:
from nltk.tokenize import TweetTokenizer

tk = TweetTokenizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with "J", "V", "N" or "R" map to ``nltk.corpus.wordnet``'s
    ``ADJ``, ``VERB``, ``NOUN`` and ``ADV`` respectively; any other tag yields
    ``None``.
    """
    prefix_to_constant = (("J", "ADJ"), ("V", "VERB"), ("N", "NOUN"), ("R", "ADV"))
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Resolve the constant only for the matching prefix.
            return getattr(nltk.corpus.wordnet, constant_name)
    return None
    
def text_prep(t):
    """Normalise one text: lowercase, tokenize, filter, lemmatize, re-join.

    Steps: lowercase ``t``; tokenize with the module-level ``tk``; drop tokens
    of 20 or more characters and tokens listed in ``custom_sw``; POS-tag the
    remainder; lemmatize each token with its WordNet POS when
    ``get_wordnet_pos`` provides one (otherwise with the lemmatizer's default).

    Returns the lemmas joined by single spaces.
    """
    tokens = tk.tokenize(t.lower())
    kept = [tok for tok in tokens if len(tok) < 20 and tok not in custom_sw]

    tagged = nltk.pos_tag(kept)
    lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for token, treebank_tag in tagged:
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos is None:
            lemmas.append(lemmatizer.lemmatize(token))
        else:
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))

    return " ".join(lemmas)

## RFC

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-user orientation statistics come from the development frame only and are
# applied to both frames; users absent from dev get 0.
data, maxtw = orientation_data(dev)

dev["orientation"] = dev["user"].apply(lambda u: orientation(data, u, maxtw))
eva["orientation"] = eva["user"].apply(lambda u: orientation(data, u, maxtw))

# Numeric features fed to the random forest, in the order they are stacked.
rfc_numeric_cols = ["orientation", "weekday", "hour", "timestamp", "#upper", "#words", "#punct"]

# Scale without centring; the scaler is fitted on dev and reused for eva.
scaler = StandardScaler(with_mean=False)
dnum = scaler.fit_transform(dev[rfc_numeric_cols])
enum = scaler.transform(eva[rfc_numeric_cols])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Vectorizer settings for the random-forest pipeline (presumably the result of
# an earlier tuning run -- the name suggests so).
# NOTE(review): text_prep already lowercases and tokenizes internally and
# returns a re-joined string, which tk.tokenize then splits again; with a
# custom preprocessor the 'lowercase' flag is likely ignored by scikit-learn --
# confirm against the TfidfVectorizer documentation.
best_rfc = {'binary': True, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2),
            'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}

# Fit the vocabulary on dev only, then apply it to eva.
tfidf = TfidfVectorizer(**best_rfc)
X = tfidf.fit_transform(dev["text"])
x = tfidf.transform(eva["text"])

# Reduce the term matrix to 500 components; fitted on dev, applied to eva.
tsvd = TruncatedSVD(n_components=500, random_state=20)
X = tsvd.fit_transform(X)
x = tsvd.transform(x)

In [None]:
# Append the scaled numeric columns to the right of the SVD components.
X = np.concatenate((X, dnum), axis=1)
x = np.concatenate((x, enum), axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Random forest with fixed hyper-parameters, trained on the development
# features and labels.
rfc = RandomForestClassifier(
    criterion='entropy',
    max_features='sqrt',
    n_estimators=400,
    n_jobs=-1,
    random_state=20,
)
rfc.fit(X, dev["sentiment"])

In [None]:
# F1 score of the random-forest predictions on the evaluation features against
# the reference labels; the trailing comment records previously observed scores.
print(f1_score(true_labels, rfc.predict(x))) # tuning 0.85213, testing 0.85208

In [None]:
from matplotlib import pyplot as plt

# Forest-level importance per feature, plus its spread across individual trees.
importances = rfc.feature_importances_
stds = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)

# Feature names follow the stacking order of the design matrix: SVD components
# first, then the numeric columns. The component count is derived from the
# fitted model instead of being hard-coded, so the labels stay valid if
# n_components changes.
numeric_names = ["orientation", "weekday", "hour", "timestamp", "#upper", "#words", "#punct"]
feature_names = [f"P{i}" for i in range(len(importances) - len(numeric_names))] + numeric_names

# Keep the five most important features.
forest_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False).iloc[:5]
# Select the spreads of those same five features. Slicing the unsorted array
# with [:5] would return the values of the first five components instead.
stds = pd.Series(stds, index=feature_names).loc[forest_importances.index].to_numpy()

# stds is not drawn; pass yerr=stds to plot.bar to add error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances in RFC")
ax.set_ylabel("Importance")
plt.savefig("final_importance.pdf", bbox_inches="tight")

## LSVC

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-user orientation statistics come from the development frame only and are
# applied to both frames; users absent from dev get 0.
data, maxtw = orientation_data(dev)

dev["orientation"] = dev["user"].apply(lambda u: orientation(data, u, maxtw))
eva["orientation"] = eva["user"].apply(lambda u: orientation(data, u, maxtw))

dnum = dev[["weekday", "hour", "timestamp", "#upper", "#words", "#punct"]]
enum = eva[["weekday", "hour", "timestamp", "#upper", "#words", "#punct"]]

# Standardise the generic numeric columns with statistics fitted on dev.
scaler = StandardScaler()
dnum = scaler.fit_transform(dnum)
enum = scaler.transform(enum)

# Orientation is rescaled without centring, so the 0 assigned to unknown users
# stays 0.
scaler = StandardScaler(with_mean=False)
dori = scaler.fit_transform(dev["orientation"].values.reshape(-1, 1))
# transform (not fit_transform): the evaluation column must be scaled with the
# variance learned on dev, otherwise the two splits end up on different scales
# and evaluation statistics leak into preprocessing.
eori = scaler.transform(eva["orientation"].values.reshape(-1, 1))

# Orientation goes last, after the six standardised columns.
dnum = np.hstack([dnum, dori])
enum = np.hstack([enum, eori])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Vectorizer settings for the linear-SVC pipeline (presumably the result of an
# earlier tuning run -- the name suggests so). Unlike the random-forest
# settings, 'binary' is False here.
# NOTE(review): with a custom preprocessor the 'lowercase' flag is likely
# ignored by scikit-learn -- confirm against the TfidfVectorizer documentation.
best_lsvc = {'binary': False, 'lowercase': True, 'max_features': 5000, 'ngram_range': (1, 2),
            'preprocessor': text_prep, 'tokenizer': tk.tokenize, 'use_idf': False}

# Fit the vocabulary on dev only, then apply it to eva.
tfidf = TfidfVectorizer(**best_lsvc)
X2 = tfidf.fit_transform(dev["text"])
x2 = tfidf.transform(eva["text"])

# Reduce the term matrix to 500 components; fitted on dev, applied to eva.
tsvd = TruncatedSVD(n_components=500, random_state=20)
X2 = tsvd.fit_transform(X2)
x2 = tsvd.transform(x2)

In [None]:
# Append the scaled numeric columns to the right of the SVD components.
X2 = np.concatenate((X2, dnum), axis=1)
x2 = np.concatenate((x2, enum), axis=1)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Linear SVC with fixed hyper-parameters, trained on the development features
# and labels.
lsvc = LinearSVC(
    C=0.32,
    class_weight=None,
    dual=False,
    max_iter=5000,
    random_state=20,
)
lsvc.fit(X2, dev["sentiment"])

In [None]:
# F1 score of the linear-SVC predictions on the evaluation features against
# the reference labels; the trailing comment records previously observed scores.
print(f1_score(true_labels, lsvc.predict(x2)))  # tuning 0.85218, testing 0.85202