In [None]:
import csv
import json
import matplotlib.pyplot as plt
import numpy as np
import nltk
# nltk.download("stopwords")
import pickle
import sklearn
import string
import wordcloud
import seaborn as sns
import operator

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
# File locations used by the notebook: the labelled tweets and the unlabelled
# tweets are read by load_tweets() (one JSON object per line), and the
# predictions for the unlabelled tweets are written to SUBMISSION.
TRAIN_PATH = "development.jsonl"
TEST_PATH = "evaluation.jsonl"
SUBMISSION = "submission.csv"

class Tweet:
    """Plain record describing one tweet.

    Attributes:
        uid: tweet identifier, stored as a string.
        text: tweet text, stored as a string.
        retweet_count: number of retweets, stored as an int.
        favorite_count: number of favourites, stored as an int.
        is_positive: boolean label, or None when no label was supplied.
    """

    def __init__(self, uid, text, retweet_count, favorite_count, is_positive=None):
        # Every field is coerced so that instances are uniform whatever the
        # types of the incoming values were.
        self.uid, self.text = str(uid), str(text)
        self.retweet_count, self.favorite_count = int(retweet_count), int(favorite_count)
        # A missing label is kept as None instead of being coerced to False.
        self.is_positive = None if is_positive is None else bool(is_positive)

    def __str__(self):
        """Return the tweet text."""
        return f"{self.text}"
    
# Alternative version that also takes into account, for example, the "is_quoted" attribute
#class Tweet:
#    def __init__(self, uid, text, retweet_count, favorite_count, is_quoted, is_positive=None):
#        self.uid = str(uid)
#        self.text = str(text)
#        self.retweet_count = int(retweet_count)
#        self.favorite_count = int(favorite_count)
#        self.is_quoted = str(is_quoted)
#        if is_positive is not None:
#            self.is_positive = bool(is_positive)
#        else:
#            self.is_positive = None
#    def __str__(self):
#        return "{}".format(self.text)

In [None]:
# Names of days, months and durations that clean_tweet() strips from the text.
time_measures = [
    # days of the week
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # durations
    "second", "seconds", "minute", "minutes", "hour", "hours",
    "day", "days", "week", "weeks", "month", "months",
    "year", "years", "decade", "decades", "century", "centuries",
]

# Contracted verb forms removed as whole tokens by clean_tweet().
verbs = [
    # written with the typographic apostrophe
    "i’ve", "i’m", "you’ve", "you’re", "he’s", "she’s", "it’s",
    "we’ve", "we’re", "they’re", "they’ve",
    "i’d", "you’d", "he’d", "she’d", "it’d", "we’d", "they’d",
    "i’ll", "you’ll", "he’ll", "she’ll", "it’ll", "we’ll", "they’ll",
    "don’t", "doesn’t", "didn’t",
    # written with the plain ASCII apostrophe
    "i've", "i'm", "you've", "you're", "he's", "she's", "it's",
    "we've", "we're", "they're", "they've",
    "i'd", "you'd", "he'd", "she'd", "it'd", "we'd", "they'd",
    "i'll", "you'll", "he'll", "she'll", "it'll", "we'll", "they'll",
    "don't", "doesn't", "didn't",
    # capitalised variants (and a repeated "don’t")
    "I'm", "I've", "I'd", "I’m", "I’ve", "I’d", "don’t",
]

# Characters (and the "..." sequence) that clean_tweet() replaces with a space.
punctuations = [
    "!", "%", "&", '"', "(", ")", "*", "+", ",", ".", "...", "/", ":",
    ";", "<", "=", "?", "[", "]", "^", "_", "`", "{", "|", "}", "~",
]

# Stop words removed as whole tokens by clean_tweet().
stopwords = [
    "ourselves", "hers", "between", "with", "again", "yourself", "but", "there", "about", "once",
    "during", "out", "very", "having", "they", "own", "an", "be", "some", "for",
    "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off",
    "is", "s", "am", "or", "who", "as", "from", "him", "each", "the",
    "themselves", "until", "below", "are", "we", "this", "these", "your", "his", "through",
    "don", "nor", "me", "were", "her", "more", "himself", "down", "should", "our",
    "their", "while", "above", "both", "up", "to", "ours", "had", "she", "when",
    "at", "any", "before", "them", "same", "and", "been", "have", "in", "will",
    "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so",
    "can", "did", "now", "under", "he", "you", "herself", "has", "just", "where",
    "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t",
    "being", "if", "theirs", "my", "a", "by", "doing", "it", "how", "further",
    "was", "here", "than",
]

stemmer = nltk.stem.porter.PorterStemmer()

# Every token that clean_tweet() discards as a whole word: time measures,
# contracted verbs, stop words and the bare "am"/"pm" markers. Built once so
# that each token costs a single set lookup instead of one pass per word list entry.
_DROPPED_TOKENS = frozenset(
    [word.lower() for word in time_measures]
    + [word.lower() for word in verbs]
    + [word.lower() for word in stopwords]
    + ["am", "pm"]
)


def _is_clock_time(token):
    """Return True for clock-time tokens such as "10am" or "5pm"."""
    return token.endswith(("am", "pm")) and token[:-2].isdigit()


def clean_tweet(tweet):
    """Normalise the raw text of a tweet into a string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop whitespace-delimited tokens starting with "http" (links);
      3. replace every entry of ``punctuations`` and the ellipsis character
         with a space, and delete hyphens;
      4. drop tokens that are time measures, contracted verbs or stop words,
         tokens containing "$", purely numeric tokens and clock times
         ("10am", "5pm", "am", "pm");
      5. stem the remaining tokens with the Porter stemmer.

    Args:
        tweet: the raw tweet text (str).

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    tweet = tweet.lower()

    # Links are removed while they are still intact: once ":", "/" and "."
    # have been blanked a URL falls apart into fragments that would otherwise
    # survive as tokens. Splitting on any whitespace also makes newlines and
    # tabs act as word separators instead of gluing neighbouring words together.
    tweet = " ".join(token for token in tweet.split() if not token.startswith("http"))

    # punctuation
    for pun in punctuations:
        tweet = tweet.replace(pun, " ")
    tweet = tweet.replace("-", "")  # hyphenated words are merged into one token
    tweet = tweet.replace("\u2026", " ")

    cleaned = []
    for token in tweet.split():
        # Whole-token comparison only, so "today" or "america" are not hit by
        # "day" or "am".
        if token in _DROPPED_TOKENS:
            continue
        if token.startswith("http") or "$" in token or token.isdigit() or _is_clock_time(token):
            continue
        cleaned.append(stemmer.stem(token))

    return " ".join(cleaned)

# not token.startswith("@") and
#and not token.startswith("#")
#and not token == "rt"

def load_tweets(filename, train):
    """Load and clean the tweets stored in a JSON-lines file.

    Each non-blank line must be a JSON object with the fields "id_str",
    "full_text", "retweet_count" and "favorite_count"; when ``train`` is true
    the field "class" is read as well and used as the label.

    Args:
        filename: path of the JSON-lines file.
        train: True for labelled data, False for unlabelled data.

    Returns:
        A dict of Tweet objects whose text has been passed through
        clean_tweet(). Labelled tweets are keyed by their uid, unlabelled
        tweets by their 0-based position in the file.
    """
    result = {}
    # Explicit UTF-8 so that reading does not depend on the platform default.
    with open(filename, "r", encoding="utf-8") as fp:
        counter = 0
        for row in fp:
            # Tolerate blank lines (for instance a trailing newline).
            if not row.strip():
                continue
            json_tweet = json.loads(row)
            tweet = Tweet(
                uid=json_tweet["id_str"],
                text=clean_tweet(json_tweet["full_text"]),
                retweet_count=json_tweet["retweet_count"],
                favorite_count=json_tweet["favorite_count"],
                is_positive=json_tweet["class"] if train else None,
            )
            result[tweet.uid if train else counter] = tweet
            counter += 1
    return result

#if train:
#    tweet = Tweet(uid=json_tweet["id_str"], text=clean_tweet(json_tweet["full_text"]), retweet_count=json_tweet["retweet_count"], favorite_count=json_tweet["favorite_count"], is_quoted=json_tweet["quoted_status_id_str"], is_positive=json_tweet["class"])
#    result[tweet.uid] = tweet
#else:
#    tweet = Tweet(uid=json_tweet["id_str"], text=clean_tweet(json_tweet["full_text"]), retweet_count=json_tweet["retweet_count"], favorite_count=json_tweet["favorite_count"], is_quoted=json_tweet["quoted_status_id_str"])
#    result[counter] = tweet

In [None]:
# Load both data sets; every tweet text is already cleaned by load_tweets().
all_tweets = load_tweets(TRAIN_PATH, True)
test_tweets = load_tweets(TEST_PATH, False)

labels, processed_features = [], []
positive_tweets, negative_tweets = [], []

# One pass over the labelled tweets: 1/0 label, per-class text lists and the
# feature list that is fed to the vectorizer.
# (Disabled experiment: append " quoted" to the text of tweets whose
# is_quoted attribute is not "None".)
for tweet in all_tweets.values():
    is_pos = tweet.is_positive is True
    labels.append(1 if is_pos else 0)
    (positive_tweets if is_pos else negative_tweets).append(tweet.text)
    processed_features.append(tweet.text)

positive = len(positive_tweets)
negative = len(negative_tweets)

# Texts of the unlabelled tweets, in file order.
processed_test_features = [tweet.text for tweet in test_tweets.values()]

print(len(all_tweets))
print(len(test_tweets))
print(positive)
print(negative)

In [None]:
#percentuali = [pos_percentuale, neg_percentuale]
# Pie chart of the share of positive and negative tweets in the training set.
valori = [positive, negative]
lab = "Positive class", "Negative class"
fig, ax = plt.subplots()
ax.set_title("Label distribution")
ax.pie(valori, labels=lab, autopct='%1.2f%%')
plt.show()

In [None]:
#processed_features

In [None]:
# word -> WordFreq; filled by the counting loop that follows.
word_occurrencies = {}

class WordFreq:
    """Occurrence counters of one word: overall, in positive and in negative tweets."""

    def __init__(self, word, number_occ, positive, negative):
        self.word, self.number_occ = word, number_occ
        self.positive, self.negative = positive, negative

# Tally, for every word of the cleaned training tweets, how often it occurs
# overall and inside each class.
for key in all_tweets:
    tweet = all_tweets[key]
    for word in tweet.text.split():
        if word not in word_occurrencies:
            word_occurrencies[word] = WordFreq(word, 0, 0, 0)
        entry = word_occurrencies[word]
        entry.number_occ += 1
        if tweet.is_positive:
            entry.positive += 1
        else:
            entry.negative += 1

# How many of the most frequent words are printed and plotted (20 or 50 were tried).
number_occ = 20

# From here on word_occurrencies is a list sorted by decreasing frequency.
word_occurrencies = sorted(word_occurrencies.values(), key=operator.attrgetter("number_occ"), reverse=True)

words = [wo.word for wo in word_occurrencies]
top_N_count = [wo.number_occ for wo in word_occurrencies]
top_N_positive = [wo.positive for wo in word_occurrencies]
top_N_negative = [wo.negative for wo in word_occurrencies]

# Print exactly the words shown in the chart below, starting with the most
# frequent one.
for wo in word_occurrencies[:number_occ]:
    print("Word {} Number {} Positive {} Negative {}".format(wo.word, wo.number_occ, wo.positive, wo.negative))

tot_words = words[:number_occ]
top_N_positive = top_N_positive[:number_occ]
top_N_negative = top_N_negative[:number_occ]

# Grouped bar chart: per-class counts of the most frequent words.
plt.figure(figsize=(12, 8))
plt.title('TOP {} WORDS'.format(len(tot_words)), fontsize=18, pad=25.0, alpha=0.85, weight='normal')
# Sized from the data so that the chart still works when fewer than
# number_occ distinct words exist.
x = np.arange(len(tot_words))
width = 0.25
plt.bar(x-(width/2), (top_N_positive), width, linewidth=0.5, label='Positive class')
plt.bar(x+(width/2), (top_N_negative), width, linewidth=0.5, label='Negative class')
plt.legend()
plt.grid(alpha=0.2)
plt.xticks(x, tot_words, rotation=45, fontsize=12)
plt.ylabel('Occurrency', alpha=0.85, weight='normal', fontsize=14, labelpad=18.0)
plt.show()

In [None]:
# word -> WordFreqDiv; filled by the counting loop that follows.
word_occurrencies = {}

class WordFreqDiv:
    """A word together with its number of occurrences."""

    def __init__(self, word, number_occ):
        self.word, self.number_occ = word, number_occ

# Count word occurrences over the positive training tweets only.
for key in all_tweets:
    if all_tweets[key].is_positive:
        for word in all_tweets[key].text.split():
            if word in word_occurrencies:
                word_occurrencies[word].number_occ += 1
            else:
                word_occurrencies[word] = WordFreqDiv(word, 1)

# Positive
number_occ = 20
# From here on word_occurrencies is a list sorted by decreasing frequency.
word_occurrencies = sorted(word_occurrencies.values(), key=operator.attrgetter("number_occ"), reverse=True)
for wo in word_occurrencies[:number_occ]:
    print("Word {} Number {}".format(wo.word, wo.number_occ))
positive_words = [wo.word for wo in word_occurrencies]
top_N_positive_count = [wo.number_occ for wo in word_occurrencies]

# Generate positive wordcloud
serialized_positive_tweets = " ".join(positive_tweets)
# The image gets its own name: assigning it to `wordcloud` would replace the
# imported module and make this cell fail when it is run a second time.
positive_wordcloud = wordcloud.WordCloud(width=1200, height=800, background_color="white").generate(serialized_positive_tweets)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(positive_wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Makes sure `wordcloud` names the module here, even if the name was rebound
# to something else before this cell runs.
import wordcloud

word_occurrencies = {}

# Count word occurrences over the negative training tweets only.
for key in all_tweets:
    if not all_tweets[key].is_positive:
        for word in all_tweets[key].text.split():
            if word in word_occurrencies:
                word_occurrencies[word].number_occ += 1
            else:
                word_occurrencies[word] = WordFreqDiv(word, 1)

# Negative
number_occ = 20
# From here on word_occurrencies is a list sorted by decreasing frequency.
word_occurrencies = sorted(word_occurrencies.values(), key=operator.attrgetter("number_occ"), reverse=True)
for wo in word_occurrencies[:number_occ]:
    print("Word {} Number {}".format(wo.word, wo.number_occ))
negative_words = [wo.word for wo in word_occurrencies]
top_N_negative_count = [wo.number_occ for wo in word_occurrencies]

# Generate negative wordcloud
serialized_negative_tweets = " ".join(negative_tweets)
# The image gets its own name so that the `wordcloud` module stays usable.
negative_wordcloud = wordcloud.WordCloud(width=1200, height=800, background_color="black").generate(serialized_negative_tweets)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(negative_wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Bar chart of the 20 most frequent words of the positive tweets.
positive_words = positive_words[0:20]
top_N_positive_count = top_N_positive_count[0:20]
x = np.arange(20)
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('TOP 20 POSITIVE WORDS', fontsize=18, pad=25.0, alpha=0.85, weight='normal')
ax.grid(alpha=0.2)
ax.bar(x, height=top_N_positive_count, width=0.8, label="Positive class")
ax.legend()
# Tick positions first, then the words as tick labels.
ax.set_xticks(x)
ax.set_xticklabels(positive_words, rotation=45, fontsize=12)
ax.set_ylabel('Occurrency', alpha=0.85, weight='normal', fontsize=14, labelpad=18.0)
plt.show()

In [None]:
# Plot Top 20 negative words
negative_words = negative_words[0:20]
top_N_negative_count = top_N_negative_count[0:20]
x = np.arange(20)
plt.figure(figsize=(12, 8))
plt.title('TOP 20 NEGATIVE WORDS', fontsize=18, pad=25.0, alpha=0.85, weight='normal')
plt.grid(alpha=0.2)
# "C1" is the second colour of the active colour cycle. Asking for it
# explicitly gives the bars that colour without first drawing a throw-away
# set of bars in the first colour underneath.
plt.bar(x, width=0.8, height=top_N_negative_count, color="C1", label="Negative class")
plt.legend()
plt.xticks(x, negative_words, rotation=45, fontsize=12)
plt.ylabel('Occurrency', alpha=0.85, weight='normal', fontsize=14, labelpad=18.0)
plt.show()

In [None]:
# VECTORIZER

# Smallest and largest n-gram size passed to the vectorizer.
LOWER_BOUND, UPPER_BOUND = 1, 3

# Alternative that was tried:
# sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(LOWER_BOUND, UPPER_BOUND), max_features=250000)
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(LOWER_BOUND, UPPER_BOUND),
)
# Kept as the last expression of the cell, so its result is echoed as the cell output.
vectorizer.fit(processed_features)

In [None]:
#len(vectorizer.get_feature_names())

In [None]:
# TRY DIFFERENT CLASSIFIERS
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# nothing in it is executed; the cell only evaluates the string itself.
# The text builds the list `models` (with matching display names in
# `models_name`) holding five candidate classifiers.
"""
ESTIMATORS = 100
KERNEL = "linear"
NEIGHBORS = 5

models = []
models_name = ["RandomForestClassifier", "LogisticRegression", "KNeighborsClassifier", "SVC", "MultinomialNB"]

model = RandomForestClassifier(n_jobs=-1, n_estimators=ESTIMATORS, random_state=0)
models.append(model)
model = LogisticRegression(n_jobs=-1, random_state=0)
models.append(model)
model = KNeighborsClassifier(n_neighbors=NEIGHBORS, n_jobs=-1)
models.append(model)
model = SVC(kernel=KERNEL)
models.append(model)
model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
models.append(model)"""

In [None]:
# TRY DIFFERENT CLASSIFIERS
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# nothing in it is executed. The text cross-validates each entry of `models`
# (10 folds, accuracy) and keeps the one with the highest mean accuracy in `model`.
# NOTE(review): `std` in the text is accuracy_cv.std() ** 2, i.e. the variance,
# although it is printed after "+/-" - fix before re-enabling.
"""
best_accuracy = 0
best_i = -1
for i in range(0, 5):
    print("Training model {}".format(models_name[i]))
    my_pipeline = make_pipeline(vectorizer, models[i])
    accuracy_cv = cross_val_score(my_pipeline, processed_features, labels, cv=10, scoring="accuracy")
    mean = accuracy_cv.mean()
    std = accuracy_cv.std() ** 2
    print(f"Accuracy (statistics): {mean:.6f} (+/- {std:.6f})")
    model_accuracy = mean
    if model_accuracy > best_accuracy:
        best_accuracy = model_accuracy
        print("Best model {}".format(models_name[i]))
        best_i = i
model = models[best_i]"""

In [None]:
# TUNING HYPERPARAMETERS BEST MODEL (LOGISTICREGRESSION)
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# nothing in it is executed. The text loops over C values, the "l1"/"l2"
# penalties and five solvers, skips the "l1" penalty for the "newton-cg",
# "lbfgs" and "sag" solvers, and prints a classification report plus precision
# and accuracy obtained from 10-fold cross-validated predictions.

"""cs = [0.5, 1, 1.5, 2, 2.5]
penalties = ["l1", "l2"]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]

for c in cs:
    for penalty in penalties:
        for solver in solvers:
            if (solver == "newton-cg" or solver == "lbfgs" or solver == "sag") and penalty not in ["l2", "none"]:
                continue
            print("\nC: {} Penality: {} Solver: {}".format(c, penalty, solver))
            model = LogisticRegression(solver=solver, penalty=penalty, C=c, n_jobs=-1, random_state=0)
            my_pipeline = make_pipeline(vectorizer, model)
            predictions = cross_val_predict(my_pipeline, processed_features, labels, cv=10)
            print(sklearn.metrics.classification_report(labels, predictions))
            print("Precision score: {}".format(sklearn.metrics.precision_score(labels, predictions)))
            print("Accuracy score: {}".format(sklearn.metrics.accuracy_score(labels, predictions)))"""

In [None]:
"""cs = [0.5, 1, 1.5, 2, 2.5]
penalties = ["elasticnet", "none"]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]

for c in cs:
    for penalty in penalties:
        for solver in solvers:
            if (solver == "newton-cg" or solver == "lbfgs" or solver == "sag") and penalty not in ["l2", "none"]:
                continue
            if penalty == "elasticnet" and solver != "saga":
                continue
            if penalty == "none" and solver == "liblinear":
                continue
            print("C: {} Penality: {} Solver: {}".format(c, penalty, solver))
            if penalty == "elasticnet":
                # l1_ratio 0.5, so combination of penalty "l1" and "l2"
                # l1_ratio=0 is equivalent to set penalty="l2"
                # l1_ratio=1 is equivalent to set penalty="l1"
                model = LogisticRegression(solver=solver, penalty=penalty, C=c, l1_ratio=0.5, n_jobs=-1, random_state=0)
                my_pipeline = make_pipeline(vectorizer, model)
            else:
                model = LogisticRegression(solver=solver, penalty=penalty, C=c, n_jobs=-1, random_state=0)
                my_pipeline = make_pipeline(vectorizer, model)
            predictions = cross_val_predict(my_pipeline, processed_features, labels, cv=10)
            print(sklearn.metrics.classification_report(labels, predictions))
            print("Precision score: {}".format(sklearn.metrics.precision_score(labels, predictions)))
            print("Accuracy score: {}".format(sklearn.metrics.accuracy_score(labels, predictions)))"""

In [None]:
# BEST HYPERPARAMETERS FOUND
solver = "lbfgs"
penalty = "l2"
c = 1.5

model = LogisticRegression(solver=solver, penalty=penalty, C=c, n_jobs=-1, random_state=0)
#model = LinearSVC(loss="hinge", C=2.0, random_state=0)
my_pipeline = make_pipeline(vectorizer, model)

# 10-fold cross-validated accuracy of the whole pipeline (vectorizer + model).
accuracy_cv = cross_val_score(my_pipeline, processed_features, labels, cv=10, scoring="accuracy")
print("Score accuracy: {}".format(accuracy_cv))
mean = accuracy_cv.mean()
print("Mean accuracy: {}".format(mean))
# Spread of the per-fold accuracies: the standard deviation itself, not its
# square, so that the "+/-" figure is on the same scale as the mean.
std = accuracy_cv.std()
print(f"Accuracy (statistics): {mean:.3f} (+/- {std:.6f})")

# predictions (training): every tweet is predicted by a model fitted on the
# folds that do not contain it
predictions = cross_val_predict(my_pipeline, processed_features, labels, cv=10)

# Confusion matrix (rows: true labels, columns: predicted labels)
cm = sklearn.metrics.confusion_matrix(labels, predictions)
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap="GnBu", fmt='g')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

print(sklearn.metrics.classification_report(labels, predictions))
print("Precision score: {}".format(sklearn.metrics.precision_score(labels, predictions)))
print("Accuracy score: {}".format(sklearn.metrics.accuracy_score(labels, predictions)))
plt.show()

In [None]:
# predictions (test): refit the pipeline on every labelled tweet, then
# classify the unlabelled ones (fit() hands back the fitted pipeline).
predicted_values = my_pipeline.fit(processed_features, labels).predict(processed_test_features)

In [None]:
# Write the predictions as CSV: a header row, then one "<row index>,<label>"
# line per prediction, in prediction order.
header = "Id,Predicted"

with open(SUBMISSION, "w") as file_out:
    # The header is written up front, so the file has it even when there are
    # no predictions at all.
    file_out.write("{}\n".format(header))
    for counter, predicted in enumerate(predicted_values):
        file_out.write("{},{}\n".format(counter, predicted))